From d0b47f85c0c0a430e2b7ed84a8b45de506c93d88 Mon Sep 17 00:00:00 2001 From: "Andrei G." Date: Wed, 6 May 2026 15:35:48 +0200 Subject: [PATCH] release: prepare v0.20.2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Bump workspace version 0.20.1 → 0.20.2 - Update all internal zeph-* workspace dependency versions to 0.20.2 - Update TUI splash snapshot for new version - Update CHANGELOG.md with v0.20.2 release section - Update README.md: Gonka native provider fully implemented, feature highlights - Update crate READMEs: GonkaProvider, SpeculationEngine, MemCoT, TrajectorySentinel, ScopedToolExecutor, ExecutionContext, MCP startup auto-retry - Update book/src/ docs: APEX-MEM, MemCoT, TypedPage compaction, goal lifecycle/TACO, capability governance, MCP retry, ExecutionContext - Update specs/: MemCoT sub-spec, APEX-MEM fixes, orchestrator AdmissionGate, MCP retry contract, CLI purge command, Gonka status implemented, agent-loop goal lifecycle/TACO/ExecutionContext, context TypedPage, tools DynExecutor, hooks tracing --- CHANGELOG.md | 5 +- Cargo.lock | 60 ++--- Cargo.toml | 60 ++--- README.md | 34 +-- book/src/advanced/context.md | 33 +++ book/src/advanced/orchestrator.md | 60 +++++ book/src/advanced/tools.md | 226 ++++++++++++++++++ book/src/concepts/graph-memory.md | 24 ++ book/src/concepts/hooks.md | 34 +++ book/src/concepts/memory.md | 123 ++++++++++ book/src/concepts/providers.md | 17 +- book/src/guides/gonka.md | 93 +++++++ book/src/guides/mcp.md | 65 +++++ book/src/reference/cli.md | 54 +++++ crates/zeph-core/README.md | 27 ++- crates/zeph-llm/README.md | 34 ++- crates/zeph-mcp/README.md | 24 ++ crates/zeph-memory/README.md | 19 ++ crates/zeph-tools/README.md | 19 ++ ...idgets__splash__tests__splash_default.snap | 3 +- specs/002-agent-loop/spec.md | 173 ++++++++++++++ specs/004-memory/004-13-memory-memcot.md | 213 +++++++++++++++++ specs/004-memory/004-7-memory-apex-magma.md | 63 ++++- specs/006-tools/spec.md | 33 +++ 
specs/008-mcp/008-1-lifecycle.md | 69 +++++- specs/009-orchestration/spec.md | 56 ++++- specs/021-zeph-context/spec.md | 64 ++++- specs/028-hooks/spec.md | 40 ++++ specs/047-cli-modes/spec.md | 65 ++++- specs/051-gonka-gateway/spec.md | 2 +- specs/052-gonka-native/spec.md | 10 +- specs/README.md | 9 +- 32 files changed, 1706 insertions(+), 105 deletions(-) create mode 100644 specs/004-memory/004-13-memory-memcot.md diff --git a/CHANGELOG.md b/CHANGELOG.md index f0e7d1784..a9406f2cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] +## [0.20.2] - 2026-05-06 + ### Added - feat(cli): `zeph gonka doctor` diagnostic subcommand — checks vault key resolution, signer @@ -5627,7 +5629,8 @@ let agent = Agent::new(provider, channel, &skills_prompt, executor); - Agent::run() uses tokio::select! to race channel messages against shutdown signal [0.16.0]: https://github.com/bug-ops/zeph/compare/v0.15.3...v0.16.0 -[Unreleased]: https://github.com/bug-ops/zeph/compare/v0.20.1...HEAD +[Unreleased]: https://github.com/bug-ops/zeph/compare/v0.20.2...HEAD +[0.20.2]: https://github.com/bug-ops/zeph/compare/v0.20.1...v0.20.2 [0.20.1]: https://github.com/bug-ops/zeph/compare/v0.20.0...v0.20.1 [0.20.0]: https://github.com/bug-ops/zeph/compare/v0.19.3...v0.20.0 [0.19.3]: https://github.com/bug-ops/zeph/compare/v0.19.2...v0.19.3 diff --git a/Cargo.lock b/Cargo.lock index 9bdf4884f..9f9815f07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10108,7 +10108,7 @@ dependencies = [ [[package]] name = "zeph" -version = "0.20.1" +version = "0.20.2" dependencies = [ "anyhow", "async-trait", @@ -10177,7 +10177,7 @@ dependencies = [ [[package]] name = "zeph-a2a" -version = "0.20.1" +version = "0.20.2" dependencies = [ "axum 0.8.9", "base64 0.22.1", @@ -10207,7 +10207,7 @@ dependencies = [ [[package]] name = "zeph-acp" -version = "0.20.1" +version = "0.20.2" dependencies = [ "agent-client-protocol", 
"agent-client-protocol-tokio", @@ -10249,7 +10249,7 @@ dependencies = [ [[package]] name = "zeph-agent-context" -version = "0.20.1" +version = "0.20.2" dependencies = [ "chrono", "futures", @@ -10271,7 +10271,7 @@ dependencies = [ [[package]] name = "zeph-agent-feedback" -version = "0.20.1" +version = "0.20.2" dependencies = [ "regex", "schemars 1.2.1", @@ -10286,7 +10286,7 @@ dependencies = [ [[package]] name = "zeph-agent-persistence" -version = "0.20.1" +version = "0.20.2" dependencies = [ "serde", "serde_json", @@ -10302,7 +10302,7 @@ dependencies = [ [[package]] name = "zeph-agent-tools" -version = "0.20.1" +version = "0.20.2" dependencies = [ "futures", "serde", @@ -10324,7 +10324,7 @@ dependencies = [ [[package]] name = "zeph-bench" -version = "0.20.1" +version = "0.20.2" dependencies = [ "clap", "schemars 1.2.1", @@ -10345,7 +10345,7 @@ dependencies = [ [[package]] name = "zeph-channels" -version = "0.20.1" +version = "0.20.2" dependencies = [ "axum 0.8.9", "criterion", @@ -10372,7 +10372,7 @@ dependencies = [ [[package]] name = "zeph-commands" -version = "0.20.1" +version = "0.20.2" dependencies = [ "serde", "thiserror 2.0.18", @@ -10382,7 +10382,7 @@ dependencies = [ [[package]] name = "zeph-common" -version = "0.20.1" +version = "0.20.2" dependencies = [ "blake3", "cpu-time", @@ -10408,7 +10408,7 @@ dependencies = [ [[package]] name = "zeph-config" -version = "0.20.1" +version = "0.20.2" dependencies = [ "dirs", "insta", @@ -10427,7 +10427,7 @@ dependencies = [ [[package]] name = "zeph-context" -version = "0.20.1" +version = "0.20.2" dependencies = [ "blake3", "futures", @@ -10448,7 +10448,7 @@ dependencies = [ [[package]] name = "zeph-core" -version = "0.20.1" +version = "0.20.2" dependencies = [ "age", "base64 0.22.1", @@ -10515,7 +10515,7 @@ dependencies = [ [[package]] name = "zeph-db" -version = "0.20.1" +version = "0.20.2" dependencies = [ "regex", "sqlx", @@ -10530,7 +10530,7 @@ dependencies = [ [[package]] name = "zeph-experiments" -version = 
"0.20.1" +version = "0.20.2" dependencies = [ "futures", "ordered-float 5.3.0", @@ -10553,7 +10553,7 @@ dependencies = [ [[package]] name = "zeph-gateway" -version = "0.20.1" +version = "0.20.2" dependencies = [ "axum 0.8.9", "blake3", @@ -10572,7 +10572,7 @@ dependencies = [ [[package]] name = "zeph-index" -version = "0.20.1" +version = "0.20.2" dependencies = [ "futures", "ignore", @@ -10606,7 +10606,7 @@ dependencies = [ [[package]] name = "zeph-llm" -version = "0.20.1" +version = "0.20.2" dependencies = [ "async-stream", "audioadapter-buffers", @@ -10654,7 +10654,7 @@ dependencies = [ [[package]] name = "zeph-mcp" -version = "0.20.1" +version = "0.20.2" dependencies = [ "async-trait", "blake3", @@ -10687,7 +10687,7 @@ dependencies = [ [[package]] name = "zeph-memory" -version = "0.20.1" +version = "0.20.2" dependencies = [ "arc-swap", "blake3", @@ -10726,7 +10726,7 @@ dependencies = [ [[package]] name = "zeph-orchestration" -version = "0.20.1" +version = "0.20.2" dependencies = [ "blake3", "dirs", @@ -10753,7 +10753,7 @@ dependencies = [ [[package]] name = "zeph-plugins" -version = "0.20.1" +version = "0.20.2" dependencies = [ "anyhow", "dirs", @@ -10775,7 +10775,7 @@ dependencies = [ [[package]] name = "zeph-sanitizer" -version = "0.20.1" +version = "0.20.2" dependencies = [ "proptest", "regex", @@ -10795,7 +10795,7 @@ dependencies = [ [[package]] name = "zeph-scheduler" -version = "0.20.1" +version = "0.20.2" dependencies = [ "chrono", "cron", @@ -10815,7 +10815,7 @@ dependencies = [ [[package]] name = "zeph-skills" -version = "0.20.1" +version = "0.20.2" dependencies = [ "anyhow", "blake3", @@ -10849,7 +10849,7 @@ dependencies = [ [[package]] name = "zeph-subagent" -version = "0.20.1" +version = "0.20.2" dependencies = [ "dirs", "indoc", @@ -10876,7 +10876,7 @@ dependencies = [ [[package]] name = "zeph-tools" -version = "0.20.1" +version = "0.20.2" dependencies = [ "arc-swap", "dashmap", @@ -10923,7 +10923,7 @@ dependencies = [ [[package]] name = "zeph-tui" 
-version = "0.20.1" +version = "0.20.2" dependencies = [ "chrono", "crossterm", @@ -10960,7 +10960,7 @@ dependencies = [ [[package]] name = "zeph-vault" -version = "0.20.1" +version = "0.20.2" dependencies = [ "age", "proptest", diff --git a/Cargo.toml b/Cargo.toml index d83c67f35..0cd85ad97 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ resolver = "3" [workspace.package] edition = "2024" rust-version = "1.95" -version = "0.20.1" +version = "0.20.2" authors = ["bug-ops"] license = "MIT" repository = "https://github.com/bug-ops/zeph" @@ -143,35 +143,35 @@ uuid = "1.23.1" walkdir = "2.5" wiremock = "0.6.5" zeroize = { version = "1.8.2", default-features = false } -zeph-a2a = { path = "crates/zeph-a2a", version = "0.20.0" } -zeph-agent-context = { path = "crates/zeph-agent-context", version = "0.20.0" } -zeph-agent-feedback = { path = "crates/zeph-agent-feedback", version = "0.20.0" } -zeph-agent-persistence = { path = "crates/zeph-agent-persistence", version = "0.20.0" } -zeph-agent-tools = { path = "crates/zeph-agent-tools", version = "0.20.0" } -zeph-bench = { path = "crates/zeph-bench", version = "0.20.0" } -zeph-acp = { path = "crates/zeph-acp", version = "0.20.0" } -zeph-db = { path = "crates/zeph-db", default-features = false, version = "0.20.0" } -zeph-channels = { path = "crates/zeph-channels", version = "0.20.0" } -zeph-common = { path = "crates/zeph-common", version = "0.20.0" } -zeph-config = { path = "crates/zeph-config", version = "0.20.0" } -zeph-commands = { path = "crates/zeph-commands", version = "0.20.0" } -zeph-context = { path = "crates/zeph-context", version = "0.20.0" } -zeph-core = { path = "crates/zeph-core", version = "0.20.0" } -zeph-experiments = { path = "crates/zeph-experiments", version = "0.20.0" } -zeph-gateway = { path = "crates/zeph-gateway", version = "0.20.0" } -zeph-index = { path = "crates/zeph-index", version = "0.20.0" } -zeph-llm = { path = "crates/zeph-llm", version = "0.20.0" } -zeph-mcp = { path = 
"crates/zeph-mcp", version = "0.20.0" } -zeph-memory = { path = "crates/zeph-memory", default-features = false, version = "0.20.0" } -zeph-scheduler = { path = "crates/zeph-scheduler", version = "0.20.0" } -zeph-skills = { path = "crates/zeph-skills", version = "0.20.0" } -zeph-tools = { path = "crates/zeph-tools", version = "0.20.0" } -zeph-tui = { path = "crates/zeph-tui", version = "0.20.0" } -zeph-vault = { path = "crates/zeph-vault", version = "0.20.0" } -zeph-orchestration = { path = "crates/zeph-orchestration", version = "0.20.0" } -zeph-plugins = { path = "crates/zeph-plugins", version = "0.20.0" } -zeph-sanitizer = { path = "crates/zeph-sanitizer", version = "0.20.0" } -zeph-subagent = { path = "crates/zeph-subagent", version = "0.20.0" } +zeph-a2a = { path = "crates/zeph-a2a", version = "0.20.2" } +zeph-agent-context = { path = "crates/zeph-agent-context", version = "0.20.2" } +zeph-agent-feedback = { path = "crates/zeph-agent-feedback", version = "0.20.2" } +zeph-agent-persistence = { path = "crates/zeph-agent-persistence", version = "0.20.2" } +zeph-agent-tools = { path = "crates/zeph-agent-tools", version = "0.20.2" } +zeph-bench = { path = "crates/zeph-bench", version = "0.20.2" } +zeph-acp = { path = "crates/zeph-acp", version = "0.20.2" } +zeph-db = { path = "crates/zeph-db", default-features = false, version = "0.20.2" } +zeph-channels = { path = "crates/zeph-channels", version = "0.20.2" } +zeph-common = { path = "crates/zeph-common", version = "0.20.2" } +zeph-config = { path = "crates/zeph-config", version = "0.20.2" } +zeph-commands = { path = "crates/zeph-commands", version = "0.20.2" } +zeph-context = { path = "crates/zeph-context", version = "0.20.2" } +zeph-core = { path = "crates/zeph-core", version = "0.20.2" } +zeph-experiments = { path = "crates/zeph-experiments", version = "0.20.2" } +zeph-gateway = { path = "crates/zeph-gateway", version = "0.20.2" } +zeph-index = { path = "crates/zeph-index", version = "0.20.2" } +zeph-llm = { path = 
"crates/zeph-llm", version = "0.20.2" } +zeph-mcp = { path = "crates/zeph-mcp", version = "0.20.2" } +zeph-memory = { path = "crates/zeph-memory", default-features = false, version = "0.20.2" } +zeph-scheduler = { path = "crates/zeph-scheduler", version = "0.20.2" } +zeph-skills = { path = "crates/zeph-skills", version = "0.20.2" } +zeph-tools = { path = "crates/zeph-tools", version = "0.20.2" } +zeph-tui = { path = "crates/zeph-tui", version = "0.20.2" } +zeph-vault = { path = "crates/zeph-vault", version = "0.20.2" } +zeph-orchestration = { path = "crates/zeph-orchestration", version = "0.20.2" } +zeph-plugins = { path = "crates/zeph-plugins", version = "0.20.2" } +zeph-sanitizer = { path = "crates/zeph-sanitizer", version = "0.20.2" } +zeph-subagent = { path = "crates/zeph-subagent", version = "0.20.2" } [workspace.lints.rust] unsafe_code = "deny" diff --git a/README.md b/README.md index 6bcf68609..3259b11e5 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Unlike single-session assistants, Zeph is designed to remember *why* a decision | An agent that survives long projects | SQLite conversation history, semantic recall, graph memory, session digests, trajectory memory, and goal-aware compaction. | | Lower infrastructure cost | A default SQLite vector backend, local Ollama defaults, feature-gated bundles, and provider routing for simple vs. hard tasks. | | More than keyword memory | Typed graph facts, BFS recall, SYNAPSE spreading activation, MMR reranking, temporal decay, and write-quality gates. | -| Provider freedom | Ollama, Claude, OpenAI, Gemini, Candle, any OpenAI-compatible endpoint, and a Gonka.ai path through GonkaGate. | +| Provider freedom | Ollama, Claude, OpenAI, Gemini, Candle, any OpenAI-compatible endpoint, and Gonka.ai via GonkaGate or the native signed-transport provider. | | Agent-grade safety | Age-encrypted vault secrets, sandboxed tool execution, MCP injection detection, SSRF guards, PII filtering, and exfiltration checks. 
| | Daily operator ergonomics | CLI, TUI dashboard, MCP tools, plugins, skills, sub-agents, ACP for IDEs, A2A, scheduler, and JSON output modes. | @@ -64,12 +64,9 @@ zeph ## Gonka.ai -Zeph is being wired for Gonka.ai in two phases: +Zeph supports Gonka.ai inference in two modes: -- **GonkaGate today:** use the existing OpenAI-compatible provider path and store the `gp-...` key in the age vault as `ZEPH_COMPATIBLE_GONKAGATE_API_KEY`. -- **Native Gonka next:** the `gonka` provider config shape and vault key resolution have landed; the signed native transport is the active follow-up. - -Example GonkaGate provider: +**GonkaGate (OpenAI-compatible gateway):** store the `gp-...` key in the age vault as `ZEPH_COMPATIBLE_GONKAGATE_API_KEY` and use the `compatible` provider type: ```toml [[llm.providers]] @@ -80,14 +77,21 @@ model = "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8" default = true ``` -Zeph resolves compatible provider secrets by name. For `name = "gonkagate"`, store the gateway key under: +**Native Gonka provider (signed transport):** use `type = "gonka"` with node endpoints. The native provider supports chat, streaming, embeddings, and tool calls over a signed request transport. Store the key as `ZEPH_GONKA_API_KEY` in the vault and declare nodes via `--init` wizard or directly in config: -```text -ZEPH_COMPATIBLE_GONKAGATE_API_KEY +```toml +[[llm.providers]] +name = "gonka" +type = "gonka" +model = "qwen3-235b" +default = true + +[[llm.gonka_nodes]] +url = "https://node.example.gonka.ai" +weight = 1 ``` -> [!NOTE] -> Native `type = "gonka"` currently validates configuration but still returns "gonka provider is not yet implemented" at provider construction. Use GonkaGate until the signed node transport lands. +Run `zeph init` and select the GonkaGate option in the wizard to configure either mode interactively. 
## What Makes It Different @@ -197,11 +201,11 @@ cargo build --release --features full | Area | Highlights | |---|---| -| Memory | SQLite/PostgreSQL history, embedded SQLite vectors or Qdrant, graph memory, SYNAPSE, SleepGate, ReasoningBank, document RAG. | -| Context | Goal-aware compaction, typed pages, tool-output archive, session recap, active-goal injection. | +| Memory | SQLite/PostgreSQL history, embedded SQLite vectors or Qdrant, graph memory, SYNAPSE, SleepGate, APEX-MEM write-quality gates, MemCoT Zoom-In/Out recall views, document RAG. | +| Context | Goal-aware compaction, TypedPage assembler pipeline, TACO output compression, tool-output archive, session recap, active-goal injection. | | Skills | `SKILL.md` registry, hot reload, BM25 + embedding matching, trust levels, self-learning skill improvement. | -| Providers | Ollama, Claude, OpenAI, Gemini, OpenAI-compatible APIs, Candle local inference, adaptive routing. | -| Tools | Shell, file, web, MCP, tool quotas, approval gates, audit trail, sandboxing, output compression. | +| Providers | Ollama, Claude, OpenAI, Gemini, OpenAI-compatible APIs, Gonka native inference, Candle local inference, adaptive routing. | +| Tools | Shell, file, web, MCP, tool quotas, approval gates, audit trail, sandboxing, output compression, speculative dispatch, TrajectorySentinel capability governance. | | Interfaces | CLI, TUI, Telegram, Discord, Slack, ACP, A2A, HTTP gateway, scheduler daemon. | | Code intelligence | Tree-sitter indexing, semantic repo map, LSP diagnostics and hover context through MCP. | | Observability | Debug dumps, JSONL mode, Prometheus metrics, OpenTelemetry traces, profiling builds. 
| diff --git a/book/src/advanced/context.md b/book/src/advanced/context.md index abf9fdd89..4d3a45b0e 100644 --- a/book/src/advanced/context.md +++ b/book/src/advanced/context.md @@ -155,6 +155,39 @@ After each tool execution, `maybe_summarize_tool_pair()` checks whether the numb Summarization runs synchronously between tool iterations. If the LLM call fails, the error is logged and the pair is left unsummarized. +## TypedPage and ClawVM Context Compaction + +During context compaction, Zeph produces pages of different types — tool outputs, conversation turns, memory excerpts, system context — each with distinct fidelity requirements. ClawVM (Compact Low-Alignment View Machine) classifies every compacted page into a `PageType` enum and enforces per-type `PageInvariant` traits at compaction boundaries. This ensures that critical information structures are preserved during summarization. + +**Page types and their invariants:** + +| Type | Content | Invariant | +|------|---------|-----------| +| `ToolOutput` | Single tool result (bash output, file read, etc.) | No orphaned ToolUse/ToolResult pairs — tool requests and responses remain linked | +| `ConversationTurn` | User or assistant message | Multipart structure intact — text, tool calls, and reasoning blocks stay together | +| `MemoryExcerpt` | Recalled or injected semantic memory | Citation completeness — references to facts or sources remain valid | +| `SystemContext` | Project context (ZEPH.md) + instructions | No truncation of logical sections — guidelines remain self-contained | + +**How it works:** + +1. **Classification** — as the LLM produces a summary, each output message is tokenized and assigned a `PageType` based on its source +2. **Validation** — before the page enters the SQLite store, `PageInvariant::validate()` is called to check fidelity constraints +3. **Audit logging** — when invariants succeed, an audit record is appended to a bounded async sink, allowing external systems to verify enforcement +4. 
**Graceful degradation** — if validation fails, the page is either rejected (strict mode) or admitted with a warning flag (permissive mode), depending on `memory.compaction.invariant_mode` + +**Configuration:** + +```toml +[memory.compaction] +invariant_mode = "permissive" # "strict" | "permissive" (default: "permissive") +audit_enabled = true # Log invariant checks to SQLite (default: false) +``` + +- `strict` — reject pages that fail invariant checks. Compaction may not produce a summary if too many pages are rejected. Use for safety-critical deployments. +- `permissive` — admit pages with failed invariants but flag them with a warning. Ensures compaction always completes. Use for long sessions where occasional information loss is acceptable. + +When `audit_enabled = true`, each compaction pass writes invariant check results to the `compaction_audit` table, allowing you to detect which page types are degrading. Query this table to identify patterns where critical information is being lost during compaction. + ### Summary Provider Configuration By default, tool-pair summarization uses the primary LLM provider. You can dedicate a faster or cheaper model to this task using either the structured `[llm.summary_provider]` section or the `summary_model` string shorthand. 
diff --git a/book/src/advanced/orchestrator.md b/book/src/advanced/orchestrator.md index deb740ad9..7df8fa14f 100644 --- a/book/src/advanced/orchestrator.md +++ b/book/src/advanced/orchestrator.md @@ -86,6 +86,66 @@ embedding_model = "nomic-embed-text" # dedicated embedding model embed = true ``` +## Orchestration-Tier Provider Routing + +Sub-agent orchestration runs several internal LLM tasks that are distinct from user-facing reasoning: + +- **Scheduling and aggregation** — combining multiple sub-agent outputs into a coherent result +- **Predicate evaluation** — deciding whether a task completed successfully (true/false classifiers) +- **Task verification** — double-checking a result before returning it to the user + +These tasks can often be handled by smaller/faster models without impacting overall quality. The `orchestrator_provider` field routes all three through a single dedicated provider: + +```toml +[[llm.providers]] +name = "fast" +type = "ollama" +model = "qwen3:1.7b" + +[[llm.providers]] +name = "quality" +type = "claude" +model = "claude-sonnet-4-6" +default = true + +[orchestration] +orchestrator_provider = "fast" # Use fast model for scheduling-tier LLM calls +planner_provider = "quality" # Use quality model for planning (stays on quality provider) +``` + +The resolution order is: + +- `LlmAggregator` (output synthesis) → `orchestrator_provider` → primary +- `PlanVerifier` (verification check) → `verify_provider` → `orchestrator_provider` → primary +- `PredicateEvaluator` (predicate logic) → `predicate_provider` → `orchestrator_provider` → primary + +When `planner_provider` is explicitly set, it is NOT overridden by `orchestrator_provider`. Planning is a complex task and always uses the quality provider. + +> [!WARNING] +> Routing `LlmAggregator` through a cheap/fast model may reduce final output quality because aggregation produces user-visible text. Test thoroughly with your workload before relying on this optimization in production. 
+ +## Admission Control and Concurrency Limits + +To prevent provider overcommit when many sub-agents are running, set `max_concurrent` per provider. This limits the number of simultaneous in-flight orchestration calls to that provider: + +```toml +[[llm.providers]] +name = "api" +type = "openai" +model = "gpt-4o" +max_concurrent = 10 # Allow up to 10 concurrent sub-agent API calls + +[[llm.providers]] +name = "local" +type = "ollama" +model = "qwen3:8b" +max_concurrent = 4 # Ollama server has less capacity +``` + +The `AdmissionGate` enforces these limits at spawn time. When a provider reaches its limit, new tasks are deferred with exponential backoff until a previous task completes and frees a permit. + +Currently the concurrency limit is enforced (tasks are delayed), but cost budgets are warn-only: when a task completes with token usage exceeding `[orchestration] default_task_budget_cents`, a warning is logged but the task is not rejected. Hard budget enforcement is deferred pending per-task `CostTracker` scoping. + ## SLM Provider Recommendations Each Zeph subsystem that calls an LLM exposes a `*_provider` config field. Matching the model size to task complexity reduces cost and latency without sacrificing quality. The table below lists the recommended model tier for each subsystem: diff --git a/book/src/advanced/tools.md b/book/src/advanced/tools.md index 25966dffb..59b4e6dc3 100644 --- a/book/src/advanced/tools.md +++ b/book/src/advanced/tools.md @@ -298,6 +298,232 @@ If 3 consecutive tool iterations produce identical output strings, the loop brea At the start of each iteration, the agent estimates total token usage. If usage exceeds 80% of the configured `context_budget_tokens`, the loop stops to avoid exceeding the model's context window. 
+## Per-Turn Execution Context + +Each tool invocation receives an `ExecutionContext` that carries contextual information about the turn in which it is executing: + +```rust +pub struct ExecutionContext { + pub turn_id: String, // UUID of the current agent turn + pub goal_id: Option<String>, // UUID of the active /plan goal (if any) + pub skill_name: Option<String>, // Name of the active skill (if matched) + pub timestamp_ms: u64, // Unix timestamp of turn start +} +``` + +This context is available to tool executors via `ShellExecutor::context()` and can be used for: + +- **Audit and tracing** — correlate tool invocations with the turn that triggered them +- **Goal-aware behavior** — adjust tool output based on the active goal or skill +- **Session reconstruction** — reconstruct the execution sequence from audit logs + +Tool executors can opt in to receiving the context: + +```toml +[tools.shell] +enable_execution_context = true # expose turn_id, goal_id, skill_name to hooks and auditing +``` + +When enabled, the context is propagated to shell command hooks (`hooks.file_changed`, `hooks.cwd_changed`) as environment variables: + +| Variable | Source | +|----------|--------| +| `ZEPH_TURN_ID` | `ExecutionContext::turn_id` | +| `ZEPH_GOAL_ID` | `ExecutionContext::goal_id` (omitted if no active goal) | +| `ZEPH_SKILL_NAME` | `ExecutionContext::skill_name` (omitted if no active skill) | + +## Goal Lifecycle and TACO Output Compression + +When a `/plan` goal is active, tool outputs are subject to automatic compression via TACO (Tool-Aware Context Optimization). TACO uses a goal-aware compression strategy that: + +1. **Preserves goal-relevant outputs** — tool results that directly address the active goal are never compressed +2. **Compresses tangential outputs** — results from exploratory or debugging tools outside the critical path are condensed into 2-3 line summaries +3. 
**Caches outputs** — compressed outputs are memoized so identical tool calls don't re-compress + +**Goal lifecycle:** + +When `/plan "Build a REST API"` is invoked: + +1. A `TaskGraph` is created with UUID and stored in SQLite +2. Each tool invocation in the context of that plan gets `ExecutionContext::goal_id = ` +3. At context assembly time, tool outputs are scored by relevance to the goal via: + - Token count (smaller = more compressible) + - Tool type (shell outputs compressed more aggressively than file reads) + - Goal distance (proximity to the core task path) +4. When the goal completes, TACO stops applying compression and returns to normal tool output display + +**Configuration:** + +```toml +[tools.compression] +enabled = true +goal_aware = true # Enable goal-aware compression (default: false) +compression_threshold_tokens = 300 # Compress outputs larger than this (default: 300) +preserve_shell_errors = true # Never compress shell commands with exit_code != 0 (default: true) + +# Compression strategies per tool type +[tools.compression.strategies] +bash = "aggressive" # Compress shell output to 2-3 lines +read = "moderate" # Keep file read outputs; only trim beyond 500 chars +web_scrape = "moderate" # Keep scrape results; summarize only if > 1000 chars +find_path = "aggressive" # Compress find results to "X files matching pattern" +``` + +When `goal_aware = true`, the compression strategy dynamically adjusts based on task relevance. A `grep` result that mentions the active goal's API function is preserved; one that mentions unrelated code is summarized. 
+**Example:** + +```text +# Without TACO +$ bash command: "cargo build --release" +[output: 50 lines of compiler messages] + +$ read file: "src/lib.rs" +[output: 200 lines of source code] + +# With TACO (goal_aware=true, active goal is "add error handling") +$ bash command: "cargo build --release" +[error handling additions: 3 relevant compiler messages; 47 others elided] + +$ read file: "src/lib.rs" +[read src/lib.rs: 200 lines] (preserved because goal-adjacent; file reads not compressed) +``` + +## Capability Governance: TrajectorySentinel and ScopedToolExecutor + +Tool execution can be gated by external security or governance policies. Two mechanisms work together: + +### TrajectorySentinel + +`TrajectorySentinel` observes the trajectory (sequence) of tool calls across a session and blocks calls that violate a learned policy. It learns patterns from: + +- **Prior sessions** — tool sequences that caused errors, security violations, or policy breaches +- **User feedback** — when the user marks a tool result as "unacceptable" or "revoke", that sequence is marked as off-limits +- **Static allowlist** — tools listed in `[tools.governance]` are always available + +Enable trajectory-based blocking: + +```toml +[tools.governance] +trajectory_enabled = true +block_risky_patterns = true # Default: false (off unless explicitly enabled) +blocked_sequences = [ + ["bash", "rm", "-rf", "/"], # Never allow a full filesystem delete + ["write", "config.toml", "password"], # Never write credentials to config +] +``` + +The sentinel stores successful and failed sequences in SQLite and uses them to score subsequent invocations. A tool call can be blocked if: + +- Its sequence matches a `blocked_sequences` entry +- Its sequence is semantically similar to a recent error sequence (via embedding similarity) + +### ScopedToolExecutor + +`ScopedToolExecutor` wraps an inner executor and applies permission checks before delegating. It enforces: + +1. 
**Per-tool access control** — which tools can be invoked (allowlist or denylist) +2. **Per-parameter validation** — constraints on file paths, command content, URL domains +3. **Runtime permission escalation** — tools requiring higher trust level prompt the user before execution + +```toml +[tools.scoped] +enabled = true + +# Deny list: block specific tools +denied_tools = ["delete_path", "bash"] + +# Allow list: only these tools are available (if set, denied_tools is ignored) +# allowed_tools = ["read", "write", "fetch"] + +# Per-tool parameter constraints +[[tools.scoped.constraints]] +tool = "bash" +deny_patterns = ["rm -rf", "sudo", ":(){:|:|:|:}"] # block dangerous commands + +[[tools.scoped.constraints]] +tool = "write" +allowed_paths = ["/tmp", "/workspace"] # only write to these directories +``` + +When a tool invocation violates a constraint, the agent receives an error message indicating which constraint was violated. The user can override with `/approve ` if they trust the specific invocation. + +Both mechanisms complement file path sandboxing and OS-level process sandboxing — they add policy enforcement at the Zeph orchestration layer. + +## Per-Turn Execution Context + +`ShellExecutor` maintains a per-turn `ExecutionContext` that persists across iterations within a single agent turn. This context includes: + +- **Working directory** — set by the user or previous tool invocation; carries forward to subsequent commands +- **Environment variable overrides** — set via `export` or shell commands +- **Session history** — command history from previous iterations, available via shell history commands +- **Parsed state** — extracted values from previous tool outputs (e.g., URLs, file paths, parsed JSON) + +The context is created at the start of each turn and discarded when the turn completes, ensuring tool outputs don't bleed into subsequent unrelated conversations. 
+ +```bash +> cd /path/to/project +[bash] cd /path/to/project + +> cargo build +[bash] cargo build # runs in /path/to/project (context persisted) + +> find src -name "*.rs" | head +[bash] find src -name "*.rs" | head # also runs in /path/to/project +``` + +## Goal Lifecycle and TACO Output Compression + +When the agent is running toward an explicit goal (via `/plan` or `[agent] goal_text` config), tool outputs are evaluated for relevance to that goal. TACO (Token-Aware Compression Orchestration) applies goal-aware output filtering that removes off-topic information. + +During each tool invocation: + +1. **Goal relevance scoring** — TACO scores the tool output for relevance to the current goal using embedding similarity +2. **Compression** — Off-topic sections are replaced with `[output filtered: ]` placeholders +3. **Preservation** — Output directly matching the goal or containing errors is always preserved + +Enable TACO by setting a goal: + +```bash +> /plan Implement authentication middleware for the REST API +``` + +Configuration for compression thresholds: + +```toml +[tools.compression] +goal_relevance_threshold = 0.5 # Skip sections with relevance < 0.5 +preserve_errors = true # Always keep error messages +max_preserved_chars = 4096 # Hard limit on preserved output size +``` + +When no goal is active, TACO is disabled and all tool output is preserved. + +## TrajectorySentinel and ScopedToolExecutor + +To prevent tool misuse and enforce capability governance, Zeph optionally wraps executors with `TrajectorySentinel` (tracks execution patterns) and `ScopedToolExecutor` (enforces per-user scope and trust levels). 
+ +`ScopedToolExecutor` ensures that: + +- **Per-user scope** — tools run as the configured user (e.g., `www-data` for web services), not the agent process owner +- **Trust delegation** — sensitive tools (e.g., `rm`, `sudo`) require an elevated trust level +- **Capability auditing** — all tool invocations are logged with user, timestamp, and scope context + +Enable scoped execution via `[tools.scope]`: + +```toml +[tools.scope] +enabled = true +run_as_user = "zeph" # Execute tools as this user (via sudo if needed) +require_capability = false # Require elevated permissions +audit_all_invocations = true # Log every tool call +``` + +When enabled, the executor constructs a `ToolScope` binding the user identity, permission level, and audit context. The scope is passed through all tool execution layers — file access, shell commands, and MCP tools are all aware of and respect the scope. + +> [!WARNING] +> Scope enforcement requires the agent to run with sufficient privileges (typically `root` or via `sudo`) to switch user contexts. Running as an unprivileged user with `run_as_user = "other-user"` will fail with a permission error. + ## Permissions The `[tools.permissions]` section defines pattern-based access control per tool. Each tool ID maps to an ordered array of rules. Rules use glob patterns matched case-insensitively against the tool input (command string for `bash`, file path for file tools). First matching rule wins; if no rule matches, the default action is `Ask`. diff --git a/book/src/concepts/graph-memory.md b/book/src/concepts/graph-memory.md index f32073682..53b27acc8 100644 --- a/book/src/concepts/graph-memory.md +++ b/book/src/concepts/graph-memory.md @@ -92,6 +92,30 @@ After each user message is persisted, Zeph spawns a background extraction task ( Extraction runs non-blocking via `spawn_graph_extraction` — the agent loop continues without waiting for it to finish. 
A configurable timeout (`extraction_timeout_secs`, default: 15) prevents slow LLM calls from accumulating. +### Using a Dedicated Provider for Extraction + +Graph extraction tasks produce JSON-structured responses that have low prompt/response cosine similarity (~0.55–0.70). When a routing quality gate is active (via `[llm.router] quality_gate`), extraction calls may be systematically rejected by the gate and rerouted through fallback providers, adding unnecessary latency. + +To avoid quality gate false positives, dedicate a provider to graph extraction tasks: + +```toml +[[llm.providers]] +name = "fast" +type = "ollama" +model = "qwen3:8b" + +[memory.graph] +enabled = true +extract_provider = "fast" # Use the "fast" provider for extraction, bypassing quality gate +max_entities_per_message = 10 +max_edges_per_message = 15 +``` + +When `extract_provider` is set to a named provider, graph extraction (and downstream note linking and community summarization) use that provider without routing signals or quality gates applied. When empty (default), the system uses the agent's primary provider. + +> [!TIP] +> For best results, match `extract_provider` to the provider name used by `extract_model`. If `extract_model = "gpt-4o-mini"`, use a provider entry with `type = "openai"` and `model = "gpt-4o-mini"`, then set `extract_provider` to that provider's name. + ### Security Messages flagged with injection patterns are excluded from extraction. When the content sanitizer detects injection markers (`has_injection_flags = true`), `maybe_spawn_graph_extraction` returns early without queuing any work. This prevents untrusted content from poisoning the knowledge graph. 
diff --git a/book/src/concepts/hooks.md b/book/src/concepts/hooks.md index 1c43528ad..3ce1b054d 100644 --- a/book/src/concepts/hooks.md +++ b/book/src/concepts/hooks.md @@ -94,6 +94,40 @@ args = ["check", "--quiet"] | `hooks.file_changed.handlers[].command` | `string` | — | Executable to run | | `hooks.file_changed.handlers[].args` | `Vec` | `[]` | Arguments (env vars expanded) | +## Hook Tracing and Instrumentation + +All hook execution is instrumented with distributed tracing. Each hook invocation generates: + +- `zeph.hooks.cwd_changed` span — execution of a `cwd_changed` hook +- `zeph.hooks.file_changed` span — execution of a `file_changed` hook + +Spans include: + +| Attribute | Value | +|-----------|-------| +| `hook.command` | Executable name (e.g., `cargo`, `git`) | +| `hook.args` | Full argument list | +| `hook.duration_ms` | Execution wall-clock time | +| `hook.exit_code` | Process exit code (if available) | + +Traces are exported to your configured telemetry backend (local Chrome JSON or Jaeger OTLP) and are visible in profiling tools like Perfetto. This allows you to identify slow hooks and optimize them. + +## Hook Propagation on Config Reload + +When `zeph reload-config` is called (or config changes are hot-reloaded), hooks are immediately re-parsed and re-registered. The TUI and scheduler receive hook update notifications so they can reconfigure watchers without restarting. + +For `file_changed` hooks: +1. Old watchers are stopped +2. New watch paths are parsed from the updated config +3. Handlers are registered with the new watcher +4. The next file modification triggers the updated hooks + +For `cwd_changed` hooks: +1. The hook list is updated in memory +2. The next working directory change fires the new hooks + +This enables configuration updates without restarting the agent process. + ## Reactive Events Zeph fires reactive events when the environment changes beneath the agent. 
Events are processed synchronously before the next agent turn, ensuring hooks complete before the LLM sees the updated context. diff --git a/book/src/concepts/memory.md b/book/src/concepts/memory.md index 71f2b7ab7..501198559 100644 --- a/book/src/concepts/memory.md +++ b/book/src/concepts/memory.md @@ -283,6 +283,47 @@ contradiction_risk_threshold = 0.7 # Flag if graph edges show conflict Quality gate operates downstream of A-MAC admission, making both gates independent and composable. +### APEX-MEM: Advanced Quality Gating + +APEX-MEM (Adaptive Page Extraction and eXtension Memory) provides an advanced quality validation layer that runs during the memory write path. When enabled, candidate memories are validated using a multi-dimensional scoring system before being admitted into the vector store. + +**Key features:** + +- **insert_or_supersede semantics** — when a high-confidence fact contradicts an existing memory, the system promotes the newer fact and marks the older one as superseded rather than keeping both +- **Multi-dimensional validation** — scores candidates on information density (entropy), citation quality (reference completeness), and factual confidence +- **Fail-open design** — validation errors are logged but never block writes; the message is admitted with conservative default scores + +Enable APEX-MEM in `[memory.quality_gate]`: + +```toml +[memory.quality_gate] +enabled = true +use_advanced_scoring = true # Enable APEX-MEM multi-dimensional validation (default: false) +information_value_threshold = 0.3 # Skip admission if similarity exceeds this +reference_completeness_threshold = 0.5 # Require pronoun/deictic clarity +contradiction_risk_threshold = 0.7 # Flag if graph edges show conflict +``` + +When `use_advanced_scoring = true`, each candidate message receives three independent scores: + +| Dimension | Meaning | Score Range | +|-----------|---------|-------------| +| **Information density** | How much unique information vs. 
repetition | 0.0–1.0 (higher = more useful) | +| **Citation quality** | Whether meaning is self-contained or deictic | 0.0–1.0 (higher = clearer standalone) | +| **Confidence** | Absence of hedging markers ("I think", "maybe", etc.) | 0.0–1.0 (higher = more confident) | + +The composite score is a weighted blend: `0.35 * density + 0.35 * citation + 0.30 * confidence`. Messages scoring below `information_value_threshold` are rejected. + +**insert_or_supersede behavior:** + +When a new memory contradicts an existing one (detected via graph edge conflicts), APEX-MEM evaluates both the old and new facts: + +1. If the new fact has a higher combined `confidence` + `information_density` score, it is inserted and the old fact is marked `superseded_by = <new_fact_id>` +2. If the old fact scores higher, it is retained and the new fact is silently rejected +3. If the two scores are within `contradiction_margin` (default: 0.05) of each other, both facts are kept and a contradiction flag is set in the graph for later resolution + +This enables natural knowledge evolution without vector index bloat from conflicting information. + ### RL-Based Admission Strategy The default `heuristic` strategy uses static weights and an optional LLM call for the `future_utility` factor. The `rl` strategy replaces the `future_utility` LLM call with a trained logistic regression model that learns from actual recall outcomes. @@ -747,6 +788,88 @@ scene_sweep_interval_secs = 7200 # how often the scene consolidation sweep > [!NOTE] > `scene_similarity_threshold` is validated to be in `[0.5, 1.0]` and `scene_batch_size` must be `>= 1`. Invalid values are rejected at startup. +## MemCoT: Semantic State Accumulation + +MemCoT (Memory Chain-of-Thought) tracks the agent's semantic understanding state across turns via incremental entity and value updates. Instead of storing discrete messages, MemCoT accumulates fact streams that represent how the agent's model of the world evolves — capturing decisions, contradiction resolutions, and inferred conclusions. 
+ +The `SemanticStateAccumulator` maintains: + +- **Entity snapshots** — current values for tracked entities (project status, decision state, file paths) +- **Contradiction flags** — when the agent detects conflicting information, flags the conflict and records the resolution +- **Decision ledger** — explicit decisions made by the user or agent (e.g., "switched from vim to neovim", "decided to use Claude instead of Ollama") +- **Inferred states** — conclusions drawn from multiple facts (e.g., "auth module is now stable" inferred from "all tests pass + no open issues") + +MemCoT is complementary to traditional semantic recall: while vector search finds *similar messages*, MemCoT finds *related state transitions*. This is particularly useful for: + +- **Long explorations** — tracking how a codebase design evolved over 50+ turns +- **Decision audits** — "why did we choose X?" answered by the decision ledger +- **Contradiction resolution** — detecting when the agent's context drifts and needs correction + +### Zoom-In / Zoom-Out Recall Views + +MemCoT supports two complementary query patterns for state retrieval: + +**Zoom-in** — Retrieve the full derivation chain for a specific fact. Given a state like "auth module is stable", the zoom-in view returns: + +1. The current fact value ("auth module is stable on commit abc123") +2. All intermediate facts that contributed to this inference ("all tests pass", "no open issues", "PR #42 merged") +3. The contradiction resolution history if this fact superseded an earlier conflicting state +4. The decision events that led to the conclusion (e.g., "user confirmed code review complete") + +The depth is bounded by `zoom_in_max_depth` to prevent returning derivation chains deeper than human working memory can follow. + +**Zoom-out** — Retrieve only high-level state transitions without intermediate details. 
Given 50 turns of development, zoom-out returns: + +- Aggregation level 1 (facts) — all state transitions with equal weight +- Aggregation level 2 (decisions) — only explicit user or agent decisions (default) +- Aggregation level 3 (milestones) — major milestone decisions (e.g., "architecture chosen", "first deploy") + +The aggregation level is set via `zoom_out_level`. Higher levels reduce token usage by suppressing intermediate inferences and focusing on decision points. + +**Configuration:** + +```toml +[memory.memcot] +enabled = true +accumulator_provider = "fast" # Provider for state summarization; falls back to primary +zoom_in_max_depth = 5 # Max steps in derivation chain (>= 1) +zoom_out_level = 2 # Aggregation level: 1=facts, 2=decisions, 3=milestones +``` + +**Injection into context:** + +When enabled, MemCoT state snapshots are stored in SQLite with timestamps and source facts. At context assembly time, both zoom views are injected before semantic recall results: + +1. Zoom-in for facts matching the current query (deep causality view) +2. Zoom-out for recent state transitions (high-level summary view) + +The dual-recall design allows the agent to answer both deep "why did we choose X?" questions (via zoom-in derivations) and strategic "what's changed since last session?" questions (via zoom-out aggregates). + +**Examples:** + +``` +Zoom-in query: "Why is the payment module blocked?" +Returns: payment module is blocked (current) ← pending legal review ← GDPR compliance ← user decision to add GDPR +(4 steps: decision → inference → inference → current) + +Zoom-out query: "What happened in this session?" 
+Returns: (Decision) switched from SQLite to PostgreSQL; (Milestone) schema v3 deployed; (Decision) enabled read replicas +(3 decision-level events, intermediate facts hidden) +``` + +## Memory Retrieval Failure Logging + +When a semantic memory search returns zero results or falls below the confidence threshold, Zeph optionally records this in the `memory_retrieval_failures` table. This supports the OmniMem self-improvement loop: by analyzing patterns in no-hit turns, the memory admission and recall systems can be tuned to improve coverage. + +Enable failure logging in `[memory]`: + +```toml +[memory] +log_retrieval_failures = true # Record no-hit recalls for analysis +``` + +Logged failures include the query, timestamp, applied filters, and confidence score. A background analyzer can use these logs to detect categories of questions your memory system fails on and adjust admission strategies accordingly. + ## Next Steps - [Set Up Semantic Memory](../guides/semantic-memory.md) — Qdrant setup guide diff --git a/book/src/concepts/providers.md b/book/src/concepts/providers.md index 573e0aa6b..fb2994747 100644 --- a/book/src/concepts/providers.md +++ b/book/src/concepts/providers.md @@ -9,6 +9,7 @@ Zeph supports multiple LLM backends. Choose based on your needs: | OpenAI | Cloud | Yes | Yes | Yes | Ecosystem, GPT-4o, GPT-5 | | Gemini | Cloud | Yes | Yes | Yes | Google ecosystem, long context, extended thinking | | Compatible | Cloud | Varies | Varies | Varies | Together AI, Groq, Fireworks | +| Gonka | Decentralized | Yes | Via compatible | Yes | Privacy, decentralized inference, cost control | | Candle | Local | No | No | No | Minimal footprint | Claude does not support embeddings natively. Use a multi-provider setup with `embed = true` on an Ollama or OpenAI provider entry to combine Claude chat with local embeddings. Gemini supports embeddings via the `text-embedding-004` model — set `embedding_model` in the Gemini `[[llm.providers]]` entry to enable. 
@@ -41,6 +42,20 @@ ZEPH_LLM_PROVIDER=openai ZEPH_OPENAI_API_KEY=sk-... zeph ZEPH_LLM_PROVIDER=gemini ZEPH_GEMINI_API_KEY=AIza... zeph ``` +**Gonka (native)**: + +```bash +zeph vault set ZEPH_GONKA_PRIVATE_KEY +zeph init # select "Gonka (native)" when prompted +``` + +**Gonka (GonkaGate)**: + +```bash +zeph vault set ZEPH_COMPATIBLE_GONKAGATE_API_KEY gp-... +zeph init # select "Gonka (GonkaGate)" when prompted +``` + ## Gemini Zeph supports Google Gemini as a first-class LLM backend. Gemini is a strong choice when you want access to Google's latest models (Gemini 2.5 Pro, Gemini 2.0 Flash), very long context windows, extended thinking, or native multimodal reasoning. @@ -134,7 +149,7 @@ Change the `type` field in the `[[llm.providers]]` entry. All skills, memory, an ```toml [llm] [[llm.providers]] -type = "claude" # ollama, claude, openai, gemini, candle, compatible +type = "claude" # ollama, claude, openai, gemini, gonka, candle, compatible model = "claude-sonnet-4-6" ``` diff --git a/book/src/guides/gonka.md b/book/src/guides/gonka.md index 9d8a3ae14..9b68930a4 100644 --- a/book/src/guides/gonka.md +++ b/book/src/guides/gonka.md @@ -2,6 +2,12 @@ [Gonka](https://gonka.ai) is a decentralized AI inference network built on a Cosmos-SDK chain that routes LLM requests to a peer-to-peer pool of GPU operators. Zeph supports two access paths. +Gonka is particularly useful for: + +- **Privacy-preserving inference** — Requests are signed with your key; no account credentials stored on Gonka servers +- **Cost control** — Direct token consumption with no markup or subscription fees +- **Decentralization** — Work is distributed across independent GPU operators + ## Path A: GonkaGate (Recommended for quick start) GonkaGate is a hosted gateway to the Gonka network with USD-denominated billing — no token staking required. @@ -73,6 +79,93 @@ address = "gonka1..." **Pricing:** GNK token consumption per inference. 
+## How GonkaProvider Works + +The native Gonka integration (Path B) uses three components working together: + +### RequestSigner + +`RequestSigner` handles request authentication using your secp256k1 private key. Every request is signed with: + +1. **Request serialization** — The message payload (chat parameters, tools, etc.) is serialized to JSON +2. **Signing** — The payload is signed using secp256k1 ECDSA with your private key +3. **Envelope** — The signature and public key are included in the request headers + +### EndpointPool + +`EndpointPool` manages multiple Gonka nodes for redundancy and load distribution: + +- Maintains a pool of healthy node endpoints from `[[llm.providers.gonka_nodes]]` entries +- Performs health checks to detect unavailable nodes +- Routes requests round-robin across available nodes +- Falls back to alternative nodes on failure + +### Capabilities + +GonkaProvider supports all standard Zeph LLM capabilities: + +| Capability | Supported | Notes | +|------------|-----------|-------| +| Chat (single-turn) | Yes | Standard text-to-text inference | +| Chat streaming (SSE) | Yes | Streaming tokens via Server-Sent Events | +| Tool use (function calling) | Yes | Full tool definitions and results supported | +| Tool streaming | Yes | Incremental tool call generation during streaming | +| Embeddings | Yes | Vector generation for semantic memory and skill matching | +| Vision (image input) | Via compatible models | Use base64-encoded images | + +## Configuration Details + +### Full Native Gonka Config Example + +```toml +[llm] + +[[llm.providers]] +type = "gonka" +name = "gonka-mainnet" +model = "gpt-4o" +gonka_chain_prefix = "gonka" +max_tokens = 4096 + +# List of available inference nodes +[[llm.providers.gonka_nodes]] +url = "https://node1.gonka.ai" +address = "gonka1acnx3cpm8cz5nqu24aql4cqx5fxqm9w4vf2hqr" + +[[llm.providers.gonka_nodes]] +url = "https://node2.gonka.ai" +address = "gonka1bcx3cpm8cz5nqu24aql4cqx5fxqm9w4vf2xyz" + 
+[[llm.providers.gonka_nodes]] +url = "https://node3.gonka.ai" +address = "gonka1ccx3cpm8cz5nqu24aql4cqx5fxqm9w4vf2abc" +``` + +### Combining Gonka with Local Embeddings + +If you want Gonka for chat but prefer local embeddings for cost reasons: + +```toml +[[llm.providers]] +type = "gonka" +name = "gonka-chat" +model = "gpt-4o" +gonka_chain_prefix = "gonka" +default = true # use for chat + +[[llm.providers]] +type = "ollama" +name = "local-embed" +embedding_model = "nomic-embed-text" +embed = true # use for embeddings + +[memory.semantic] +embed_provider = "local-embed" + +[skills] +embedding_provider = "local-embed" +``` + ## Troubleshooting Run the built-in diagnostic tool to check credentials and node reachability: diff --git a/book/src/guides/mcp.md b/book/src/guides/mcp.md index fe640ea26..6ed0c900e 100644 --- a/book/src/guides/mcp.md +++ b/book/src/guides/mcp.md @@ -82,6 +82,71 @@ Add and remove MCP servers at runtime via chat commands: After adding or removing a server, Qdrant registry syncs automatically for semantic tool matching. +## MCP Server Startup and Retry + +When Zeph starts, it attempts to connect to all configured MCP servers in parallel. Servers that fail to start (e.g., missing binary, network timeout, or slow startup) are automatically retried with exponential backoff. + +**Retry behavior:** + +1. **Initial connection attempt** — each server gets `startup_timeout` seconds to respond (default: 30 seconds) +2. **Failure detection** — if the server fails to initialize, auto-retry begins +3. **Exponential backoff** — subsequent attempts wait 1s, 2s, 4s, 8s, etc. up to `max_retry_interval_secs` +4. **Eventual availability** — servers are marked unavailable after `max_retries` attempts, but Zeph continues running without them +5. 
**Runtime reconnection** — if a server was unavailable at startup but comes online later, you can reconnect it manually via `/mcp add` + +**Configuration:** + +```toml +[mcp] +startup_timeout_secs = 30 # Max time to wait for server initialization (default: 30) +max_retries = 5 # Max reconnection attempts before giving up (default: 5) +initial_retry_interval_secs = 1 # Starting backoff interval (default: 1) +max_retry_interval_secs = 60 # Max backoff interval (default: 60) +``` + +**Example timeline:** + +``` +Start → stdio server fails (can't find binary) + → Retry 1: wait 1s, attempt 2 fails + → Retry 2: wait 2s, attempt 3 fails + → Retry 3: wait 4s, attempt 4 succeeds ✓ + +OR + +Start → stdio server fails (timeout) + → Retry 1: wait 1s, attempt 2 fails + → Retry 2: wait 2s, attempt 3 fails + → Retry 3: wait 4s, attempt 4 fails + → Retry 4: wait 8s, attempt 5 fails + → Retry 5: wait 16s, attempt 6 fails + → Server marked unavailable; Zeph continues without it + → User can retry: /mcp add filesystem ... +``` + +Failed servers are logged with their error messages. Set `RUST_LOG=debug` to see detailed retry logs: + +``` +2025-05-06T10:30:15Z DEBUG mcp.startup: initializing server id=filesystem attempt=1 +2025-05-06T10:30:16Z WARN mcp.startup: server filesystem failed: timeout after 30s; will retry +2025-05-06T10:30:17Z DEBUG mcp.startup: initializing server id=filesystem attempt=2 retry_interval=1s +``` + +**Accommodating slow servers:** + +If a particular server is slow to start but necessary for your workflow, increase its per-server timeout: + +```toml +[[mcp.servers]] +id = "slow-analyzer" +command = "python3" +args = ["-m", "my_mcp_server"] +startup_timeout = 60 # give this server 60 seconds instead of 30 +``` + +> [!TIP] +> Exponential backoff prevents the agent startup from hanging indefinitely on flaky servers. If a server consistently fails, consider whether it's essential. If not, remove it from the config to speed up startup. 
+ ## Native Tool Integration (Claude / OpenAI) MCP tools are exposed as native `ToolDefinition`s alongside built-in tools. All providers use the same structured tool calling path. diff --git a/book/src/reference/cli.md b/book/src/reference/cli.md index a8c34444d..6c8ece943 100644 --- a/book/src/reference/cli.md +++ b/book/src/reference/cli.md @@ -16,6 +16,7 @@ zeph [OPTIONS] [COMMAND] | `agents` | Manage sub-agent definitions — list, show, create, edit, delete (see [Sub-Agent Orchestration](../advanced/sub-agents.md#managing-definitions)) | | `skill` | Manage external skills — install, remove, verify, trust (see [Skill Trust Levels](../advanced/skill-trust.md)) | | `memory` | Export and import conversation history snapshots | +| `project` | Project-level management — purge all local state (see below) | | `vault` | Manage the age-encrypted secrets vault (see [Secrets Management](security.md#age-vault)) | | `router` | Inspect or reset Thompson Sampling router state (see [Adaptive Inference](../advanced/adaptive-inference.md)) | | `ingest` | Ingest a document or directory into semantic memory (Qdrant collection) | @@ -144,6 +145,59 @@ zeph memory tree The snapshot format is versioned (currently v1). Import uses `INSERT OR IGNORE` — re-importing the same file is safe and skips existing records. +### `zeph project` + +Manage project-level state and cleanup. 
+ +| Subcommand | Description | +|------------|-------------| +| `project purge` | Remove all project-local state (database, logs, debug artifacts, Qdrant collections) with safety checks | + +**`zeph project purge` options:** + +| Flag | Short | Description | +|------|-------|-------------| +| `--config <PATH>` | `-c` | Path to config file (defaults to standard search path) | +| `--dry-run` | | Show what would be removed without deleting anything | +| `--yes` | `-y` | Skip confirmation prompt (database lock check is never skipped) | + +**Removes:** + +- SQLite database file (`zeph.db`) and its siblings (`zeph.db-wal`, `zeph.db-shm`) +- Main log file and any rotated log files +- Scheduler daemon log and PID file +- Debug dump artifacts directory +- Trace files directory +- Audit log file (if configured as a file path) +- All 10 known Qdrant collections (when `vector_backend = "qdrant"`) + +**Safety:** + +- Pre-flight exclusive lock check on the SQLite database — aborts immediately if an agent session is running +- Database lock check is always enforced, even with `-y` +- Respects vector backend configuration: skips Qdrant when `vector_backend = "sqlite"` +- Respects database configuration: skips SQLite file deletion when using PostgreSQL + +```bash +# Preview what would be removed +zeph project purge --dry-run + +# Remove all project state (after confirmation) +zeph project purge + +# Remove without confirmation (but DB lock check still applies) +zeph project purge -y + +# Use a custom config path +zeph project purge --config ~/.zeph/custom-config.toml --yes +``` + +> [!WARNING] +> `zeph project purge` is destructive. This action cannot be undone. Ensure you have backups if you need to preserve any state. + +> [!TIP] +> Use `--dry-run` first to see the byte counts that would be deleted. This helps you estimate storage recovery and verify the correct state will be removed. + ### `zeph agents` Manage sub-agent definition files. 
See [Managing Definitions](../advanced/sub-agents.md#managing-definitions) for examples and field details. diff --git a/crates/zeph-core/README.md b/crates/zeph-core/README.md index 205b04dd1..bcd5bb2fc 100644 --- a/crates/zeph-core/README.md +++ b/crates/zeph-core/README.md @@ -343,9 +343,34 @@ In-session commands for autonomous self-experimentation (requires `experiments` > [!TIP] > The same CRUD operations are available interactively in the TUI agents panel — press `a` in the TUI to open the panel, then `c` (create), `e` (edit), `d` (delete), Enter (detail view). +## Speculative tool dispatch + +`SpeculationEngine` pre-runs read-only tool calls while the LLM generates its response. Two activation paths are supported: + +- **SSE decoding path** — `claude_sse_to_tool_stream` emits `ToolBlockStart` at `content_block_start`; when confidence exceeds `confidence_threshold`, `try_dispatch(Trusted)` fires with a 2 s timeout. +- **PASTE pattern path** — `run_paste_skill_activation` calls `PatternStore::predict` per active skill and dispatches candidates above threshold with per-skill trust; `observe_paste_transition` records transitions for future pattern learning. + +`requires_confirmation` defaults to `true` for all executors, making speculative dispatch safe-by-default. Only executors that explicitly opt out can be speculatively dispatched. + +Configure via `[tools.speculative]` in `config.toml`: + +```toml +[tools.speculative] +mode = "decoding" # "off" | "decoding" | "pattern" +confidence_threshold = 0.8 +timeout_ms = 2000 +``` + +> [!NOTE] +> The speculation engine only runs when the agent is not in `--bare` mode. Committed speculative results that carry `ToolError::ConfirmationRequired` trigger a `tracing::error!` in debug builds, making the invariant machine-checkable at zero release cost. + +## Goal lifecycle and TACO output compression + +`GoalLifecycle` tracks active goals across turns. 
Tool outputs for completed or stale goals are compressed by the TACO (Tool-Aware Compaction Optimization) pipeline, which archives bodies to SQLite before the LLM compaction call and injects UUID back-references into the resulting summary. + ## Reactive hooks -`[hooks]` in `config.toml` defines shell commands that fire on working-directory or file-change events. +`[hooks]` in `config.toml` defines shell commands that fire on working-directory or file-change events. Hooks are now traced with `tracing` instrumentation and are propagated correctly through `reload_config` — hooks registered after a live config reload fire identically to those present at startup. ```toml [[hooks.cwd_changed]] diff --git a/crates/zeph-llm/README.md b/crates/zeph-llm/README.md index bc21f7f79..a5d86e6ae 100644 --- a/crates/zeph-llm/README.md +++ b/crates/zeph-llm/README.md @@ -5,11 +5,11 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](../../LICENSE) [![MSRV](https://img.shields.io/badge/MSRV-1.95-blue)](https://www.rust-lang.org) -LLM provider abstraction with Ollama, Claude, OpenAI, Gemini, and Candle backends. +LLM provider abstraction with Ollama, Claude, OpenAI, Gemini, Gonka, and Candle backends. ## Overview -Defines the `LlmProvider` trait and ships concrete backends for Ollama, Claude, OpenAI, Google Gemini, and OpenAI-compatible endpoints. Includes an orchestrator for multi-model coordination, a router for model selection, an optional Candle backend for local inference, and an SQLite-backed response cache with blake3 key hashing and TTL expiry. +Defines the `LlmProvider` trait and ships concrete backends for Ollama, Claude, OpenAI, Google Gemini, Gonka (signed native transport), and OpenAI-compatible endpoints. Includes an orchestrator for multi-model coordination, a router for model selection, an optional Candle backend for local inference, and an SQLite-backed response cache with blake3 key hashing and TTL expiry. 
## Key modules @@ -21,6 +21,7 @@ Defines the `LlmProvider` trait and ships concrete backends for Ollama, Claude, | `openai` | OpenAI backend with `with_client()` builder for shared `reqwest::Client` | | `gemini` | Google Gemini backend (`generateContent` + `streamGenerateContent?alt=sse`); system prompt mapped to `systemInstruction`, `assistant` role to `"model"`, consecutive same-role message merging, thinking parts surfaced as `StreamChunk::Thinking`, `functionCall` parts in SSE stream emitted as `StreamChunk::ToolUse`; configured via `[llm.gemini]` and `ZEPH_GEMINI_API_KEY` | | `compatible` | Generic OpenAI-compatible endpoint backend | +| `gonka` | Gonka native inference backend — signed HTTP transport via `RequestSigner` (HMAC-based), `EndpointPool` for weighted multi-node load balancing; supports `chat`, `chat_stream`, `embed`, and `chat_with_tools` | | `candle_provider` | Local inference via Candle (optional feature) | | `orchestrator` | Multi-model coordination and fallback; `send_with_retry()` helper deduplicates retry logic | | `router` | Model selection and routing logic with two strategies: EMA latency tracking and Thompson Sampling (Beta distributions). `RouterProvider` dispatches to the configured strategy and records outcomes per provider. Providers stored as `Arc<[AnyProvider]>` — `clone()` on every LLM request is O(1) regardless of chain length | @@ -137,6 +138,35 @@ feature_dim = 8 # dimensionality of the context feature vector > [!TIP] > Inspect learned weights and UCB scores with `zeph router stats` (same command as Thompson Sampling) or `/router stats` in the TUI. +## Gonka native provider + +`GonkaProvider` connects to Gonka inference nodes using a signed transport. Requests are signed per-call via `RequestSigner` using an HMAC key stored in the age vault under `ZEPH_GONKA_API_KEY`. `EndpointPool` distributes load across nodes by weight and falls back automatically when a node is unreachable. 
+ +Supported operations: `chat`, `chat_stream`, `embed`, `chat_with_tools`, and `chat_typed` (structured output via JSON Schema). + +```toml +[[llm.providers]] +name = "gonka" +type = "gonka" +model = "qwen3-235b" +default = true + +[[llm.gonka_nodes]] +url = "https://node.example.gonka.ai" +weight = 1 +``` + +Store the key in the vault: + +```bash +zeph vault set ZEPH_GONKA_API_KEY +``` + +Configure via the `--init` wizard by selecting the **GonkaGate / Gonka Native** option. + +> [!NOTE] +> The GonkaGate path (`type = "compatible"`) is still available for access via the OpenAI-compatible gateway. Use `type = "gonka"` for the native signed-transport path with full multi-node pool support. + ## SLM provider recommendations For cost-sensitive or resource-constrained deployments, the following Small Language Models are verified to work well with Zeph: diff --git a/crates/zeph-mcp/README.md b/crates/zeph-mcp/README.md index a17fb0169..90dfdfd8e 100644 --- a/crates/zeph-mcp/README.md +++ b/crates/zeph-mcp/README.md @@ -22,6 +22,30 @@ Implements the Model Context Protocol client for Zeph, managing connections to m - **prompt** — MCP prompt template support - **error** — `McpError` error types with typed `McpErrorCode` for retry classification (`Transient`, `RateLimited`, `InvalidInput`, `AuthFailure`, `ServerError`, `NotFound`, `PolicyBlocked`) +## Startup auto-retry + +When an MCP server fails to connect at startup, `McpManager` retries the connection with exponential backoff. The first retry fires after 1 s, doubling on each attempt up to a configurable cap, with jitter to prevent thundering-herd behaviour across multiple servers starting concurrently. 
+ +Configure per-server or globally: + +```toml +[mcp] +startup_retry_max_attempts = 5 +startup_retry_initial_delay_ms = 1000 +startup_retry_max_delay_ms = 30000 + +[[mcp.servers]] +id = "filesystem" +command = "npx" +args = ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"] +startup_retry_max_attempts = 3 # per-server override +``` + +HTTP 4xx authentication errors (`401`, `403`) are mapped to `McpError::HttpAuth` and are not retried — a permanent auth failure will not exhaust the retry budget. + +> [!TIP] +> Increase `startup_retry_max_attempts` for servers that have slow cold-start times (e.g. Docker-based servers that pull images on first run). + ## MCP Roots protocol The MCP client implements the `roots/list` handler, exposing configured project roots to MCP servers. Roots are declared via `[mcp.roots]` in config and passed to each server connection at initialization time. Servers that support `roots/list` can use this information to scope their file system access to the declared directories. diff --git a/crates/zeph-memory/README.md b/crates/zeph-memory/README.md index f497474d8..b72d29abb 100644 --- a/crates/zeph-memory/README.md +++ b/crates/zeph-memory/README.md @@ -350,6 +350,25 @@ timeout_ms = 500 # Activation timeout to prevent runaway trav recall_timeout_ms = 1000 # Timeout for the full graph recall call (default: 1000) ``` +## MemCoT semantic state accumulation + +`SemanticStateAccumulator` tracks entity salience and relationship density across turns, maintaining a rolling cognitive thread. Two recall views are supported: + +- **Zoom-In** — high-granularity retrieval that surfaces entity-level details and recent edge additions from the current cognitive thread. +- **Zoom-Out** — panoramic retrieval that activates community-level summaries and older superseded edges for historical perspective. 
+ +Memory retrieval failures are logged automatically for the OmniMem self-improvement loop — the agent can learn from misses without manual instrumentation. + +Configure via `[memory.memcot]`: + +```toml +[memory.memcot] +enabled = true +thread_depth = 10 # turns kept in the cognitive thread +zoom_in_top_k = 5 +zoom_out_top_k = 3 +``` + ## Importance scoring Messages are scored at write time via `compute_importance()`. The score is stored in the `importance_score` column (default 0.5 for legacy rows). When `importance_enabled = true` on `SemanticMemory`, recall results are blended with importance scores for content-aware ranking. diff --git a/crates/zeph-tools/README.md b/crates/zeph-tools/README.md index 23623eb18..d237e5ccd 100644 --- a/crates/zeph-tools/README.md +++ b/crates/zeph-tools/README.md @@ -111,6 +111,25 @@ failure_threshold = 0.7 auto_block = true ``` +## TrajectorySentinel and ScopedToolExecutor + +`TrajectorySentinel` monitors the sequence of tool calls across a session and flags anomalous capability escalation patterns — for example, when a tool chain attempts to access resources outside the declared project scope. Alerts are recorded as `SecurityEvent::Quarantine` entries in the metrics ring buffer. + +`ScopedToolExecutor` wraps any `ToolExecutor` and enforces a declared capability scope: a set of allowed filesystem paths and network hosts. Tool calls that would access resources outside the scope are blocked before execution, preventing lateral movement even when the underlying executor has broader permissions. + +Configure via `[tools.scoped]`: + +```toml +[tools.scoped] +enabled = true +allowed_paths = ["/home/user/project", "/tmp/zeph"] +allowed_hosts = ["api.openai.com", "qdrant.example.com"] +``` + +## Per-turn ExecutionContext + +`ShellExecutor` receives an `ExecutionContext` on each turn that carries the active goal ID, the current skill name, and the sub-agent identity (if running inside a sub-agent).
This context is recorded in every `AuditEntry`, enabling per-goal and per-skill attribution in the audit trail. + ## TAFC (Think-Augmented Function Calling) TAFC injects a reasoning step before tool selection, allowing the LLM to evaluate which tools are appropriate for the current task. Configure via `[tools.tafc]` in `config.toml`. diff --git a/crates/zeph-tui/src/widgets/snapshots/zeph_tui__widgets__splash__tests__splash_default.snap b/crates/zeph-tui/src/widgets/snapshots/zeph_tui__widgets__splash__tests__splash_default.snap index b30533516..1b6e3901f 100644 --- a/crates/zeph-tui/src/widgets/snapshots/zeph_tui__widgets__splash__tests__splash_default.snap +++ b/crates/zeph-tui/src/widgets/snapshots/zeph_tui__widgets__splash__tests__splash_default.snap @@ -1,5 +1,6 @@ --- source: crates/zeph-tui/src/widgets/splash.rs +assertion_line: 79 expression: output --- ┌──────────────────────────────────────────────────────────┐ @@ -14,7 +15,7 @@ expression: output │ ███████╗███████╗██║ ██║ ██║ │ │ ╚══════╝╚══════╝╚═╝ ╚═╝ ╚═╝ │ │ │ -│ v0.20.1 │ +│ v0.20.2 │ │ │ │ Type a message to start. │ │ │ diff --git a/specs/002-agent-loop/spec.md b/specs/002-agent-loop/spec.md index fb2f2b798..630037787 100644 --- a/specs/002-agent-loop/spec.md +++ b/specs/002-agent-loop/spec.md @@ -253,3 +253,176 @@ strip_orphaned_tool_results(messages: &mut Vec) - Removing an orphaned `tool_result` is silent (no WARN) unless `--debug-dump` is active - This is a correctness invariant, not a heuristic — a single orphaned `tool_result` causes a provider 400/422 error - NEVER send a `tool_result` whose `tool_use_id` is absent from the message list + +--- + +## Goal Lifecycle (#3567) + +The agent tracks a per-session *goal state* that reflects whether the current user +intent has been stated, is in progress, or has been completed. 
This is distinct from +the orchestration `TaskGraph` goal (which is a planned multi-step execution) — goal +lifecycle tracks the natural-language objective expressed by the user in conversation. + +### GoalState Machine + +``` +Idle ──(user message with goal)──► Active(goal_text) +Active ──(agent signals completion)──► Completed(goal_text) +Completed ──(new user message)──► Active(new_goal_text) +Active ──(/clear or session reset)──► Idle +``` + +`GoalState` is stored on `LifecycleState`. The active goal text is made available +as a template variable in the system prompt (`{current_goal}`) when configured. + +### Goal Completion Detection + +The agent detects goal completion via a lightweight heuristic: + +1. If the last assistant response contains a completion signal phrase (configurable + pattern list, e.g., "task complete", "done", "finished") and no tool calls were + emitted in that turn → transition `Active → Completed` +2. If the orchestration `TaskGraph` plan completes → `Active → Completed` +3. Explicit `/done` slash command → `Active → Completed` + +Completion transitions emit a `GoalCompleted` event to the channel (displayed as a +status message, not a user-facing message). + +### Config + +```toml +[agent.goal] +enabled = true +track_in_system_prompt = false # inject {current_goal} into system prompt +completion_phrases = ["task complete", "done", "finished", "completed"] +``` + +### Key Invariants + +- Goal lifecycle is informational — it does NOT block tool execution or LLM calls +- NEVER surface the `GoalState` value itself (the state-machine variant) to the LLM; it is agent-internal and operator-visible only. The only goal data the LLM may see is the goal text injected via `{current_goal}` when `track_in_system_prompt = true` +- The goal text is extracted from the first user message of the conversation; subsequent messages extend or replace the active goal heuristically + +--- + +## TACO Output Compression (#3591) + +TACO (Tool-output Automatic Compression and Offload) compresses large tool outputs +before they are injected into the context window.
This is a targeted pre-injection +pass, distinct from the turn-level compaction that runs at the 60/90% pressure gates. + +### When It Fires + +TACO is evaluated after each tool call result is received, before the result is +appended to `messages`: + +1. Measure the raw tool output token count via `tiktoken-rs` +2. If `token_count > taco_threshold` AND the tool is in the compressible-tool set + → run TACO compression +3. Compressed result replaces the raw result in `messages` + +### Compression Strategy + +TACO compression uses a fast prompt to summarize the tool output: + +``` +System: You are a concise tool-output summarizer. Preserve all data values, +file paths, exit codes, and structured content. Remove verbose headers and +repeated patterns. Target: under {target_tokens} tokens. +Tool output: +{raw_output} +``` + +The compressed result is tagged with `MessagePart::TacoCompressed` so the TUI and +audit log can distinguish it from raw output. + +### Compressible Tool Set + +Default: `["shell", "web_scrape", "read"]`. Configurable via +`[tools.taco] compressible_tools`. MCP tools are excluded from TACO by default +because their structured output schema is unknown. 
+ +### Config + +```toml +[tools.taco] +enabled = false # default off (opt-in) +taco_threshold = 2000 # tokens; compress outputs above this +target_tokens = 500 # target compressed size +taco_provider = "" # [[llm.providers]] name; empty = primary +compressible_tools = ["shell", "web_scrape", "read"] +``` + +### Key Invariants + +- TACO fires only on output that exceeds `taco_threshold`; short outputs are passed through untouched +- On compression failure (provider error, timeout) the **raw output is used** — TACO is best-effort +- NEVER compress `tool_result` messages from `execute_tool_call_confirmed` (fenced-block path) — user-approved results must not be silently summarized +- NEVER apply TACO to thinking blocks or system prompt parts +- `taco_provider` is resolved via the provider registry at runtime; empty = primary provider +- Compressed results carry `MessagePart::TacoCompressed` to make compression auditable + +--- + +## Per-Turn ExecutionContext (#3589) + +`ShellExecutor` now receives a per-turn `ExecutionContext` that carries the resolved +working directory and environment overrides for that specific turn. This replaces the +previous model where the working directory was a global field on `ShellExecutor`. + +### Contents + +```rust +pub struct ExecutionContext { + pub cwd: PathBuf, // resolved working directory for this turn + pub env: HashMap<String, String>, // turn-scoped env overrides (e.g., from hooks) + pub session: SessionId, // for audit correlation +} +``` + +### Propagation + +`ExecutionContext` is constructed at the start of `process_user_message()` from the +current `LifecycleState::cwd` and any active hook-injected env vars. It is passed to +`ShellExecutor::execute_with_context(&call, &ctx)` instead of reading from a shared +field.
+ +### Key Invariants + +- The `cwd` in `ExecutionContext` reflects the working directory **as of the start of the turn** — changes made by `set_working_directory` tool calls in the current turn take effect in the NEXT turn's context +- NEVER mutate the `ExecutionContext` during a turn — it is immutable after construction +- The `ExecutionContext` is not serialized or persisted — it is reconstructed each turn + +--- + +## Memory Retrieval Failure Logging (#3597) + +OmniMem self-improvement loop requires a dataset of memory retrieval failures. +Starting from PR #3597, `OmniMem::recall()` logs retrieval failures into the +`skill_outcomes` table (existing SQLite table used by self-learning) with +`outcome_type = "memory_miss"`. + +### Logged Fields + +| Field | Value | +|-------|-------| +| `outcome_type` | `"memory_miss"` | +| `query` | The original recall query string (truncated to 512 chars) | +| `strategy` | Recall strategy that was attempted (e.g., `"semantic"`, `"graph"`, `"hybrid"`) | +| `error` | Error message or "no_results" | +| `session_id` | Current session UUID | +| `ts` | Unix timestamp | + +### What Counts as a Failure + +- Qdrant query returns 0 results above the similarity threshold +- Qdrant query returns an error (network, timeout) +- Graph BFS returns 0 edges above the confidence threshold +- Hybrid recall produces 0 non-empty results after merging + +### Key Invariants + +- Failure logging is fire-and-forget — it MUST NOT block the recall return path +- Logged queries are truncated to 512 characters before storage — no unbounded writes +- Failure logs are NOT surfaced to the LLM or the user; they are operator/self-improvement data only +- `outcome_type = "memory_miss"` is a stable string — consumers (scheduler micro-benchmark) depend on it diff --git a/specs/004-memory/004-13-memory-memcot.md b/specs/004-memory/004-13-memory-memcot.md new file mode 100644 index 000000000..d080a3848 --- /dev/null +++ b/specs/004-memory/004-13-memory-memcot.md @@ 
-0,0 +1,213 @@ +--- +aliases: + - MemCoT + - SemanticStateAccumulator + - Zoom-In Recall + - Zoom-Out Recall +tags: + - sdd + - spec + - memory + - retrieval + - experimental +created: 2026-05-06 +status: implemented +related: + - "[[MOC-specs]]" + - "[[constitution]]" + - "[[004-memory/spec]]" + - "[[004-7-memory-apex-magma]]" + - "[[012-graph-memory/spec]]" + - "[[024-multi-model-design/spec]]" +--- + +# Spec: MemCoT — Test-Time Memory Chain-of-Thought + +> [!info] +> Training-free multi-view long-term memory (LTM) perception layer with a dual +> short-term memory model. Implemented in PR #3592. Controlled by +> `[memory.memcot] enabled` (default: false). + +## Sources + +### External +- **MemCoT: Test-Time Memory Scaling via Memory Chain-of-Thought** (arXiv:2604.08216, 2026) — + Zoom-In/Zoom-Out dual-view retrieval + SemanticStateAccumulator; GPT-4o-mini F1 = 58.03 + on LoCoMo vs ~30 baseline. + +### Internal + +| File | Contents | +|---|---| +| `crates/zeph-memory/src/memcot/mod.rs` | Module root; `MemCotRecall` entry point | +| `crates/zeph-memory/src/memcot/accumulator.rs` | `SemanticStateAccumulator` — per-turn state tracking | +| `crates/zeph-memory/src/memcot/zoom.rs` | `zoom_in()` and `zoom_out()` retrieval views | +| `crates/zeph-memory/src/memcot/config.rs` | `MemCotConfig` TOML bindings | + +--- + +## 1. Overview + +### Problem Statement + +Standard semantic recall returns the top-K most similar past memories regardless of +the current reasoning state. Two failure modes: + +1. **Evidence fragmentation**: A question like "what was the outcome of the call with Alice + last Tuesday?" requires locating the specific call record (evidence localization). + Top-K cosine recall returns similar but different calls, burying the relevant one. +2. **Missing causal context**: Understanding why something happened requires expanding + from the specific fact to its surrounding causal/temporal neighborhood. Top-K returns + isolated facts without causal chain. 
+ +### Goal + +Augment the existing `SemanticMemory::recall` pipeline with two complementary retrieval +views and a per-turn semantic state tracker: + +- **Zoom-In**: narrows the query to localize specific evidence within the APEX-MEM resolved + edge set and the conversation history. Prioritizes precision over coverage. +- **Zoom-Out**: expands the query to causal/contextual neighbors of recalled facts. + Prioritizes coverage to surface why/how chains. +- **`SemanticStateAccumulator`**: maintains a rolling semantic state across the session — + a compressed representation of what the agent "knows so far" that biases recall queries. + +### Out of Scope + +- Modifications to the APEX-MEM write path (MemCoT operates above the edge-resolution layer) +- Training or fine-tuning any model +- Changes to the Qdrant schema or SQLite schema +- Multi-session accumulator persistence across process restarts (accumulator is in-memory only) + +--- + +## 2. Functional Requirements + +| ID | Requirement | Priority | +|----|------------|----------| +| FR-MC-001 | WHEN `memory.memcot.enabled = false` THE SYSTEM SHALL behave identically to pre-MemCoT recall — no code paths activated, no allocations beyond the disabled-check | must | +| FR-MC-002 | WHEN MemCoT is enabled THE SYSTEM SHALL run Zoom-In and Zoom-Out as two parallel recall passes via `FuturesUnordered`, merging results before injecting into context | must | +| FR-MC-003 | Zoom-In SHALL re-rank the top-K recall results by an evidence-localization score derived from the current user query and the accumulated semantic state | must | +| FR-MC-004 | Zoom-Out SHALL expand each Zoom-In result to its K-nearest Qdrant neighbors and include neighbors not already in the Zoom-In set | must | +| FR-MC-005 | WHEN the token budget for the Zoom-Out expansion exceeds `memcot_budget_tokens` THE SYSTEM SHALL truncate the expansion set (lowest-score items removed first) | must | +| FR-MC-006 | `SemanticStateAccumulator` SHALL be updated at the 
end of every agent turn with a compressed representation of the assistant's response | must | +| FR-MC-007 | The accumulator update SHALL use a provider specified by `memcot_provider` (or the primary provider if empty); the update MUST be fire-and-forget (not block the turn) | must | +| FR-MC-008 | WHEN the accumulator has no state (first turn) THE SYSTEM SHALL fall back to standard recall for that turn | must | +| FR-MC-009 | WHEN `memcot_provider` returns an error during accumulator update THE SYSTEM SHALL log at `WARN` and retain the previous accumulator state — updates are best-effort | must | +| FR-MC-010 | WHEN the assembler collects context slots THE SYSTEM SHALL inject MemCoT recall results into the `semantic_recall` slot, replacing (not appending to) standard recall when MemCoT is enabled | must | + +--- + +## 3. Component Design + +### SemanticStateAccumulator + +Maintains a compressed representation of the agent's current understanding. + +```rust +pub struct SemanticStateAccumulator { + /// Compressed state embedding or summary string. + state: Option<SemanticState>, + cfg: MemCotConfig, +} + +pub enum SemanticState { + /// Raw text summary of the agent's current understanding. + TextSummary(String), +} + +impl SemanticStateAccumulator { + /// Called once per turn after the assistant response is finalized. + /// Fire-and-forget: spawns a background task via TaskSupervisor. + pub fn update_async(&mut self, turn_text: &str, provider: &AnyProvider); + + /// Snapshot used by Zoom-In to bias the recall query. + pub fn snapshot(&self) -> Option<&SemanticState>; +} +``` + +The accumulator update prompt compresses the current turn text + prior state into a +short (≤ 256 tokens) semantic state summary. This summary biases the Zoom-In +re-ranking step.
+ +### Zoom-In Retrieval + +Zoom-In takes the standard top-K Qdrant recall results and re-ranks them: + +``` +score(m) = α × cosine(query, m) + β × cosine(accumulator_state, m) +``` + +where `α + β = 1.0` (configurable; defaults `α = 0.7`, `β = 0.3`). + +The result is the top-N re-ranked messages. This biases recall toward memories +consistent with the current semantic state, not just the raw query. + +### Zoom-Out Expansion + +Zoom-Out expands each Zoom-In result `m_i`: + +1. Find the K-nearest Qdrant neighbors of `m_i` (configurable `zoom_out_k`, default 3) +2. Include neighbors not already in the Zoom-In set +3. Score neighbors as `cosine(query, neighbor)` (no state bias — pure topical expansion) +4. Merge: Zoom-In results ranked first, Zoom-Out results appended, total token budget capped + +--- + +## 4. Config + +```toml +[memory.memcot] +enabled = false # default off; opt-in +memcot_provider = "" # [[llm.providers]] name for accumulator updates; empty = primary +zoom_in_alpha = 0.7 # weight for query similarity in re-ranking +zoom_in_beta = 0.3 # weight for semantic state similarity in re-ranking +zoom_out_k = 3 # neighbors to expand per Zoom-In result +memcot_budget_tokens = 512 # token cap on the full MemCoT recall slot +``` + +--- + +## 5. Key Invariants + +- **Disabled = zero overhead.** When `enabled = false`, `MemCotRecall` is never constructed and no allocations are made for accumulator or zoom passes. +- **Accumulator updates are fire-and-forget.** They MUST NOT block the agent turn response path. +- **Zoom-In and Zoom-Out run in parallel.** Both passes use `FuturesUnordered`; the merge step waits for both. +- **MemCoT replaces, not appends.** When enabled, MemCoT results replace the standard semantic recall slot; the two paths are mutually exclusive per turn. 
+- **Accumulator failure is non-fatal.** An error during the accumulator update retains the previous state; the system degrades to biasing recall with the prior state rather than the current turn. +- **NEVER inject Zoom-Out neighbors that exceed the token budget.** Truncation is mandatory when the expanded set would overflow `memcot_budget_tokens`. +- **NEVER persist the accumulator state across process restarts.** It is session-scoped and in-memory only. + +--- + +## 6. Edge Cases and Error Handling + +| Scenario | Expected Behavior | +|----------|-------------------| +| First turn (accumulator empty) | Fall back to standard recall; no Zoom-In re-ranking bias applied | +| `memcot_provider` unavailable at startup | Log `WARN`; MemCoT enabled but accumulator updates disabled; Zoom-In runs with `β = 0` (query-only) | +| Zoom-Out expansion returns 0 neighbors for all Zoom-In results | Return Zoom-In results only; no expansion; log at `DEBUG` | +| Zoom-In returns 0 results (empty recall) | Skip Zoom-Out; return empty slot; standard recall fallback is NOT triggered (MemCoT slot stays `None`) | +| Budget exhausted after Zoom-In alone | Truncate Zoom-In results to budget; skip Zoom-Out entirely | +| Qdrant unavailable mid-recall | Propagate error; ContextAssembler treats slot as `None` (graceful degradation per FR-005 in 021-zeph-context) | + +--- + +## 7. 
Acceptance Criteria + +- `cargo nextest run -p zeph-memory -E 'test(memcot)'` passes +- A session with `enabled = true` and a multi-turn conversation: accumulator state is non-empty after turn 1; Zoom-In scores differ from raw cosine ordering; Zoom-Out set contains at least one neighbor not in Zoom-In for typical queries +- A session with `enabled = false`: no `MemCotRecall` allocations appear in the trace; `semantic_recall` slot is filled by the standard path +- Budget overflow test: Zoom-Out expansion that would exceed `memcot_budget_tokens` is truncated; total slot token count ≤ budget +- Accumulator update error: provider returns `Err`; accumulator retains previous state; next turn's Zoom-In uses the prior state without panicking + +--- + +## 8. See Also + +- [[004-memory/spec]] — parent memory spec +- [[004-7-memory-apex-magma]] — APEX-MEM (MemCoT operates above the edge-resolution layer) +- [[012-graph-memory/spec]] — SYNAPSE spreading activation (Zoom-Out neighbor traversal is complementary) +- [[021-zeph-context/spec]] — `ContextAssembler` slot model +- [[024-multi-model-design/spec]] — `memcot_provider` tier guidance +- [[MOC-specs]] — all specifications diff --git a/specs/004-memory/004-7-memory-apex-magma.md b/specs/004-memory/004-7-memory-apex-magma.md index e2c46ee9a..2fadcb41d 100644 --- a/specs/004-memory/004-7-memory-apex-magma.md +++ b/specs/004-memory/004-7-memory-apex-magma.md @@ -457,14 +457,58 @@ AND the head remains the head (valid_to untouched) --- -## 12. Open Questions +## 12. Implementation Notes (Post-Landing) + +### insert_or_supersede Unique Index Constraint (#3639) + +The write path for `insert_or_supersede` previously could hit a UNIQUE constraint +violation on `uq_graph_edges_active_head` when two concurrent extraction tasks raced +to write the same `(source_entity_id, target_entity_id, canonical_relation, edge_type)` +tuple without the first write having completed its `valid_to` closure. 
+ +**Resolution**: The supersede transaction now uses `INSERT OR REPLACE` on the +`edge_reassertions` table for byte-identical writes (FR-015), and the main edge +insert uses an explicit per-entity `SAVEPOINT` guard so a constraint violation from a +concurrent writer triggers a retry-after-reload rather than propagating upward. +The partial unique index `uq_graph_edges_active_head` remains the enforcement +mechanism; the write path is now MVCC-safe under SQLite WAL mode. + +**Key invariant added**: `insert_or_supersede` MUST be retried (with exponential +backoff, max 3 attempts) on `SQLITE_CONSTRAINT_UNIQUE` before surfacing as +`GraphError`; the constraint violation indicates a concurrent write that already +advanced the head. + +### extract_provider Bypass for QualityGate (#3615) + +The `quality_gate_provider` in `[memory.graph]` controls post-write scoring. +LLM-assisted entity extraction (ontology normalization, conflict resolution) uses +a separate `extract_provider` so the quality gate can be bypassed for the extraction +path itself. This prevents the quality gate from gating its own scorer — the gate +only applies to user-generated writes, not to extraction-originated edges. + +```toml +[memory.graph] +extract_provider = "fast" # provider for entity extraction LLM calls +quality_gate_provider = "fast" # provider for quality gate scoring (empty = disable) +``` + +When `quality_gate_provider` is empty, the gate is disabled. When `extract_provider` +is empty, it falls back to the primary provider. The two fields are independent. + +**Key invariant**: extraction-originated writes MUST bypass the quality gate, not +flow through it. The quality gate applies only to writes that originate from user +memory commands or external memory injection. + +--- + +## 13. 
Open Questions > [!question] > - **Cardinality model for multi-valued predicates**: FR-008 gates conflict resolution on `cardinality = 1` predicates, but the ontology table (§5.3) currently has no explicit `cardinality` column — it is implied for known predicates and defaults to `n` (multi-valued) for unknown ones. FR-014 adds the `cardinality` field to ontology entries, but FR-008 must explicitly document how SYNAPSE distinguishes single-value predicates (e.g., `works_at`) from intrinsically multi-valued ones (e.g., `owns`) before invariant tests for conflict resolution are written. The distinction must be mechanical, not inferred from predicate name. --- -## 13. See Also +## 14. See Also - [[constitution]] — project principles - [[004-memory/spec]] — memory pipeline @@ -476,21 +520,22 @@ AND the head remains the head (valid_to untouched) --- -## 14. Research Backlog +## 15. Research Backlog Research findings pending implementation review. Each entry links to the originating tracking issue and proposes a concrete integration point. ### 14.1 MemCoT — Test-Time Memory Scaling (arXiv:2604.08216) **Tracking issue**: #3564 -**Status**: Researched / pending implementation (P3) +**Status**: Implemented (#3592) — see [[004-13-memory-memcot]] for the full sub-spec. -MemCoT introduces a training-free multi-view LTM perception layer (Zoom-In for evidence localization, Zoom-Out for causal context expansion) and a task-conditioned dual short-term memory (semantic state + episodic trajectory). Benchmarked on LoCoMo: GPT-4o-mini F1 = 58.03 vs ~30 baseline. +MemCoT introduces a training-free multi-view LTM perception layer (Zoom-In for evidence localization, Zoom-Out for causal context expansion) and a task-conditioned dual short-term memory (`SemanticStateAccumulator` and episodic trajectory). Benchmarked on LoCoMo: GPT-4o-mini F1 = 58.03 vs ~30 baseline. 
-**Proposed Zeph integration**: -- `SemanticStateAccumulator` in `TurnContext` (new follow-up issue filed) -- Zoom-In retrieval pass in `ReasoningMemory::recall` before spreading activation (new follow-up issue filed) -- Config: `memory.memcot.enabled` (default: false) +**Implemented Zeph integration**: +- `SemanticStateAccumulator` attached to `TurnContext` in `zeph-memory`; accumulates per-turn semantic state across the session +- Zoom-In recall view passes a narrowed query over the APEX-MEM resolved edge set to localize evidence +- Zoom-Out recall view expands the query to causal/contextual neighbors +- Config: `[memory.memcot] enabled` (default: false); provider references via `memcot_provider` **Relevance to APEX-MEM**: APEX-MEM canonicalizes facts at the edge layer. MemCoT's Zoom-In retrieval and semantic-state STM operate one layer above — on top of the resolved edge set returned by SYNAPSE. The two are complementary: APEX-MEM decides which edge wins; MemCoT decides how the winning edges are presented to the reasoning model. diff --git a/specs/006-tools/spec.md b/specs/006-tools/spec.md index c94071e17..2298a0b9c 100644 --- a/specs/006-tools/spec.md +++ b/specs/006-tools/spec.md @@ -244,6 +244,39 @@ An empty string falls back to the agent's primary provider. - `compress_context` is non-cacheable (side effects on context window) — must be in the non-cacheable set - NEVER call `compress_context` recursively from within a compress_context execution +## DynExecutor Confirmation Delegation (#3649, #3651) + +`DynExecutor` is the type-erased wrapper around dynamically-loaded tool executors +(e.g., from MCP servers registered after agent startup). Before these PRs, `DynExecutor` +implemented `requires_confirmation` with a hardcoded `false` default, meaning dynamically +loaded tools bypassed the confirmation gate even when the underlying executor required it. 
+ +### Fix + +`DynExecutor::requires_confirmation(&call)` now delegates to the inner executor's +`requires_confirmation` method via the `ErasedToolExecutor` vtable: + +```rust +impl ToolExecutor for DynExecutor { + fn requires_confirmation_erased(&self, call: &ToolCall) -> bool { + self.inner.requires_confirmation_erased(call) + } +} +``` + +The `SpeculationEngine` also uses `requires_confirmation_erased` to gate speculative +dispatch (FR-SE-005 in [[053-speculation-engine/spec]]). This fix ensures that a +`DynExecutor`-wrapped MCP tool that requires confirmation is never speculatively +dispatched. + +### Key Invariants + +- `DynExecutor::requires_confirmation_erased` MUST delegate to the inner executor — no hardcoded default +- The confirmation gate applies uniformly to all executors, including dynamically loaded ones +- NEVER special-case `DynExecutor` in the confirmation gate — it must behave identically to a statically-typed executor + +--- + ## Key Invariants - Blocklist check is unconditional — PermissionPolicy cannot bypass it diff --git a/specs/008-mcp/008-1-lifecycle.md b/specs/008-mcp/008-1-lifecycle.md index ea6df7d47..47f7e4e5b 100644 --- a/specs/008-mcp/008-1-lifecycle.md +++ b/specs/008-mcp/008-1-lifecycle.md @@ -180,7 +180,74 @@ async fn shutdown_server(&self, server_id: &str) -> Result<()> { } ``` -## Configuration +## Startup Auto-Retry with Exponential Backoff (#3578) + +MCP server startup is unreliable in practice: a server process may crash before +completing the `initialize` handshake, or a network MCP server may be temporarily +unavailable at agent start time. Without retry, a single failed server blocks agent +startup or silently reduces the tool catalog. + +### Retry Contract + +`McpManager::start_with_retry(config)` wraps `start_server()` in an exponential +backoff loop: + +``` +attempt 1: immediate +attempt 2: base_delay_ms (default 200 ms) +attempt 3: base_delay_ms × backoff_factor (default 2.0) +... 
+attempt N: min(base_delay_ms × backoff_factor^(N-2), max_delay_ms) +``` + +On exhaustion (all `max_startup_retries` attempts failed): + +- **`critical = false` servers**: log `ERROR`, skip server, agent starts without it. + The missing server's tools are absent from the catalog until a `/mcp reconnect` command is issued. +- **`critical = true` servers**: return `Err(McpError::CriticalServerStartFailed)`, + aborting agent startup. + +### Jitter + +Each backoff delay is jittered by `±25%` (uniform random) to prevent thundering herds +when multiple MCP servers restart simultaneously after a crash. + +### Tracing + +Each retry attempt emits a `tracing::warn!` with attempt number, server name, and +error. The initial failure emits `tracing::info!` (not warn — first attempt failure is +expected in slow-start environments). + +### Config + +```toml +[[mcp.servers]] +name = "local-tools" +command = "python3 /path/to/server.py" +stdio = "pipe" # or "pty" for terminal emulation +timeout_init_s = 10 +timeout_request_s = 30 +healthcheck_interval_s = 60 +critical = false # if true, startup failure aborts the agent +max_startup_retries = 3 # total attempts (1 initial + N-1 retries); 0 = no retry +startup_retry_base_delay_ms = 200 # base delay before first retry +startup_retry_max_delay_ms = 5000 # cap on exponential backoff +startup_retry_backoff_factor = 2.0 # multiplier applied per attempt + +# Environment scrubbing: keep only these vars +allow_env_vars = ["PATH", "HOME", "RUST_LOG"] +``` + +### Key Invariants + +- Retry delay is bounded by `startup_retry_max_delay_ms` — backoff cannot grow unbounded +- `critical = true` servers with `max_startup_retries = 0` abort startup on the first failure (no retry is attempted before aborting) + — with `max_startup_retries > 0`, critical servers are retried first and abort startup only once all attempts are exhausted +- NEVER silently swallow a critical server failure — `Err` must propagate to `McpManager::start_all` +- Jitter is applied on retries only, not on the initial attempt +- The TUI startup spinner shows
per-server retry status when `max_startup_retries > 0` + +## Configuration (Legacy) ```toml [[mcp.servers]] diff --git a/specs/009-orchestration/spec.md b/specs/009-orchestration/spec.md index fcf4b31a2..8551e87bf 100644 --- a/specs/009-orchestration/spec.md +++ b/specs/009-orchestration/spec.md @@ -110,10 +110,12 @@ Pending → Queued → Running → Completed ```toml [orchestration] -planner_provider = "quality" # references [[llm.providers]] name; empty = primary provider fallback +planner_provider = "quality" # references [[llm.providers]] name; empty = primary provider fallback +orchestrator_provider = "quality" # provider for the orchestrator's own LLM calls (aggregation, routing decisions) ``` -- `planner_provider: String` — provider name from `[[llm.providers]]`. Empty string means "use the agent's primary provider". +- `planner_provider: String` — provider name for goal decomposition. Empty string means "use the agent's primary provider". +- `orchestrator_provider: String` — provider name for `LlmAggregator` and `AgentRouter` LLM calls. Empty string means "use the agent's primary provider". If unset, defaults to `planner_provider`. - `planner_model` has been removed (dead field, cleaned up pre-v1.0.0). Config migration `migrate_planner_model_to_provider()` rewrites any existing `planner_model` key with a warning to use `planner_provider` instead. ### Provider selection rule @@ -121,8 +123,8 @@ planner_provider = "quality" # references [[llm.providers]] name; empty = prim Planning is a complex/expert task (goal decomposition requires reasoning about parallelism and dependencies) — route to a quality provider, not a fast/cheap one. 
``` -planner_provider = "quality" # correct: complex reasoning task -planner_provider = "fast" # acceptable only for simple, known-structure goals +planner_provider = "quality" # correct: complex reasoning task +orchestrator_provider = "quality" # aggregation and routing decisions benefit from quality reasoning ``` ### Key Invariants @@ -133,6 +135,52 @@ planner_provider = "fast" # acceptable only for simple, known-structure goal - `DagScheduler` is tick-based (not event-driven) — tick interval is configurable - Sub-agent results are merged by `LlmAggregator`, not concatenated — aggregation is an LLM call - `planner_provider` must resolve via the provider registry at runtime — never hardcode a model in `LlmPlanner` +- `orchestrator_provider` must resolve via the provider registry at runtime; fallback to `planner_provider`, then primary + +--- + +## AdmissionGate + +`AdmissionGate` (#3617) is a pre-planning filter that prevents low-quality, malformed, or +policy-violating goals from reaching `LlmPlanner`. It runs synchronously before any LLM +planning call. + +### Purpose + +Without an admission gate, `LlmPlanner` accepts any string as a goal and makes an +expensive LLM call to decompose it. Common failure modes: + +1. Empty or trivially short goals produce degenerate plans +2. Goals that include PII or injection attempts bypass VIGIL because the planner input + is not a tool call +3. 
Extremely long goals (>8 KB) can cause planning context overflow + +### Checks Performed (in order) + +| Check | Threshold | Error | +|-------|-----------|-------| +| Goal length (min) | < 10 characters → reject | `OrchestrationError::GoalTooShort` | +| Goal length (max) | > `max_goal_length` bytes → reject | `OrchestrationError::GoalTooLong` | +| PII detection | VIGIL regex scan on goal text → warn + redact | Logged; planning proceeds with redacted goal | +| Injection detection | `SecurityPatterns` scan → reject | `OrchestrationError::GoalInjectionDetected` | + +### Config + +```toml +[orchestration.admission] +enabled = true # default: enabled +max_goal_length = 8192 # bytes; 0 = no limit +pii_warn = true # log a warning when PII is detected in the goal +inject_reject = true # reject goals that trigger injection patterns +``` + +### Key Invariants + +- `AdmissionGate::check()` runs BEFORE any LLM call — no planning cost is incurred for rejected goals +- PII detection warns and redacts; it does not reject (goal may be valid but contain PII) +- Injection detection rejects immediately; no planning cost is incurred +- `enabled = false` bypasses all checks; the raw goal is forwarded to `LlmPlanner` unchanged +- NEVER surface the rejection reason as an LLM response — surface it as a user-facing error message through the channel --- diff --git a/specs/021-zeph-context/spec.md b/specs/021-zeph-context/spec.md index 556d74042..9c8a4d9a8 100644 --- a/specs/021-zeph-context/spec.md +++ b/specs/021-zeph-context/spec.md @@ -232,16 +232,76 @@ AND it returns false for results with meaningful content --- -## 9. Open Questions +## 9. TypedPage Compaction Integration (#3638) + +The `ContextAssembler` now includes a `TypedPageCompactor` pass that runs after the +parallel slot-gather and before the final `PreparedContext` is returned to `zeph-core`. +This integrates the ClawVM typed-page compaction model (see [[004-8-memory-typed-pages]]) +into the assembler pipeline. 
+ +### Integration Point + +``` +ContextAssembler::gather() + │ + ├── FuturesUnordered parallel fetch (all slots) + │ + ▼ +TypedPageCompactor::compact(slots, budget) ← NEW (feature-gated) + │ + ▼ +PreparedContext +``` + +The `TypedPageCompactor` receives the gathered slots and the +`BudgetAllocation`. For any slot that exceeds its budget allocation, the compactor +applies the per-type minimum-fidelity constraint (from [[004-8-memory-typed-pages]]) +instead of truncating uniformly. + +### Page Type Dispatch + +Each `ContextSlot` carries a `PageType` tag set during the gather step: + +| Slot | PageType | +|------|---------| +| `semantic_recall` | `SemanticFact` | +| `graph_facts` | `GraphFact` | +| `summaries` | `Summary` | +| `recent_history` | `ConversationTurn` | +| `doc_rag` / `cross_session` | `ToolOutput` | + +The `TypedPageCompactor` looks up the minimum-fidelity invariant for each `PageType` +and ensures the compacted slot satisfies it. If a slot cannot satisfy the invariant +within the budget, it is marked `CompactionWarning` and the caller (`zeph-core`) is +notified via a `CompactionFailureClass::MinimumFidelityViolation`. + +### Feature Gate + +TypedPage compaction is controlled by the `typed-pages` feature flag (default: off). +When the feature is off, `TypedPageCompactor` is a no-op pass that returns the input +slots unchanged. + +### Key Invariants + +- `TypedPageCompactor::compact()` is stateless and pure — no agent state mutated +- Compaction runs AFTER parallel fetch completes — it cannot delay individual fetches +- A `MinimumFidelityViolation` warning does NOT abort the turn — the slot is truncated + and the warning is logged at `WARN` level +- NEVER apply `TypedPageCompactor` when `typed-pages` feature is off + +--- + +## 10. Open Questions None. --- -## 10. See Also +## 11. 
See Also - [[constitution]] — project principles - [[001-system-invariants/spec]] — cross-cutting invariants - [[002-agent-loop/spec]] — agent loop that consumes this crate - [[004-memory/spec]] — memory stores queried by `ContextAssembler` +- [[004-8-memory-typed-pages]] — ClawVM typed-page compaction (integrated into assembler pipeline) - [[MOC-specs]] — all specifications diff --git a/specs/028-hooks/spec.md b/specs/028-hooks/spec.md index f168a0dc9..53f6fb954 100644 --- a/specs/028-hooks/spec.md +++ b/specs/028-hooks/spec.md @@ -187,6 +187,45 @@ working directory. --- +## Tracing Instrumentation and reload_config Propagation (#3628) + +### Tracing Spans + +`HookRunner::fire_hooks()` now wraps each hook execution in a `tracing::info_span!` +with the following attributes: + +```rust +info_span!("hooks.fire", event = event_name, hook_count = hooks.len()) +``` + +Each individual hook command invocation gets a child span: + +```rust +info_span!("hooks.command", command = hook.command) +``` + +This makes hook execution visible in local Chrome traces and OTLP spans. Previously, +hook fires were invisible in the trace. + +### reload_config Hook Propagation + +`Agent::reload_config()` — called when a `config reload` event arrives from the +`FileChangeWatcher` or the TUI — now propagates the reloaded hooks config to the +`HookRunner`. Before PR #3628, a live config reload updated all other subsystems but +left `HookRunner` with the stale hooks from startup. + +**Fix**: `reload_config` calls `hook_runner.replace_config(new_hooks_config)` after +validating the new config. `HookRunner::replace_config` is an atomic swap using +`arc_swap::ArcSwap`. 
+ +### Key Invariants + +- `HookRunner` MUST NOT cache `HooksConfig` as a plain field — it MUST use `ArcSwap` so `replace_config` is atomic and does not require a lock on the hook runner +- After `reload_config` completes, the NEXT hook fire uses the new config — in-flight hook commands continue under the config active at dispatch +- NEVER propagate a config that failed validation to `HookRunner` + +--- + ## Key Invariants - Hook commands execute with the blocked-command list applied — dangerous shell patterns are prevented @@ -199,6 +238,7 @@ working directory. - `permission_denied` hook fires when `RuntimeLayer::before_tool` short-circuits execution; `LayerDenial.reason` is propagated to `ZEPH_DENY_REASON` (#3310) - `turn_complete` is added to `HooksConfig` and `HooksConfig::is_empty()` check (#3327) - `type = "mcp_tool"` action requires MCP manager active; must fail gracefully per `fail_closed` setting when unavailable (#3293) +- `HookRunner` uses `ArcSwap` — live reload is atomic, no lock contention on hook dispatch - NEVER inject hook stdout into the agent's conversation context - NEVER run hooks with elevated privileges — they inherit the agent process permissions only - If `[hooks]` section is absent from config, all hook lists are empty and no hooks fire — zero-cost when unused diff --git a/specs/047-cli-modes/spec.md b/specs/047-cli-modes/spec.md index 5aaaf2d3f..b3adaaa4a 100644 --- a/specs/047-cli-modes/spec.md +++ b/specs/047-cli-modes/spec.md @@ -272,7 +272,68 @@ AND the summary is ≤ 512 tokens as measured by the embedding provider tokenize --- -## 10. Open Questions +## 10. `zeph project purge` Command (#3598) + +`zeph project purge` performs a full reset of all persisted project state for the +current working directory. It is a destructive, operator-only command with a mandatory +confirmation prompt. 
+ +### What Is Purged + +| State | Location | Action | +|-------|----------|--------| +| Conversation history | SQLite `messages` table | Deleted for the current project path | +| Memory embeddings | Qdrant collection for the project | Vectors deleted | +| Graph entities and edges | SQLite `entities`, `edges` tables | Deleted for the project | +| Tool audit log | `audit.jsonl` | Deleted | +| Summaries and compactions | SQLite `summaries` table | Deleted for the project | +| Plan history | SQLite `plans` table | Deleted for the project | +| Provider preference | SQLite `channel_preferences` | Deleted for the project | +| Code index | SQLite code-index tables | Deleted for the project | +| Debug dumps | `.local/debug/*` | Deleted if `--include-debug` is passed | + +Skills, vault secrets, and config files are **never** purged by this command. + +### Invocation + +``` +zeph project purge [--yes] [--include-debug] [--dry-run] +``` + +| Flag | Effect | +|------|--------| +| `--yes` / `-y` | Skip the confirmation prompt (matches global `-y` semantics) | +| `--include-debug` | Also delete debug dumps from `.local/debug/` | +| `--dry-run` | Print what would be purged without deleting anything | + +### Confirmation Prompt (default) + +``` +WARNING: This will permanently delete all conversation history, memory, and tool +audit logs for project at /path/to/project. This cannot be undone. + +Type the project name to confirm: +``` + +The user must type the project directory name (last path component) exactly to +proceed. This prevents accidental purge from a mis-typed command. + +### Key Invariants + +- Purge is scoped to the **current project** (resolved from `cwd`). It does NOT purge + other projects sharing the same SQLite database. +- Skills, vault secrets, and `config.toml` are NEVER touched. +- Qdrant vector deletion is best-effort — if Qdrant is unavailable, the SQLite + embedding references are still deleted and the command succeeds. 
+- `--dry-run` must not perform any write operations — read-only inspection only. +- NEVER auto-approve the confirmation prompt in scripts without `--yes` / `-y`. +- The audit log entry for the purge operation itself is written BEFORE the audit log + is deleted (so forensics can confirm when the purge occurred). +- Exit code 0 on success; exit code 1 on user cancellation; exit code 2 on error. + +--- + +## 11. Open Questions > [!question] > - **`/loop` count limit**: should `/loop` support a `--count N` variant that runs N @@ -284,7 +345,7 @@ AND the summary is ≤ 512 tokens as measured by the embedding provider tokenize --- -## 11. See Also +## 12. See Also - [[constitution]] — project principles - [[002-agent-loop/spec]] — turn lifecycle diff --git a/specs/051-gonka-gateway/spec.md b/specs/051-gonka-gateway/spec.md index b0d7a434e..e9024540b 100644 --- a/specs/051-gonka-gateway/spec.md +++ b/specs/051-gonka-gateway/spec.md @@ -10,7 +10,7 @@ tags: - providers - config created: 2026-05-05 -status: draft +status: implemented related: - "[[MOC-specs]]" - "[[001-system-invariants/spec]]" diff --git a/specs/052-gonka-native/spec.md b/specs/052-gonka-native/spec.md index 86ff787dd..98a1d2d04 100644 --- a/specs/052-gonka-native/spec.md +++ b/specs/052-gonka-native/spec.md @@ -11,7 +11,7 @@ tags: - security - contract created: 2026-05-05 -status: draft +status: implemented related: - "[[MOC-specs]]" - "[[001-system-invariants/spec]]" @@ -246,7 +246,8 @@ async fn send_signed_with_retry( |--------|-----------| | `chat` | Build body via inner `OpenAiProvider`; sign; send; decode | | `chat_stream` | As `chat` but request SSE stream; fall back to `chat` if unsupported | -| `chat_with_tools` | Build tools-enabled body; sign; send; decode tool call response | +| `chat_with_tools` | Build tools-enabled body (OpenAI tools format); sign; send; decode tool call response including `tool_calls` array | +| `chat_typed` | Typed structured output variant of `chat`; uses OpenAI response 
format `json_schema`; sign; send; decode typed response | | `embed` | Return `LlmError::Unsupported` — gonka network is text-generation-only at launch | | `supports_streaming` | `true` | | `supports_embeddings` | `false` | @@ -256,6 +257,11 @@ async fn send_signed_with_retry( | `name` | Provider name from config (e.g., `"gonka"`) | | `last_usage` | Parsed from `usage` field in OpenAI-format response | +> [!note] +> `chat_with_tools` and `chat_typed` were wired in PRs #3612 and #3624 respectively. +> Both delegate body construction to the inner `OpenAiProvider` and replace only the +> signing + transport step — the OpenAI request/response schema is reused verbatim. + --- ## 8. Config Schema Additions diff --git a/specs/README.md b/specs/README.md index 6a7484bb6..716f8a8f8 100644 --- a/specs/README.md +++ b/specs/README.md @@ -40,8 +40,9 @@ Spec IDs (001–044) follow a logical grouping: - **048**: SLM cost metrics survey and CPS metric contract - **049**: Agent god-object decomposition (Services aggregator + AgentRuntime newtype) - **050**: Security capability governance (tool scoping + trajectory sentinel + CapSeal sketch) -- **051**: Gonka.ai Phase 1 — GonkaGate hosted gateway (CompatibleProvider, wizard, vault key) -- **052**: Gonka.ai Phase 2 — native network transport (GonkaProvider, ECDSA signing, EndpointPool) +- **051**: Gonka.ai Phase 1 — GonkaGate hosted gateway (CompatibleProvider, wizard, vault key) [implemented] +- **052**: Gonka.ai Phase 2 — native network transport (GonkaProvider, ECDSA signing, EndpointPool, chat_with_tools, chat_typed) [implemented] +- **053**: SpeculationEngine — speculative tool execution (SSE decoding path, PASTE skill activation, ToolStartEvent{speculative:true}) --- @@ -72,6 +73,7 @@ Spec IDs (001–044) follow a logical grouping: | `004-memory/004-10-memory-memmachine-retrieval.md` | MemMachine retrieval-depth-first memory: retrieval depth config, search prompt templates, query bias correction, episode preservation (#3325) | 
`zeph-memory` | | `004-memory/004-11-memory-hela-mem.md` | HeLa-Mem Hebbian learning: edge weight reinforcement, periodic consolidation, spreading activation retrieval (#3324) | `zeph-memory` | | `004-memory/004-12-memory-reasoning-bank.md` | ReasoningBank: self-judge + distillation pipeline, strategy embedding store, context preamble injection (#3312) | `zeph-memory`, `zeph-core` | +| `004-memory/004-13-memory-memcot.md` | MemCoT: SemanticStateAccumulator, Zoom-In evidence localization, Zoom-Out causal expansion (#3592) | `zeph-memory` | | `005-skills/spec.md` | SKILL.md format, registry, matching, hot-reload, skill trust governance, two-stage matching, Wilson score confidence intervals, hub install pipeline, agent-invocable skills (`invoke_skill`) | `zeph-skills` | | `006-tools/spec.md` | ToolExecutor, CompositeExecutor, TAFC, schema filter, result cache, dependency graph, tool invocation phase taxonomy, native `tool_use` only; `invoke_skill`/`load_skill` utility-gate exemption | `zeph-tools` | | `007-channels/spec.md` | Channel trait, AnyChannel dispatch, streaming, channel feature parity | `zeph-channels` | @@ -124,4 +126,5 @@ Spec IDs (001–044) follow a logical grouping: | `049-agent-decomposition/spec.md` | Agent god-object Phase 2 (#3509): split `Agent` 25+ direct sub-state fields into `services: Services` (background subsystems) and `runtime: AgentRuntime` (config, lifecycle, providers, metrics, debug, instructions); pure refactor, no API change, separately borrowable; `TurnContext` boundary sketched for P2-prereq-3 | `zeph-core` | | `050-security-capability-governance/spec.md` | Capability scoping (`ScopedToolExecutor` + per-task-type allow-lists, #3563), `TrajectorySentinel` multi-turn risk accumulator with decay (#3570), and CapSeal/SUDP `VaultBroker::propose_operation` Phase-3 research sketch (#3569) | `zeph-tools`, `zeph-core` | | `051-gonka-gateway/spec.md` | Phase 1: gonka.ai inference via GonkaGate hosted gateway — zero new Rust code, 
`CompatibleProvider` reuse, wizard branch, vault key `ZEPH_COMPATIBLE_GONKAGATE_API_KEY` | `zeph-llm`, `zeph-config` | -| `052-gonka-native/spec.md` | Phase 2: native gonka network transport — `GonkaProvider`, ECDSA secp256k1 signing (`RequestSigner`), `EndpointPool` round-robin fail-skip, `send_signed_with_retry`, `zeph gonka doctor` | `zeph-llm`, `zeph-config` | +| `052-gonka-native/spec.md` | Phase 2: native gonka network transport — `GonkaProvider`, ECDSA secp256k1 signing (`RequestSigner`), `EndpointPool` round-robin fail-skip, `send_signed_with_retry`, `chat_with_tools`, `chat_typed`, `zeph gonka doctor` | `zeph-llm`, `zeph-config` | +| `053-speculation-engine/spec.md` | `SpeculationEngine` — speculative tool execution: `PartialJsonParser` SSE decoding path, PASTE skill activation, `try_dispatch`/`try_commit`/`end_turn` API, `ToolStartEvent{speculative:true}`, `DynExecutor` confirmation delegation | `zeph-core`, `zeph-tools` |