From 92e88528757a28259a167fd74c1c4187560f77ae Mon Sep 17 00:00:00 2001 From: ossirytk Date: Sun, 29 Mar 2026 08:42:41 +0300 Subject: [PATCH 1/2] Web ui work --- .github/copilot-instructions.md | 43 ++ AGENTS.MD | 43 ++ core/job_queue.py | 87 ++++ core/preset_profiles.py | 82 +++ core/rag_manager.py | 419 ++++++++++++++++ docs/future_work/COPILOT_COMPACT_REFERENCE.md | 8 +- docs/future_work/REFINEMENTS.md | 88 +++- docs/future_work/UI_REFINEMENTS.md | 90 +++- templates/diagnostics_panel.html | 37 ++ templates/index.html | 270 ++++++++++ templates/presets_panel.html | 40 ++ templates/rag/benchmark_results.html | 32 ++ templates/rag/collection_detail.html | 78 +++ templates/rag/collections_list.html | 43 ++ templates/rag/coverage_report.html | 40 ++ templates/rag/evaluate_index.html | 33 ++ templates/rag/evaluate_results.html | 51 ++ templates/rag/file_view.html | 6 + templates/rag/files_list.html | 61 +++ templates/rag/layout.html | 467 ++++++++++++++++++ templates/rag/lint_results.html | 49 ++ templates/rag/push_status.html | 34 ++ templates/rag/query_results.html | 22 + templates/rag/trends_table.html | 25 + web_app.py | 354 ++++++++++++- 25 files changed, 2483 insertions(+), 19 deletions(-) create mode 100644 core/job_queue.py create mode 100644 core/preset_profiles.py create mode 100644 core/rag_manager.py create mode 100644 templates/diagnostics_panel.html create mode 100644 templates/presets_panel.html create mode 100644 templates/rag/benchmark_results.html create mode 100644 templates/rag/collection_detail.html create mode 100644 templates/rag/collections_list.html create mode 100644 templates/rag/coverage_report.html create mode 100644 templates/rag/evaluate_index.html create mode 100644 templates/rag/evaluate_results.html create mode 100644 templates/rag/file_view.html create mode 100644 templates/rag/files_list.html create mode 100644 templates/rag/layout.html create mode 100644 templates/rag/lint_results.html create mode 100644 
templates/rag/push_status.html create mode 100644 templates/rag/query_results.html create mode 100644 templates/rag/trends_table.html diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index d71b265..8aa2c59 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -25,6 +25,49 @@ All terminal commands should be reproducible from the supported shell/editor com --- +## 0.1 Available CLI Tools + +The following tools are installed locally and available for use in terminal workflows and agent tasks: + +| Tool | Purpose | +|------|---------| +| `diffutils` | File comparison (`diff`, `cmp`, `diff3`, `sdiff`) | +| `fd` | Fast, user-friendly alternative to `find` for file search | +| `fzf` | General-purpose fuzzy finder for interactive filtering | +| `ripgrep` (`rg`) | Fast regex search across files; prefer over `grep`/`Select-String` | +| `zip` | Archive creation and extraction | +| `tokei` | Count lines of code by language | +| `ast-grep` (`sg`) | Structural code search and rewriting using AST patterns | +| `jq` | JSON query and transformation CLI | +| `yq` | YAML/JSON/TOML query and transformation CLI | +| `hyperfine` | Command-line benchmarking with statistical output | +| `pre-commit` | Run and manage repository pre-commit hooks | +| `http` / `https` (HTTPie) | Human-friendly HTTP API client | +| `just` | Project task runner via `justfile` recipes | +| `difft` (difftastic) | Syntax-aware structural diffing | + +Prefer these tools over PowerShell built-ins where applicable (e.g., use `rg` instead of `Select-String`, use `fd` instead of `Get-ChildItem` for file discovery). 
+ +### Preferred command order + +- Content search: `rg` first, then `ast-grep` for structural/language-aware matching +- File discovery: `fd` first, then `rg --files` as a fallback +- JSON config inspection: `jq` +- YAML/TOML inspection: `yq` +- HTTP/API smoke checks: `http` / `https` (HTTPie) +- Task orchestration: `just` recipes when a `justfile` exists +- Diff/review: `difft` for syntax-aware diffs, `diff` for plain text diffs +- Performance comparisons: `hyperfine` for repeatable timing + +### Avoid in autonomous runs + +- Avoid interactive-only flows (for example `fzf` prompts) unless the user explicitly asks for interactive selection +- Avoid destructive git/file operations unless the user explicitly approves them +- Avoid long-running watch commands by default; use one-shot checks first, then switch to watch mode only when requested +- Avoid invoking `pre-commit run --all-files` on very large repos when a targeted path or hook is enough for the task + +--- + ## 1. Authoritative Tools & Source of Truth ### Python diff --git a/AGENTS.MD b/AGENTS.MD index 9ffdbbc..942a979 100644 --- a/AGENTS.MD +++ b/AGENTS.MD @@ -25,6 +25,49 @@ All terminal commands should be reproducible from the supported shell/editor com --- +## 0.1 Available CLI Tools + +The following tools are installed locally and available for use in terminal workflows and agent tasks: + +| Tool | Purpose | +|------|---------| +| `diffutils` | File comparison (`diff`, `cmp`, `diff3`, `sdiff`) | +| `fd` | Fast, user-friendly alternative to `find` for file search | +| `fzf` | General-purpose fuzzy finder for interactive filtering | +| `ripgrep` (`rg`) | Fast regex search across files; prefer over `grep`/`Select-String` | +| `zip` | Archive creation and extraction | +| `tokei` | Count lines of code by language | +| `ast-grep` (`sg`) | Structural code search and rewriting using AST patterns | +| `jq` | JSON query and transformation CLI | +| `yq` | YAML/JSON/TOML query and transformation CLI | +| 
`hyperfine` | Command-line benchmarking with statistical output | +| `pre-commit` | Run and manage repository pre-commit hooks | +| `http` / `https` (HTTPie) | Human-friendly HTTP API client | +| `just` | Project task runner via `justfile` recipes | +| `difft` (difftastic) | Syntax-aware structural diffing | + +Prefer these tools over PowerShell built-ins where applicable (e.g., use `rg` instead of `Select-String`, use `fd` instead of `Get-ChildItem` for file discovery). + +### Preferred command order + +- Content search: `rg` first, then `ast-grep` for structural/language-aware matching +- File discovery: `fd` first, then `rg --files` as a fallback +- JSON config inspection: `jq` +- YAML/TOML inspection: `yq` +- HTTP/API smoke checks: `http` / `https` (HTTPie) +- Task orchestration: `just` recipes when a `justfile` exists +- Diff/review: `difft` for syntax-aware diffs, `diff` for plain text diffs +- Performance comparisons: `hyperfine` for repeatable timing + +### Avoid in autonomous runs + +- Avoid interactive-only flows (for example `fzf` prompts) unless the user explicitly asks for interactive selection +- Avoid destructive git/file operations unless the user explicitly approves them +- Avoid long-running watch commands by default; use one-shot checks first, then switch to watch mode only when requested +- Avoid invoking `pre-commit run --all-files` on very large repos when a targeted path or hook is enough for the task + +--- + ## 1. Authoritative Tools & Source of Truth ### Python diff --git a/core/job_queue.py b/core/job_queue.py new file mode 100644 index 0000000..0020c51 --- /dev/null +++ b/core/job_queue.py @@ -0,0 +1,87 @@ +"""Simple in-memory job store for long-running RAG web operations. + +Jobs run in background threads. Route handlers poll for status via HTMX +(`hx-trigger="every 2s"`). The job status endpoint stops including the +polling trigger once the job reaches a terminal state (done or error). 
+""" + +from __future__ import annotations + +import threading +import time +import uuid +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import Callable + + +class Job: + """In-memory representation of a background job.""" + + __slots__ = ("error", "finished_at", "id", "result", "started_at", "status") + + def __init__(self, job_id: str) -> None: + self.id = job_id + self.status: str = "pending" + self.result: Any = None + self.error: str | None = None + self.started_at: float = time.monotonic() + self.finished_at: float | None = None + + def to_dict(self) -> dict[str, Any]: + elapsed = round((self.finished_at or time.monotonic()) - self.started_at, 2) + return { + "id": self.id, + "status": self.status, + "result": self.result, + "error": self.error, + "elapsed_s": elapsed, + } + + +class JobStore: + """Thread-safe store for background jobs.""" + + MAX_JOBS: int = 50 + + def __init__(self) -> None: + self._jobs: dict[str, Job] = {} + self._lock = threading.Lock() + + def submit(self, fn: Callable[..., Any], *args: object, **kwargs: object) -> str: + """Submit a callable as a background job; returns a job_id immediately.""" + job_id = uuid.uuid4().hex[:12] + job = Job(job_id) + with self._lock: + self._jobs[job_id] = job + self._evict_old() + + def _run() -> None: + job.status = "running" + try: + job.result = fn(*args, **kwargs) + job.status = "done" + except Exception as exc: + job.error = str(exc) + job.status = "error" + finally: + job.finished_at = time.monotonic() + + threading.Thread(target=_run, daemon=True).start() + return job_id + + def get(self, job_id: str) -> dict[str, Any] | None: + """Return job state dict, or None if job_id is unknown.""" + with self._lock: + job = self._jobs.get(job_id) + return job.to_dict() if job else None + + def _evict_old(self) -> None: + """Remove oldest finished jobs when over the cap (called under lock).""" + if len(self._jobs) <= self.MAX_JOBS: + return + finished = [j for j in 
self._jobs.values() if j.status in {"done", "error"}] + finished.sort(key=lambda j: j.finished_at or 0) + for j in finished[: len(self._jobs) - self.MAX_JOBS]: + del self._jobs[j.id] diff --git a/core/preset_profiles.py b/core/preset_profiles.py new file mode 100644 index 0000000..f7e6db9 --- /dev/null +++ b/core/preset_profiles.py @@ -0,0 +1,82 @@ +"""Saveable preset profiles for runtime retrieval settings.""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + + from core.config import ConversationRuntimeConfig + +PROFILE_FIELDS: list[str] = [ + "use_mmr", + "rag_rerank_enabled", + "rag_sentence_compression_enabled", + "rag_multi_query_enabled", + "rag_k", + "rag_k_mes", + "debug_context", +] + + +class ProfileStore: + """Persist and apply named retrieval-setting presets stored in a JSON file.""" + + def __init__(self, path: Path) -> None: + self._path = path + + def _load(self) -> dict[str, dict[str, object]]: + if not self._path.exists(): + return {} + try: + data = json.loads(self._path.read_text(encoding="utf-8")) + return data if isinstance(data, dict) else {} + except Exception: + return {} + + def _save(self, data: dict[str, dict[str, object]]) -> None: + self._path.parent.mkdir(parents=True, exist_ok=True) + self._path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8") + + def list_profiles(self) -> list[str]: + """Return sorted list of saved profile names.""" + return sorted(self._load().keys()) + + def save_profile(self, name: str, config: ConversationRuntimeConfig) -> None: + """Snapshot the profile-eligible fields from *config* under *name*.""" + data = self._load() + data[name] = {field: getattr(config, field) for field in PROFILE_FIELDS} + self._save(data) + + def get_profile(self, name: str) -> dict[str, object]: + """Return the stored settings dict for *name*.""" + data = self._load() + if name not in data: + msg = f"Profile {name!r} not 
found" + raise KeyError(msg) + return dict(data[name]) + + def apply_profile(self, name: str, config: ConversationRuntimeConfig) -> list[str]: + """Write profile values onto *config* in place; return list of changed field names.""" + profile = self.get_profile(name) + changed: list[str] = [] + for field, value in profile.items(): + if field not in PROFILE_FIELDS: + continue + current = getattr(config, field, None) + if current != value: + setattr(config, field, value) + changed.append(field) + return changed + + def delete_profile(self, name: str) -> None: + """Remove *name* from the store (no-op if not found).""" + data = self._load() + data.pop(name, None) + self._save(data) + + def current_values(self, config: ConversationRuntimeConfig) -> dict[str, object]: + """Return current values of the profile-eligible fields from *config*.""" + return {field: getattr(config, field) for field in PROFILE_FIELDS} diff --git a/core/rag_manager.py b/core/rag_manager.py new file mode 100644 index 0000000..8a6c95c --- /dev/null +++ b/core/rag_manager.py @@ -0,0 +1,419 @@ +"""Thin façade over scripts/rag/* for the RAG Management web UI. + +All functions are synchronous and designed to be called from route handlers +via asyncio.to_thread. None of these functions load the LLM model. 
+""" + +from __future__ import annotations + +import csv +import json +import time +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import chromadb +from chromadb.config import Settings + +if TYPE_CHECKING: + from core.config import RagScriptConfig + + +def _chroma_client(persist_dir: str) -> chromadb.PersistentClient: + return chromadb.PersistentClient( + path=persist_dir, + settings=Settings(anonymized_telemetry=False), + ) + + +# --------------------------------------------------------------------------- +# Collections +# --------------------------------------------------------------------------- + + +def list_collections(config: RagScriptConfig) -> list[dict[str, Any]]: + """List all ChromaDB collections with counts and fingerprint metadata.""" + client = _chroma_client(config.persist_directory) + results: list[dict[str, Any]] = [] + for col in client.list_collections(): + try: + count = col.count() + except Exception: + count = None + meta = col.metadata or {} + results.append( + { + "name": col.name, + "count": count, + "embedding_model": meta.get("embedding:model", ""), + "embedding_dimension": meta.get("embedding:dimension", ""), + "embedding_normalize": meta.get("embedding:normalize", ""), + } + ) + results.sort(key=lambda c: c["name"]) + return results + + +def collection_info(config: RagScriptConfig, name: str) -> dict[str, Any] | None: + """Return detailed info for a single collection, or None if not found.""" + client = _chroma_client(config.persist_directory) + try: + col = client.get_collection(name) + except Exception: + return None + try: + count = col.count() + except Exception: + count = 0 + meta = col.metadata or {} + try: + sample = col.peek(limit=5) + sample_docs = [ + {"id": id_, "text": (doc or "")[:200], "metadata": m} + for id_, doc, m in zip( + sample.get("ids", []), + sample.get("documents", []) or [], + sample.get("metadatas", []) or [], + strict=False, + ) + ] + except Exception: + sample_docs = [] + return { + 
"name": name, + "count": count, + "metadata": meta, + "embedding_model": meta.get("embedding:model", ""), + "embedding_dimension": meta.get("embedding:dimension", ""), + "embedding_normalize": meta.get("embedding:normalize", ""), + "sample_docs": sample_docs, + } + + +def delete_collection(config: RagScriptConfig, name: str) -> None: + """Delete a ChromaDB collection by name.""" + client = _chroma_client(config.persist_directory) + client.delete_collection(name) + + +def query_collection( + config: RagScriptConfig, + name: str, + query: str, + k: int = 5, +) -> list[dict[str, Any]]: + """Run ad-hoc similarity search. Returns top-k chunks with scores.""" + from langchain_chroma import Chroma # noqa: PLC0415 + from langchain_huggingface import HuggingFaceEmbeddings # noqa: PLC0415 + + embedder = HuggingFaceEmbeddings( + model_name=config.embedding_model, + model_kwargs={"device": config.embedding_device}, + encode_kwargs={"normalize_embeddings": True}, + cache_folder=config.embedding_cache, + ) + client = _chroma_client(config.persist_directory) + db = Chroma( + client=client, + collection_name=name, + embedding_function=embedder, + ) + results = db.similarity_search_with_score(query, k=k) + return [ + { + "rank": i + 1, + "text": doc.page_content, + "score": round(float(score), 4), + "metadata": doc.metadata, + } + for i, (doc, score) in enumerate(results) + ] + + +def backfill_fingerprint(config: RagScriptConfig, name: str) -> dict[str, Any]: + """Write embedding fingerprint metadata onto an existing collection.""" + from langchain_huggingface import HuggingFaceEmbeddings # noqa: PLC0415 + + from scripts.rag.manage_collections_core_collection import ( # noqa: PLC0415 + build_embedding_fingerprint, + infer_embedding_dimension, + ) + + embedder = HuggingFaceEmbeddings( + model_name=config.embedding_model, + model_kwargs={"device": config.embedding_device}, + encode_kwargs={"normalize_embeddings": True}, + cache_folder=config.embedding_cache, + ) + dimension = 
infer_embedding_dimension(embedder) + fingerprint = build_embedding_fingerprint( + embedding_model=config.embedding_model, + normalize_embeddings=True, + embedding_dimension=dimension, + ) + client = _chroma_client(config.persist_directory) + col = client.get_collection(name) + existing_meta = col.metadata or {} + col.modify(metadata={**existing_meta, **fingerprint}) + return fingerprint + + +# --------------------------------------------------------------------------- +# RAG Data Files +# --------------------------------------------------------------------------- + + +def list_rag_files(config: RagScriptConfig) -> list[dict[str, Any]]: + """List .txt source files in rag_data/ with type classification.""" + rag_dir = Path(config.documents_directory) + if not rag_dir.exists(): + return [] + files: list[dict[str, Any]] = [] + for path in sorted(rag_dir.glob("*.txt")): + stem = path.stem + files.append( + { + "name": path.name, + "stem": stem, + "type": "message_examples" if stem.endswith("_message_examples") else "lore", + "size": path.stat().st_size, + "has_metadata": (rag_dir / f"{stem}.json").exists(), + } + ) + return files + + +def file_content(config: RagScriptConfig, filename: str) -> str | None: + """Return the text content of a rag_data file, guarding against path traversal.""" + rag_dir = Path(config.documents_directory).resolve() + candidate = (rag_dir / filename).resolve() + if not candidate.is_relative_to(rag_dir): + return None + if not candidate.exists() or not candidate.is_file(): + return None + if candidate.suffix not in {".txt", ".json"}: + return None + return candidate.read_text(encoding="utf-8") + + +# --------------------------------------------------------------------------- +# Linting +# --------------------------------------------------------------------------- + + +def run_lint(config: RagScriptConfig, *, auto_fix: bool = False) -> list[dict[str, Any]]: + """Lint all *_message_examples.txt files. 
Returns list of report dicts.""" + from scripts.rag.lint_message_examples import lint_file_path # noqa: PLC0415 + + rag_dir = Path(config.documents_directory) + reports: list[dict[str, Any]] = [] + for path in sorted(rag_dir.glob("*_message_examples.txt")): + report = lint_file_path(path, auto_fix=auto_fix) + reports.append( + { + "file": path.name, + "valid": report.valid, + "auto_fixed": report.auto_fixed, + "violations": [ + { + "line_no": v.line_no, + "rule_id": v.rule_id, + "message": v.message, + "severity": v.severity.value if hasattr(v.severity, "value") else str(v.severity), + "suggested_fix": v.suggested_fix, + } + for v in report.violations + ], + } + ) + return reports + + +# --------------------------------------------------------------------------- +# Coverage +# --------------------------------------------------------------------------- + + +def run_coverage(config: RagScriptConfig, stem: str) -> dict[str, Any] | None: + """Run coverage analysis for a character (lore + metadata pair).""" + from scripts.rag.analyze_rag_coverage import ( # noqa: PLC0415 + extract_coverage_metrics, + format_coverage_report, + load_metadata_file, + ) + + rag_dir = Path(config.documents_directory) + source_file = rag_dir / f"{stem}.txt" + metadata_file = rag_dir / f"{stem}.json" + if not source_file.exists() or not metadata_file.exists(): + return None + source_text = source_file.read_text(encoding="utf-8") + metadata_list = load_metadata_file(metadata_file) + metrics = extract_coverage_metrics(source_text, metadata_list) + report_text = format_coverage_report(metrics) + return { + "stem": stem, + "entities_count": metrics.entities_count, + "source_coverage_ratio": round(metrics.source_coverage_ratio, 4), + "total_source_chars": metrics.total_source_chars, + "covered_chars": metrics.covered_chars, + "unmapped_segments": metrics.unmapped_segments[:20], + "category_distribution": metrics.category_distribution, + "report_text": report_text, + } + + +# 
--------------------------------------------------------------------------- +# Fixture Evaluation +# --------------------------------------------------------------------------- + + +def list_fixture_packs(tests_dir: str = "tests/fixtures") -> list[str]: + """List available fixture JSON files.""" + fixture_dir = Path(tests_dir) + if not fixture_dir.exists(): + return [] + return sorted(p.name for p in fixture_dir.glob("*.json")) + + +def run_evaluate_fixtures( + config: RagScriptConfig, + fixture_file: str, + tests_dir: str = "tests/fixtures", +) -> dict[str, Any] | None: + """Run fixture evaluation in similarity mode. Returns metrics dict.""" + from scripts.rag.manage_collections_core_evaluation import _execute_fixture_evaluation # noqa: PLC0415 + from scripts.rag.manage_collections_core_types import FixtureEvalOptions # noqa: PLC0415 + + fixture_path = Path(tests_dir) / fixture_file + if not fixture_path.exists(): + return None + options = FixtureEvalOptions( + fixture_file=fixture_path, + k=None, + retrieval_mode="similarity", + persist_directory=config.persist_directory, + embedding_model=config.embedding_model, + embedding_device=config.embedding_device, + show_failures=False, + ) + run = _execute_fixture_evaluation(options) + return { + "fixture_file": fixture_file, + "default_k": run.default_k, + "skipped": run.skipped, + "metrics": run.metrics, + "case_results": [ + { + "case_id": c.case_id, + "rank": c.rank, + "status": c.status, + "query": c.query[:120], + "collection": c.collection, + "forbidden_hit": c.forbidden_hit, + "precision_at_k": round(c.precision_at_k, 4), + "average_precision_at_k": round(c.average_precision_at_k, 4), + "matched_expected": c.matched_expected, + "expected_total": c.expected_total, + } + for c in run.case_results + ], + } + + +def get_fixture_trends(logs_dir: str = "logs/retrieval_eval") -> list[dict[str, Any]]: + """Read retrieval evaluation trend history from CSV (newest first).""" + history_path = Path(logs_dir) / "history.csv" 
+ if not history_path.exists(): + return [] + rows: list[dict[str, Any]] = [] + try: + with history_path.open(encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + rows.extend(dict(row) for row in reader) + except Exception: + return [] + return list(reversed(rows)) + + +# --------------------------------------------------------------------------- +# Collection Push +# --------------------------------------------------------------------------- + + +def push_collection( + config: RagScriptConfig, + stem: str, + collection_name: str, + *, + overwrite: bool = True, +) -> dict[str, Any]: + """Chunk, enrich, and push a rag_data text file into a ChromaDB collection.""" + + from langchain_huggingface import HuggingFaceEmbeddings # noqa: PLC0415 + + from scripts.rag.push_rag_data import ( # noqa: PLC0415 + ProcessingContext, + PushConfig, + build_embedding_fingerprint, + enrich_documents_with_metadata, + infer_embedding_dimension, + load_and_chunk_text_file, + push_to_collection, + resolve_metadata_file, + ) + + rag_dir = Path(config.documents_directory) + file_path = rag_dir / f"{stem}.txt" + if not file_path.exists(): + msg = f"Source file not found: {file_path}" + raise FileNotFoundError(msg) + + embedder = HuggingFaceEmbeddings( + model_name=config.embedding_model, + model_kwargs={"device": config.embedding_device}, + encode_kwargs={"normalize_embeddings": True}, + cache_folder=config.embedding_cache, + ) + client = _chroma_client(config.persist_directory) + dimension = infer_embedding_dimension(embedder) + fingerprint = build_embedding_fingerprint( + embedding_model=config.embedding_model, + normalize_embeddings=True, + embedding_dimension=dimension, + ) + documents = load_and_chunk_text_file(file_path, config.chunk_size, config.chunk_overlap) + metadata_file = resolve_metadata_file(file_path, config.key_storage, None) + documents = enrich_documents_with_metadata(documents, metadata_file, config.threads) + + push_cfg = PushConfig( + 
persist_directory=config.persist_directory, + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + key_storage=config.key_storage, + threads=config.threads, + dry_run=False, + overwrite=overwrite, + ) + ctx = ProcessingContext(embedder=embedder, client=client) + t0 = time.monotonic() + push_to_collection(collection_name, documents, push_cfg, ctx, fingerprint) + elapsed = time.monotonic() - t0 + return { + "collection": collection_name, + "stem": stem, + "doc_count": len(documents), + "elapsed_s": round(elapsed, 2), + } + + +def get_benchmark_results(benchmark_dir: str = "logs/benchmark") -> dict[str, Any] | None: + """Load the most recent benchmark JSON from logs/benchmark/, if present.""" + benchmark_path = Path(benchmark_dir) / "last_benchmark.json" + if not benchmark_path.exists(): + return None + try: + with benchmark_path.open(encoding="utf-8") as f: + return json.load(f) + except Exception: + return None diff --git a/docs/future_work/COPILOT_COMPACT_REFERENCE.md b/docs/future_work/COPILOT_COMPACT_REFERENCE.md index 0db7cd0..8a8fbe7 100644 --- a/docs/future_work/COPILOT_COMPACT_REFERENCE.md +++ b/docs/future_work/COPILOT_COMPACT_REFERENCE.md @@ -1,6 +1,6 @@ # Copilot Compact Reference — Implemented State -Last verified: 2026-03-26 +Last verified: 2026-03-29 Use this as the single compact reference for implemented work across conversation quality, RAG quality, and web app behavior. @@ -140,15 +140,21 @@ Primary files: - In-UI session picker with naming support. - Per-turn retrieval trace history in the debug panel. - Session exports persist conversation-quality metadata and drift traces for later calibration. +- **Per-turn diagnostics panel**: collapsible sidebar panel showing Turn, Latency (s), Chars, Main chunks, MES chunks, Cross-removed, and Drift score (colour-coded at warning/fail thresholds) for the last 40 turns. Auto-refreshes after each stream. Route: `GET /chat/diagnostics`. 
+- **Saveable preset profiles**: collapsible sidebar panel for saving/applying/deleting named snapshots of 7 retrieval settings (`use_mmr`, `rag_rerank_enabled`, `rag_sentence_compression_enabled`, `rag_multi_query_enabled`, `rag_k`, `rag_k_mes`, `debug_context`). Profiles persisted in `configs/profiles.json`; applied in-place to the live `ConversationRuntimeConfig` without restart. Routes: `GET/POST /settings/profiles/*`. +- **One-click export bundle**: `GET /chat/export/bundle` downloads a ZIP containing `manifest.json`, `conversation.json` (full session), `retrieval_traces.json` (per-turn history), and `drift_history.json`. Button in composer quick-actions. Primary files: - `web_app.py` - `main.py` +- `core/preset_profiles.py` - `templates/index.html` - `templates/chat_message_pair.html` - `templates/chat_messages.html` - `templates/chat_single_message.html` +- `templates/diagnostics_panel.html` +- `templates/presets_panel.html` ## Current Defaults Snapshot diff --git a/docs/future_work/REFINEMENTS.md b/docs/future_work/REFINEMENTS.md index 230aee1..b0c6ff0 100644 --- a/docs/future_work/REFINEMENTS.md +++ b/docs/future_work/REFINEMENTS.md @@ -1,6 +1,6 @@ # Refinements Backlog -Last updated: 2026-03-26 +Last updated: 2026-04-03 This is the single source for remaining and future work across quality and retrieval. @@ -64,6 +64,87 @@ Implemented state lives in `docs/future_work/COPILOT_COMPACT_REFERENCE.md`. - Benchmark sentence compression and rerank combinations on a fixed fixture matrix. - Add an embedding model tiering profile (`small`, `balanced`, `quality`) with measured quality/cost tradeoffs. +### 6. Memory & Session Continuity + +Inspired by analysis of Claude's leaked markdown-based memory system. The current RAG pipeline is +entirely character-centric (lore and style). There is no persistent cross-session user memory — +facts about the user, relationship state, or conversation history carry zero weight between sessions. 
+ +A two-tier hybrid is the right approach: + +- **Tier 1 — Markdown persona memory (do first):** Small per-user file + (`memory/<character>/<user>.md`, ~200–400 tokens) containing relationship facts, user + preferences, and conversation state. Written by the LLM at session end via a lightweight + summarisation prompt. Loaded at session start and injected into the context budget before RAG + content. Human-readable, editable, and debuggable without any vector tooling. +- **Tier 2 — RAG over conversation archives (later):** When a user accumulates many sessions, + semantic search over past conversations using a `<character>_memory` ChromaDB collection. Reuses + the existing retrieval pipeline. Only worthwhile at scale. + +Prerequisites before Tier 1 can be built: + 1. User/session identity scoping (who is this user across sessions?). + 2. Session-end write hook (trigger point for LLM memory extraction). + 3. Reserved context budget slot (~300 tokens, injected before RAG). + 4. Memory write prompt (instructs the LLM to extract 5–10 facts from the session). + +Idle-time memory consolidation: merge/summarise older notes when count exceeds threshold +(equivalent to AutoDream consolidation in the Claude spec). New commands: +`/memory list`, `/memory add <fact>`, `/memory forget <id>`, `/memory clear`. + +Web UI: memory panel showing injected facts per turn (see `UI_REFINEMENTS.md §A.5`). + +### 7. Chat Experience & Conversation Control + +- **Conversation branching:** `/fork` to snapshot current state to a named branch, `/forks` to + list all saved branches, `/fork restore <name>` to rewind and continue from that point. Branches + stored in session JSON under a `branches` key; pairs with web UI controls (see `UI_REFINEMENTS.md §A.3`). +- **Character hot-reload:** `/character <name>` to swap the active character card mid-session + without a full restart. Preserve conversation history; reset persona drift state and reload the + RAG collection. Insert a visible "Character switched → `<name>`" marker in the conversation. 
+ List available cards with `/character list`. +- **Stop hooks:** user-defined stop conditions in config (`generation.stop_hooks`) — regex patterns + or keyword lists with `stop | redirect | warn` actions. Useful for OOC marker detection, + character-break detection, or content policy enforcement. Log stop events to telemetry and web + diagnostics. +- **User-defined command macros (skills):** define custom `/skill` commands in + `configs/skills.json`, mapping names to message templates injected at send time. Support template + variables: `{{char}}`, `{{user}}`, `{{last_response}}`. Commands: `/skill list`, + `/skill add