Merged
592 changes: 59 additions & 533 deletions README.md

Large diffs are not rendered by default.

15 changes: 8 additions & 7 deletions apps/web/astro.config.mjs
@@ -37,13 +37,14 @@ export default defineConfig({
{ icon: 'github', label: 'GitHub', href: 'https://github.com/EntityProcess/agentv' },
],
sidebar: [
- { label: 'Getting Started', autogenerate: { directory: 'docs/getting-started' } },
- { label: 'Evaluation', autogenerate: { directory: 'docs/evaluation' } },
- { label: 'Evaluators', autogenerate: { directory: 'docs/evaluators' } },
- { label: 'Targets', autogenerate: { directory: 'docs/targets' } },
- { label: 'Tools', autogenerate: { directory: 'docs/tools' } },
- { label: 'Guides', autogenerate: { directory: 'docs/guides' } },
- { label: 'Integrations', autogenerate: { directory: 'docs/integrations' } },
+ { label: 'Getting Started', autogenerate: { directory: 'getting-started' } },
+ { label: 'Evaluation', autogenerate: { directory: 'evaluation' } },
+ { label: 'Evaluators', autogenerate: { directory: 'evaluators' } },
+ { label: 'Targets', autogenerate: { directory: 'targets' } },
+ { label: 'Tools', autogenerate: { directory: 'tools' } },
+ { label: 'Guides', autogenerate: { directory: 'guides' } },
+ { label: 'Integrations', autogenerate: { directory: 'integrations' } },
+ { label: 'Reference', autogenerate: { directory: 'reference' } },
],
editLink: {
baseUrl: 'https://github.com/EntityProcess/agentv/edit/main/apps/web/',
6 changes: 3 additions & 3 deletions apps/web/src/components/Lander.astro
@@ -13,7 +13,7 @@ import type { Props } from '@astrojs/starlight/props';
<span class="av-wordmark">agent<span class="av-wordmark-v">v</span></span>
</a>
<div class="av-nav-links">
- <a href="/docs/getting-started/introduction/">Docs</a>
+ <a href="/docs/">Docs</a>
<a href="https://github.com/EntityProcess/agentv" target="_blank" rel="noopener noreferrer">GitHub</a>
<button class="av-nav-pill" data-command="npm install -g agentv">
<code>npm install -g agentv</code>
@@ -39,7 +39,7 @@ import type { Props } from '@astrojs/starlight/props';
Deterministic code judges + customizable LLM judges, version-controlled in Git.
</p>
<div class="av-hero-cta">
- <a href="/docs/getting-started/introduction/" class="av-btn-primary">Get Started</a>
+ <a href="/docs/" class="av-btn-primary">Get Started</a>
<a href="https://github.com/EntityProcess/agentv" class="av-btn-ghost" target="_blank" rel="noopener noreferrer">
<svg xmlns="http://www.w3.org/2000/svg" width="18" height="18" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"/></svg>
GitHub
@@ -268,7 +268,7 @@ tests:
<h2 class="av-gradient-text">Start evaluating your agents</h2>
<p class="av-footer-sub">Open source. Local-first. MIT Licensed.</p>
<div class="av-footer-actions">
- <a href="/docs/getting-started/introduction/" class="av-btn-primary">Read the docs</a>
+ <a href="/docs/" class="av-btn-primary">Read the docs</a>
<button class="av-footer-install" data-command="npm install -g agentv">
<code>$ npm install -g agentv</code>
<span class="av-copy-icon">
53 changes: 53 additions & 0 deletions apps/web/src/content/docs/getting-started/introduction.mdx
@@ -0,0 +1,53 @@
---
title: Introduction
description: What AgentV is and why it exists
sidebar:
order: 1
---

AgentV is a CLI-first AI agent evaluation framework. It evaluates your agents locally with multi-objective scoring (correctness, latency, cost, safety) from YAML specifications. Deterministic code graders + customizable LLM graders, all version-controlled in Git.

## Why AgentV?

**Best for:** Developers who want evaluation in their workflow, not a separate dashboard. Teams prioritizing privacy and reproducibility.

- **No cloud dependency** — everything runs locally
- **No server** — just install and run
- **Version-controlled** — YAML evaluation files live in Git alongside your code
- **CI/CD ready** — run evaluations in your pipeline without external API calls
- **Multiple evaluator types** — code validators, LLM graders, custom Python/TypeScript

## How AgentV Compares

| Feature | AgentV | LangWatch | LangSmith | LangFuse |
|---------|--------|-----------|-----------|----------|
| **Setup** | `npm install -g agentv` | Cloud account + API key | Cloud account + API key | Cloud account + API key |
| **Server** | None (local) | Managed cloud | Managed cloud | Managed cloud |
| **Privacy** | All local | Cloud-hosted | Cloud-hosted | Cloud-hosted |
| **CLI-first** | Yes | No | Limited | Limited |
| **CI/CD ready** | Yes | Requires API calls | Requires API calls | Requires API calls |
| **Version control** | Yes (YAML in Git) | No | No | No |
| **Evaluators** | Code + LLM + Custom | LLM only | LLM + Code | LLM only |

## Core Concepts

**Evaluation files** (`.yaml` or `.jsonl`) define test cases with expected outcomes. **Targets** specify which agent or provider to evaluate. **Graders** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison.

### Key Components

- **Eval files** — YAML or JSONL definitions of test cases
- **Tests** — Individual test entries with input messages and expected outcomes
- **Targets** — The agent or LLM provider being evaluated
- **Evaluators** — Code graders (Python/TypeScript) or LLM graders that score responses
- **Rubrics** — Structured criteria with weights for grading
- **Results** — JSONL output with scores, reasoning, and execution traces
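
To make these components concrete, a minimal eval file might look like the following sketch. The field names here are illustrative, not the authoritative AgentV schema — consult the Evaluation reference for the real format:

```yaml
# Hypothetical eval file sketch — keys are illustrative, not the
# authoritative AgentV schema.
description: Smoke test for the support agent
target: azure-openai            # which agent/provider to evaluate
tests:
  - id: refund-policy
    input: "What is your refund policy?"
    expected: "Mentions the 30-day refund window"
    evaluators:
      - type: llm_judge         # LLM grader scored against a rubric
        rubric:
          - criterion: Accuracy
            weight: 0.7
          - criterion: Tone
            weight: 0.3
      - type: code              # deterministic code grader
        script: graders/check_refund.py
```

Because the file is plain YAML, it can be versioned, diffed, and reviewed in Git alongside the agent it tests.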

## Features

- **Multi-objective scoring**: Correctness, latency, cost, safety in one run
- **Multiple evaluator types**: Code validators, LLM graders, custom Python/TypeScript
- **Built-in targets**: VS Code Copilot, Codex CLI, Pi Coding Agent, Azure OpenAI, local CLI agents
- **Structured evaluation**: Rubric-based grading with weights and requirements
- **Batch evaluation**: Run hundreds of test cases in parallel
- **Export**: JSON, JSONL, YAML formats
- **Compare results**: Compute deltas between evaluation runs for A/B testing
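
As a sketch of what multi-objective output could contain, a single result record in the JSONL output might resemble the following. The keys shown are illustrative assumptions, not the actual schema:

```json
{
  "test_id": "refund-policy",
  "score": 0.85,
  "objectives": {
    "correctness": 0.9,
    "latency_ms": 1240,
    "cost_usd": 0.004,
    "safety": 1.0
  },
  "reasoning": "Response cites the 30-day refund window and stays on policy.",
  "target": "azure-openai"
}
```

One record per test case makes it straightforward to diff two runs and compute per-objective deltas for A/B testing.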
126 changes: 126 additions & 0 deletions apps/web/src/content/docs/reference/comparison.mdx
@@ -0,0 +1,126 @@
---
title: Comparison
description: How AgentV compares to other evaluation frameworks.
---

## Quick Comparison

| Aspect | **AgentV** | **Braintrust** | **Langfuse** | **LangSmith** | **LangWatch** | **Google ADK** | **Mastra** | **OpenCode Bench** |
|--------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
| **Primary Focus** | Agent evaluation & testing | Evaluation + logging | Observability + evaluation | Observability + evaluation | LLM ops & evaluation | Agent development | Agent/workflow development | Coding agent benchmarking |
| **Language** | TypeScript/CLI | Python/TypeScript | Python/JavaScript | Python/JavaScript | Python/JavaScript | Python | TypeScript | Python/CLI |
| **Deployment** | Local (CLI-first) | Cloud | Cloud/self-hosted | Cloud only | Cloud/self-hosted/hybrid | Local/Cloud Run | Local/server | Benchmarking service |
| **Self-contained** | Yes | No (cloud) | No (requires server) | No (cloud-only) | No (requires server) | Yes | Yes (optional) | No (requires service) |
| **Evaluation Focus** | Core feature | Core feature | Yes | Yes | Core feature | Minimal | Secondary | Core feature |
| **Judge Types** | Code + LLM (custom prompts) | Code + LLM (custom) | LLM-as-judge only | LLM-based + custom | LLM + real-time | Built-in metrics | Built-in (minimal) | Multi-judge LLM (3 judges) |
| **CLI-First** | Yes | No (SDK-first) | Dashboard-first | Dashboard-first | Dashboard-first | Code-first | Code-first | Service-based |
| **Open Source** | MIT | Closed source | Apache 2.0 | Closed | Closed | Apache 2.0 | MIT | Open source |
| **Setup Time** | &lt; 2 min | 5+ min | 15+ min | 10+ min | 20+ min | 30+ min | 10+ min | 5-10 min |

## AgentV vs. Braintrust

| Feature | AgentV | Braintrust |
|---------|--------|-----------|
| **Evaluation** | Code + LLM (custom prompts) | Code + LLM (Autoevals library) |
| **Deployment** | Local (no server) | Cloud-only (managed) |
| **Open source** | MIT | Closed source |
| **Pricing** | Free | Free tier + paid plans |
| **CLI-first** | Yes | SDK-first (Python/TS) |
| **Custom judge prompts** | Markdown files (Git) | SDK-based |
| **Observability** | No | Yes (logging, tracing) |
| **Datasets** | YAML/JSONL in Git | Managed in platform |
| **CI/CD** | Native (exit codes) | API-based |
| **Collaboration** | Git-based | Web dashboard |

**Choose AgentV if:** You want local-first evaluation, open source, version-controlled evals in Git.
**Choose Braintrust if:** You want a managed platform with built-in logging, datasets, and team collaboration.

## AgentV vs. Langfuse

| Feature | AgentV | Langfuse |
|---------|--------|----------|
| **Evaluation** | Code + LLM (custom prompts) | LLM only |
| **Local execution** | Yes | No (requires server) |
| **Speed** | Fast (no network) | Slower (API round-trips) |
| **Setup** | `npm install` | Docker + database |
| **Cost** | Free | Free + $299+/mo for production |
| **Observability** | No | Full tracing |
| **Custom judge prompts** | Version in Git | API-based |
| **CI/CD ready** | Yes | Requires API calls |

**Choose AgentV if:** You iterate locally on evals, need deterministic + subjective judges together.
**Choose Langfuse if:** You need production observability + team dashboards.

## AgentV vs. LangSmith

| Feature | AgentV | LangSmith |
|---------|--------|-----------|
| **Evaluation** | Code + LLM custom | LLM-based (SDK) |
| **Deployment** | Local (no server) | Cloud only |
| **Framework lock-in** | None | LangChain ecosystem |
| **Open source** | MIT | Closed |
| **Local execution** | Yes | No (requires API calls) |
| **Observability** | No | Full tracing |

**Choose AgentV if:** You want local evaluation, deterministic judges, open source.
**Choose LangSmith if:** You're LangChain-heavy, need production tracing.

## AgentV vs. LangWatch

| Feature | AgentV | LangWatch |
|---------|--------|-----------|
| **Evaluation focus** | Development-first | Team collaboration first |
| **Execution** | Local | Cloud/self-hosted server |
| **Custom judge prompts** | Markdown files (Git) | UI-based |
| **Code judges** | Yes | LLM-focused |
| **Setup** | &lt; 2 min | 20+ min |
| **Team features** | No | Annotation, roles, review |

**Choose AgentV if:** You develop locally, want fast iteration, prefer code judges.
**Choose LangWatch if:** You need team collaboration, managed optimization, on-prem deployment.

## AgentV vs. Google ADK

| Feature | AgentV | Google ADK |
|---------|--------|-----------|
| **Purpose** | Evaluation | Agent development |
| **Evaluation capability** | Comprehensive | Built-in metrics only |
| **Setup** | &lt; 2 min | 30+ min |
| **Code-first** | YAML-first | Python-first |

**Choose AgentV if:** You need to evaluate agents (not build them).
**Choose Google ADK if:** You're building multi-agent systems.

## AgentV vs. Mastra

| Feature | AgentV | Mastra |
|---------|--------|--------|
| **Purpose** | Agent evaluation & testing | Agent/workflow development framework |
| **Evaluation** | Core focus (code + LLM judges) | Secondary, built-in only |
| **Agent Building** | No (tests agents) | Yes (builds agents with tools, workflows) |
| **Open Source** | MIT | MIT |

**Choose AgentV if:** You need to test/evaluate agents.
**Choose Mastra if:** You're building TypeScript AI agents and need orchestration.

## When to Use AgentV

**Best for:** Individual developers and teams that evaluate locally before deploying and need custom evaluation criteria.

**Use something else for:**
- Production observability → Langfuse or LangWatch
- Team dashboards → LangWatch, Langfuse, or Braintrust
- Building agents → Mastra (TypeScript) or Google ADK (Python)
- Standardized benchmarking → OpenCode Bench

## Ecosystem Recommendation

```
Build agents (Mastra / Google ADK)
Evaluate locally (AgentV)
Block regressions in CI/CD (AgentV)
Monitor in production (Langfuse / LangWatch / Braintrust)
```