diff --git a/.env.example b/.env.example index d96227c..8ef9281 100644 --- a/.env.example +++ b/.env.example @@ -17,3 +17,5 @@ OPENAI_API_KEY=sk-your-openai-key-here # OLLAMA_BASE_URL=http://127.0.0.1:11434/v1 # OPENAI_BASE_URL=http://127.0.0.1:11434/v1 # OPENAI_API_KEY is optional for local; defaults to a placeholder if OPENAI_BASE_URL/OLLAMA_BASE_URL is set (ollama provider only). +# Optional — only if you use Ollama Cloud model tags (e.g. *:cloud); local models need LLM_PROVIDER + LLM_MODEL only. +# OLLAMA_API_KEY= diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bacfc98..5a6edc1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,11 +1,17 @@ -# CI: TypeScript typecheck + Playwright agent tests against OpenAI (API key from repo secrets). +# CI: typecheck + smoke test via local Ollama. # -# Required secret (Settings → Secrets and variables → Actions): -# OPENAI_API_KEY — your OpenAI API key +# Configure without editing this file (repo → Settings → Secrets and variables → Actions): +# • Variables: LLM_PROVIDER, LLM_MODEL, optional OLLAMA_BASE_URL +# • Secret OLLAMA_API_KEY — only if you set LLM_MODEL to an Ollama *Cloud* tag (*:cloud). # -# Optional: set repository variable LLM_MODEL (e.g. gpt-4o-mini) or edit the env block below. +# Defaults if unset (all local, no API key): ollama + llama3.2:3b + http://127.0.0.1:11434/v1 +# llama3.2:3b is a good CI default: small/fast on CPU, much more reliable tool-calling than 1b. # -# Fork PRs: the test job is skipped (secrets are not available to workflows from forks). +# Fork PRs: the test job is skipped (secrets/vars from the base repo are not available to workflows from forks). +# +# Manual run (Actions → CI → Run workflow): +# • GitHub shows "Use workflow from" — pick the branch there (that version of the workflow runs). +# • Optional: set "Checkout ref" below only if you need a different ref than the branch picker. 
name: CI @@ -15,6 +21,12 @@ on: pull_request: branches: [main, master] workflow_dispatch: + inputs: + checkout_ref: + description: 'Optional — branch name or refs/heads/... to checkout. Leave empty to use the branch selected in "Use workflow from" above.' + required: false + default: '' + type: string concurrency: group: ci-${{ github.workflow }}-${{ github.ref }} @@ -26,6 +38,8 @@ jobs: timeout-minutes: 10 steps: - uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.checkout_ref != '' && github.event.inputs.checkout_ref || github.ref }} - uses: actions/setup-node@v4 with: @@ -46,13 +60,17 @@ jobs: (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) runs-on: ubuntu-latest - timeout-minutes: 60 + timeout-minutes: 45 env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - LLM_PROVIDER: openai - LLM_MODEL: gpt-4o-mini + # secrets.* overrides vars.* (both optional); fallbacks keep CI working if nothing is configured. + LLM_PROVIDER: ${{ secrets.LLM_PROVIDER || vars.LLM_PROVIDER || 'ollama' }} + LLM_MODEL: ${{ secrets.LLM_MODEL || vars.LLM_MODEL || 'llama3.2:3b' }} + OLLAMA_BASE_URL: ${{ secrets.OLLAMA_BASE_URL || vars.OLLAMA_BASE_URL || 'http://127.0.0.1:11434/v1' }} + OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }} steps: - uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.checkout_ref != '' && github.event.inputs.checkout_ref || github.ref }} - uses: actions/setup-node@v4 with: @@ -65,8 +83,47 @@ jobs: - name: Install Playwright browsers run: npx playwright install --with-deps - - name: Run Playwright tests - run: npm test + - name: Ollama Cloud models need OLLAMA_API_KEY + run: | + case "$LLM_MODEL" in + *:cloud*) + if [ -z "${OLLAMA_API_KEY}" ]; then + echo "::error title=Missing OLLAMA_API_KEY::Models tagged *:cloud use Ollama Cloud. 
Add repository secret OLLAMA_API_KEY (https://ollama.com/settings/keys). Or use a local tag (e.g. llama3.2:3b) in Variables." + exit 1 + fi + echo "OLLAMA_API_KEY is set for Cloud model." + ;; + *) + echo "LLM_MODEL=$LLM_MODEL (local tag — OLLAMA_API_KEY optional)" + ;; + esac + + - name: Install Ollama + run: curl -fsSL https://ollama.com/install.sh | sh + + - name: Start Ollama and wait for API + run: | + set -e + sudo systemctl stop ollama 2>/dev/null || true + nohup ollama serve > /tmp/ollama-serve.log 2>&1 & + echo "Waiting for http://127.0.0.1:11434 ..." + for i in $(seq 1 90); do + if curl -fsS http://127.0.0.1:11434/api/tags >/dev/null 2>&1; then + echo "Ollama is ready (after ${i}s)" + exit 0 + fi + sleep 1 + done + echo "--- ollama serve log ---" + cat /tmp/ollama-serve.log || true + exit 1 + + - name: Pull Ollama model + timeout-minutes: 30 + run: ollama pull "$LLM_MODEL" + + - name: Smoke test (single LLM case) + run: npm run test:smoke - name: Upload Playwright report (on failure) if: failure() diff --git a/README.md b/README.md index 9a126e6..e984bb7 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,27 @@ # AgentAssert -**A Playwright-based testing framework for agentic AI systems with MCP-compatible tool schemas and in-process tool orchestration.** +**Proof-of-concept:** Playwright tests + reusable assertion helpers for tool-calling LLM agents (MCP-shaped tool schemas, in-process tools in the demo). + +This repository is **`"private": true`** in `package.json` — it is **not** published to npm and is **not** a productized “framework.” It is a **reference implementation** you can clone, read, and copy from. A clean entry point exists for imports (`index.ts` → `framework/`), but publishing a real package would add build steps, versioning, and semver guarantees — out of scope for this POC. 
+ +--- + +## Repository layout + +| Path | Role | +|------|------| +| **`framework/`** | Reusable assertions (`AgentAssert`, `HeuristicContractMatcher`, `BehaviorContract`) and **shared types** (`framework/types.ts` — traces, contracts, tool shapes). | +| **`index.ts`** | Barrel export so you can `import { AgentAssert, … } from 'agent-assert'` when vendoring this repo. | +| **`examples/agent/`** | **Demo system under test:** LLM loop, `ToolRegistry`, file-reader / api-caller tools. Replace with your own agent; keep compatible `AgentTrace` / `AgentOutput` shapes if you reuse the assertions. | +| **`tests/`** | Playwright specs and fixtures wired to the demo agent via `tests/fixtures/setup.ts`. | --- ## About -A working proof-of-concept that demonstrates five testing patterns for AI agents that call tools through a **`ToolRegistry`** (the same tool definitions map cleanly to Anthropic/OpenAI tool formats and to MCP-style schemas). The repo does not run a live MCP server by default; tools execute in-process so tests stay fast and deterministic. The framework introduces `NonDeterministicMatcher` — an assertion utility that evaluates LLM outputs against semantic intent contracts instead of exact string matches. +A working proof-of-concept that demonstrates five testing patterns for AI agents that call tools through a **`ToolRegistry`** (the same tool definitions map cleanly to Anthropic/OpenAI tool formats and to MCP-style schemas). The repo does not run a live MCP server by default; tools execute in-process so tests stay fast and deterministic. + +**Behavior contracts** are checked by **`HeuristicContractMatcher`**: required fields, **case-insensitive keyword overlap**, **regex** forbidden phrases, and optional custom validators — *not* embedding similarity or LLM-as-judge semantics. The weighted **`confidence`** score is a tuning signal, not a calibrated measure of meaning. 
That avoids pretending the cheap matcher is “semantic” while still letting you reject exact-string tests for variable LLM wording. See **Heuristic matching: scope and limits** below. The deliberate use of Playwright (not Jest, not Vitest) as the test runner is itself a publishable insight. @@ -17,7 +32,7 @@ The deliberate use of Playwright (not Jest, not Vitest) as the test runner is it ``` ┌─────────────────────────────────────────────────────────┐ │ YOUR TEST FILE │ -│ import { AgentAssert } from '../../framework/AgentAssert.js' │ +│ import { AgentAssert } from 'agent-assert' // or ../../framework/AgentAssert.js │ │ const trace = await agent.run("some prompt") │ │ AgentAssert.toolWasInvoked(trace, 'file-reader') │ │ AgentAssert.satisfiesContract(trace.output, CONTRACT) │ @@ -36,11 +51,11 @@ The deliberate use of Playwright (not Jest, not Vitest) as the test runner is it │ 5. Capture TRACE │ └───────────┬─────────────┘ └────────┬────────────┘ │ │ ┌──────────▼──────────────┐ - ┌──────────▼───────────┐ │ NonDeterministicMatcher │ - │ ToolRegistry │ │ │ - │ │ │ Layer 1: Structure │ - │ file-reader → exec() │ │ Layer 2: Semantics │ - │ api-caller → exec() │ │ Layer 3: Forbidden │ + ┌──────────▼───────────┐ │ HeuristicContractMatcher │ + │ ToolRegistry │ │ │ + │ │ │ Layer 1: Structure │ + │ file-reader → exec() │ │ Layer 2: Keywords (BoW) │ + │ api-caller → exec() │ │ Layer 3: Forbidden (regex)│ └──────────────────────┘ │ Layer 4: Custom │ │ │ │ Returns: MatchResult │ @@ -58,8 +73,8 @@ The deliberate use of Playwright (not Jest, not Vitest) as the test runner is it 6. **Loop continues** until the model produces a final text response 7. **Agent builds `AgentTrace`** — captures EVERY step (tool calls, tool results, reasoning, output) 8. **Test receives the trace** and passes it to AgentAssert methods -9. **AgentAssert uses NonDeterministicMatcher** to evaluate output against BehaviorContracts -10. **MatchResult returned** with confidence score and detailed breakdown +9. 
**AgentAssert uses HeuristicContractMatcher** to score output against BehaviorContracts +10. **MatchResult returned** with heuristic confidence and per-layer details --- @@ -88,9 +103,9 @@ AgentAssert.expectMatched(result, 'file-reader should be invoked'); // embeds Ag ### Pattern 2: Behavior Contract Validation **File:** `tests/behavioral/output-contract.spec.ts` -**What it tests:** Does the output satisfy a semantic contract (not exact string match)? +**What it tests:** Does the output satisfy a **heuristic** contract (fields + keywords + patterns), not an exact string match? -**Why it's unique:** `expect(output).toBe("...")` breaks on every LLM run. Contracts define rules that any correct output must satisfy, regardless of exact phrasing. +**Why it's unique:** `expect(output).toBe("...")` breaks on every LLM run. Contracts define cheap rules that often track “good enough” outputs. Synonymous phrasing can still fail if keywords don’t align — widen keywords, lower thresholds, add a **customValidator**, or upgrade to embeddings / LLM-judge (see limits section below). **Key assertion:** ```typescript @@ -100,9 +115,9 @@ AgentAssert.expectMatched(result, 'SUMMARIZATION contract should pass'); **What to look at in the code:** - `BehaviorContract.ts` — pre-built contracts with required fields, keywords, forbidden patterns -- `NonDeterministicMatcher.evaluate()` — the three-layer evaluation engine -- `minKeywordMatchRatio` — controls how strict keyword matching is -- `forbiddenPatterns` — hard-fail patterns that override the confidence score +- `HeuristicContractMatcher.evaluate()` — structure + keyword overlap + forbidden regex (+ optional custom) +- `minKeywordMatchRatio` — how much of the keyword list must appear as substrings +- `forbiddenPatterns` — regex matches force a contract failure path --- @@ -187,16 +202,14 @@ AgentAssert.expectMatched( ## Key Files Explained -### agent/types.ts -Every type definition. 
Read this first — everything else depends on these types. +### framework/types.ts +Shared types for traces, contracts, and (in the demo) tool definitions. Assertions and `examples/agent/` both import from here so the SUT and matchers stay aligned. -- `AgentTrace` — the backbone. Every assertion operates on traces. -- `TraceStep` — one decision the agent made (tool_call, tool_result, reasoning, output) -- `ContractDefinition` — the rules that define "correct" for non-deterministic outputs -- `MatchResult` — what assertions return (confidence score + details) +- `AgentTrace` — what assertions operate on +- `TraceStep`, `AgentOutput`, `ContractDefinition`, `MatchResult`, `ToolDefinition`, … -### agent/agent.ts -The System Under Test. The tool-calling loop is the core pattern: +### examples/agent/agent.ts +**Demo system under test** — not part of the reusable assertion layer. The tool-calling loop is the reference pattern: 1. Send prompt + tool definitions to **Anthropic Messages API** or **OpenAI Chat Completions** (see `AgentConfig.provider`) 2. The model responds with text and/or tool calls (`tool_use` vs `function` / `tool_calls` depending on provider) @@ -209,27 +222,39 @@ The System Under Test. The tool-calling loop is the core pattern: **Important:** The system prompt in this file shapes agent behavior. If you change it, update the test contracts to match. -### agent/tools/file-reader.ts and api-caller.ts -Tools use MCP-aligned JSON schemas and register through **`ToolRegistry`**. In this POC they run locally (file-reader reads from disk, api-caller uses mock responses). To connect them to a real MCP server, replace the `execute` function with MCP transport calls — the schema stays the same. +### examples/agent/tools/file-reader.ts and api-caller.ts +Demo tools use MCP-aligned JSON schemas and register through **`ToolRegistry`**. In this POC they run locally (file-reader reads from disk, api-caller uses mock responses). 
To connect them to a real MCP server, replace the `execute` function with MCP transport calls — the schema stays the same. **Security note:** `file-reader.ts` includes path traversal protection. Read the comments. -### agent/tools/registry.ts +### examples/agent/tools/registry.ts Maps tool names to definitions. Provides `toAnthropicTools()` and `toOpenAITools()` so the same tool definitions work with either API. This is the bridge between your tool definitions and the LLM. -### framework/NonDeterministicMatcher.ts -**The core innovation.** Three evaluation layers: +### framework/HeuristicContractMatcher.ts +**Heuristic evaluation (not deep semantics).** Layers: -1. **Structural** (40% weight) — are required fields present? -2. **Semantic** (35% weight) — do enough intent keywords appear? -3. **Forbidden** (25% weight) — do any red-flag patterns match? +1. **Structural** (40% weight) — required fields (and optional length) +2. **Keywords** (35% weight, or 25% + 10% custom when `customValidator` is set) — bag-of-words style: substring presence for each listed keyword +3. **Forbidden** (25% weight) — regex patterns; any hit triggers the contract-failure path +4. **Custom** (optional) — your own validator in the contract -Forbidden patterns cause a hard failure regardless of other scores. +The headline `confidence` is a **weighted average of those scores** — useful for ranking and thresholds, not as a semantic similarity score. 
**Tuning knobs:** -- `minKeywordMatchRatio` in the contract — lower = more lenient -- `confidence` threshold in the test — lower = fewer flaky tests -- `forbiddenPatterns` — add patterns to catch more failure modes +- `minKeywordMatchRatio` — lower = more lenient keyword layer +- Assertion threshold on `result.confidence` — lower = fewer flaky tests +- `forbiddenPatterns` — stricter guardrails (regex can be brittle; test them) +- **Synonyms** — add alternate phrasings to `requiredIntentKeywords`, or use `customValidator` / external judges (below) + +### Heuristic matching: scope and limits + +| Approach | What this repo does | What would be “more semantic” | +|----------|---------------------|--------------------------------| +| Keyword list | Substring checks after lowercasing | LLM-as-judge, entailment models | +| Confidence | Weighted heuristic blend | Calibrated metrics or judge scores | +| Same meaning, different words | Can **fail** unless keywords or patterns cover both | Embeddings vs reference texts, synonym lists | + +**Possible upgrades (not implemented here):** call a second model to grade outputs against the contract; embed output and reference snippets and compare cosine similarity; use an NLP library for paraphrase / NLI. Those add latency, cost, and complexity — the heuristic matcher stays intentionally cheap and explicit. ### framework/BehaviorContract.ts Pre-built contracts for common task types. Each contract defines what "correct" means for that task type. The five contracts: SUMMARIZATION, API_ACTION, MULTI_STEP, SCOPE_BOUNDED, GRACEFUL_FAILURE. @@ -252,14 +277,29 @@ Helpers: ### tests/env-llm.ts and `.env` -Playwright loads **`tests/env-llm.ts`** from **`playwright.config.ts`** (`applyLlmVarsFromDotEnv()`). Selected keys from a project-root **`.env`** file are merged into `process.env` (`.env` wins over existing shell vars for those keys): `LLM_PROVIDER`, `LLM_MODEL`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`. 
Copy **`.env.example`** to **`.env`** and fill in keys so tests and IDE runs see the same configuration without exporting variables manually. +Playwright loads **`tests/env-llm.ts`** from **`playwright.config.ts`** (`applyLlmVarsFromDotEnv()`). Selected keys from a project-root **`.env`** file are merged into `process.env` (`.env` wins over existing shell vars for those keys): `LLM_PROVIDER`, `LLM_MODEL`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `OLLAMA_API_KEY`, `OLLAMA_BASE_URL`. Copy **`.env.example`** to **`.env`** and fill in keys so tests and IDE runs see the same configuration without exporting variables manually. + +**GitHub Actions CI** (`.github/workflows/ci.yml`) installs Ollama, pulls the configured model, and runs the smoke test. **Provider, model, and optional base URL** are read from repository **Variables** or **Secrets** (no code change): + +| Name | Purpose | +|------|--------| +| `LLM_PROVIDER` | e.g. `ollama` | +| `LLM_MODEL` | e.g. `llama3.2:3b` or `deepseek-v3.2:cloud` | +| `OLLAMA_BASE_URL` | Optional; default `http://127.0.0.1:11434/v1` | +| `OLLAMA_API_KEY` | **Repository secret** — required when `LLM_MODEL` is an Ollama **Cloud** tag (`*:cloud`). Without it, Ollama returns **HTTP 500**. [Create a key](https://ollama.com/settings/keys). | + +**Best practice:** put **non-sensitive** values under **Variables**. Use **Secrets** for **`OLLAMA_API_KEY`** and other keys. Storing `LLM_MODEL` as a *secret* works but **masks** it in logs—prefer **Variables** for model name unless needed. If neither Variable nor Secret is set for provider/model, CI defaults to **`ollama`** + **`llama3.2:3b`** (local, no key; stronger tool-calling than `1b`). + +**Run CI on a chosen branch:** **Actions** → **CI** → **Run workflow**. GitHub shows **Use workflow from** — that dropdown is the branch selector for manual runs (the workflow file and checkout use that branch unless you override). 
Optionally fill **Checkout ref** in the form to check out a different branch or full ref (e.g. `refs/heads/feature/x`). + +**Local vs Cloud tags:** tags like `llama3.2:3b` run fully locally. Tags ending in **`:cloud`** need **`OLLAMA_API_KEY`** in Secrets. Override **`LLM_MODEL`** in Variables if you want another local model (e.g. newer builds when Ollama adds them). --- ### playwright.config.ts (high level) - **`retries: 1`** — each failed test runs one more time (LLM outputs vary) -- **Timeouts:** default **45s**; **`behavioral`** project **60s** (multi-step runs); **`boundary`** **45s** +- **Timeouts:** default **45s**; **`behavioral`** project **120s** locally, **300s** when **`CI=true`** (e.g. GitHub Actions + Ollama on CPU); **`boundary`** **45s** - **`trace: 'off'`** — browser-style Playwright traces are disabled (this suite does not use a browser). Failures still get rich attachments from **`registerAgentTraceForDiagnostics`** in `tests/fixtures/setup.ts` (see below) - **`workers: 3`** — tune for your API rate limits - **HTML report `title`** — includes resolved LLM provider and model for quick scanning @@ -350,7 +390,7 @@ npx playwright show-report ## How to Extend ### Add a New Tool -1. Create `agent/tools/your-tool.ts` following the same factory pattern as `file-reader.ts` +1. Create `examples/agent/tools/your-tool.ts` following the same factory pattern as `file-reader.ts` 2. Register it in the ToolRegistry in your test setup 3. Add mock responses in `setup.ts` 4. Write tests using `AgentAssert.toolWasInvoked(trace, 'your-tool')` @@ -366,14 +406,14 @@ npx playwright show-report 1. Add a static method to `AgentAssert.ts` 2. Accept `AgentTrace` or `AgentOutput` as input 3. Return `MatchResult` -4. Use `NonDeterministicMatcher` methods internally if needed +4. Use `HeuristicContractMatcher` methods internally if needed 5. Include detailed reasons in the `details` array ### Adapt for Another LLM Provider (beyond Anthropic, OpenAI, and Ollama) -1. 
Add a branch in `agent/agent.ts` alongside the existing Anthropic and OpenAI-compatible loops +1. Add a branch in `examples/agent/agent.ts` alongside the existing Anthropic and OpenAI-compatible loops 2. Add a `toYourProviderTools()` (or equivalent) on `ToolRegistry` if the tool schema differs 3. Map that provider’s tool-call and tool-result messages into the same `TraceStep` shapes the framework already expects -4. The framework layer (AgentAssert, NonDeterministicMatcher, BehaviorContract) stays UNCHANGED — it operates on `AgentTrace`, which is provider-agnostic +4. The framework layer (AgentAssert, HeuristicContractMatcher, BehaviorContract) stays UNCHANGED — it operates on `AgentTrace`, which is provider-agnostic ### Connect to a Real MCP Server 1. Replace the `execute` function in your tool with MCP client calls @@ -420,8 +460,8 @@ Each test run calls a real LLM API. Costs depend on provider and model. ## Troubleshooting -**Tests timeout (>60s):** -LLM APIs can be slow. Increase `timeout` in `playwright.config.ts`. Check your API key is valid for the chosen provider (`LLM_PROVIDER` / `AgentConfig.provider`). Check rate limits. +**Tests time out (behavioral tests: 120s local, 300s on CI):** +LLM APIs and local Ollama on CPU can be slow (especially in GitHub Actions — first inference after `ollama pull` can take minutes). The **`behavioral`** project uses a longer cap when **`CI=true`**. You can raise `behavioralTimeoutMs` in `playwright.config.ts` if needed. For Ollama in CI, ensure the model is pulled before tests and the runner has enough RAM. **Tests are flaky (pass sometimes, fail sometimes):** This is expected with LLM testing. Three strategies: @@ -433,6 +473,9 @@ This is expected with LLM testing. Three strategies: **Wrong provider or API URL (401 / unexpected host):** Confirm `LLM_PROVIDER` matches the key you set. For `openai`, set **`OPENAI_BASE_URL`** for a custom endpoint; **`OLLAMA_BASE_URL` is not read** for that provider. 
Use **`LLM_PROVIDER=ollama`** with Ollama’s `/v1` base if you intend local Ollama. +**Ollama `500` / `internal service error` with `*:cloud` models:** +Cloud-tagged models require an Ollama Cloud API key and fail with this error when it is missing. Set **`OLLAMA_API_KEY`**, or switch to a **local** model tag (e.g. `llama3.2:3b`), which needs no key. + **Agent output is not JSON:** The system prompt tells Claude to respond in JSON, but it sometimes wraps it in markdown fences. The `parseOutput()` method in `agent.ts` handles this. If you see `taskType: "unknown"`, the JSON parsing failed entirely — check the raw text in the trace. diff --git a/agent/types.ts b/agent/types.ts deleted file mode 100644 index 77224a1..0000000 --- a/agent/types.ts +++ /dev/null @@ -1,170 +0,0 @@ -/** - * agent/types.ts - * - * ARCHITECTURE ROLE: Shared type definitions. - * Every component in the system imports from here. - * If you change a type here, the compiler tells you every file that breaks. - * - * KEY DESIGN DECISION: The AgentTrace type is the backbone of the entire - * testing framework. Every assertion method in AgentAssert operates on - * traces, not on raw outputs. This is what makes the framework work — - * you're testing the agent's BEHAVIOR (what tools it chose, what params - * it passed, what path it took), not just its final answer. - */ - -// ───────────────────────────────────────────── -// TOOL DEFINITIONS -// ───────────────────────────────────────────── - -/** - * Describes a tool the agent can call. Maps directly to - * Anthropic's tool_use schema (and by extension, MCP protocol). - * - * The `execute` function is what actually runs when the agent - * decides to use this tool. In production, this calls real services. - * In tests, you swap it with a mock. 
- */ -export interface ToolDefinition { - name: string; - description: string; - inputSchema: Record; // JSON Schema for the tool's parameters - execute: (input: Record) => Promise; -} - -/** - * What a tool returns after execution. 
- * `success` flag is critical — the retry-behavior tests - * check how the agent responds when success=false. - */ -export interface ToolResult { - success: boolean; - data: unknown; - error?: string; -} - -// ───────────────────────────────────────────── -// AGENT TRACE (the core testing data structure) -// ───────────────────────────────────────────── - -/** - * A single step in the agent's execution. - * - * WHY THIS MATTERS: - * Traditional testing checks input → output. - * Agent testing checks input → [decision₁, decision₂, ... decisionₙ] → output. - * - * Each TraceStep records ONE decision the agent made: - * - 'tool_call': Agent decided to invoke a specific tool with specific params - * - 'tool_result': The tool returned data (or failed) - * - 'reasoning': Agent's internal reasoning (from Claude's response text) - * - 'output': Agent's final answer - * - * The sequence of steps IS the agent's behavior. Your tests assert - * against this sequence, not against the final string. - */ -export interface TraceStep { - type: 'tool_call' | 'tool_result' | 'reasoning' | 'output'; - toolName?: string; - toolInput?: Record; - toolOutput?: unknown; - content?: string; - timestamp: number; -} - -/** - * Complete record of an agent run. - * This is what every test receives. This is what every assertion inspects. - * - * EXTENDING THIS: - * When you adapt this framework for other domains, you might add: - * - `tokenUsage: { prompt: number, completion: number }` for cost tracking - * - `parentTraceId: string` for multi-agent orchestration testing - * - `guardrailResults: GuardrailCheck[]` for safety testing - */ -export interface AgentTrace { - input: string; // The original natural language prompt - steps: TraceStep[]; // Ordered list of everything the agent did - output: AgentOutput; // The final structured result - metadata: { - model: string; // Which LLM model was used - /** Which API was used (`anthropic`, `openai`, or `ollama`). Omitted in older traces. 
*/ - provider?: 'anthropic' | 'openai' | 'ollama'; - durationMs: number; // Total wall-clock time - toolCallCount: number; // How many tools were invoked - retryCount: number; // How many retries happened - }; -} - -/** - * The structured output the agent returns. - * - * WHY STRUCTURED AND NOT JUST A STRING: - * If the agent returns free text, you can't reliably assert on it. - * By forcing structured output, you can check: - * - Did the agent produce the right type of result? - * - Did it include all required fields? - * - Are the values within expected ranges? - * - * The `toolsUsed` array is particularly important — it's the - * agent's self-report of which tools it called. Your tests - * cross-reference this against the actual trace to catch lies. - */ -export interface AgentOutput { - taskType: string; // Classification of what the agent did - result: unknown; // The actual payload (varies by task) - toolsUsed: string[]; // Which tools the agent reports using - confidence: number; // 0-1 confidence score - summary: string; // Human-readable summary -} - -// ───────────────────────────────────────────── -// BEHAVIOR CONTRACTS -// ───────────────────────────────────────────── - -/** - * Defines what "correct" means for a specific type of agent output. - * This replaces exact string matching with semantic rules. - * - * EXAMPLE: - * For a SUMMARIZATION contract: - * - requiredFields: ['summary', 'sourceFile'] - * - requiredIntentKeywords: ['summary', 'key points', 'overview'] - * - maxLengthChars: 500 - * - forbiddenPatterns: [/I don't know/, /I cannot/] - * - * The NonDeterministicMatcher evaluates output against these rules - * and returns a confidence score, not a binary pass/fail. 
- */ -export interface ContractDefinition { - name: string; - description: string; - requiredFields: string[]; // Fields that MUST exist in output - requiredIntentKeywords: string[]; // At least N of these must appear - minKeywordMatchRatio: number; // What fraction of keywords must match (0-1) - forbiddenPatterns: RegExp[]; // Patterns that must NOT appear - maxLengthChars?: number; // Optional length constraint - customValidator?: (output: unknown) => ValidationResult; // Escape hatch for complex rules -} - -export interface ValidationResult { - passed: boolean; - score: number; // 0-1 confidence - reason: string; // Human-readable explanation of why it passed/failed -} - -/** - * What the NonDeterministicMatcher returns. - * - * KEY INSIGHT: `confidence` is not binary. - * An output might score 0.7 — it partially satisfies the contract. - * Your test decides the threshold: strict tests require 0.9+, - * exploratory tests might accept 0.5+. - * - * `details` tells you exactly what matched and what didn't, - * so when a test fails, you know WHY without re-reading the LLM output. - */ -export interface MatchResult { - matched: boolean; - confidence: number; - details: string[]; // List of what passed and what failed -} diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..2dc7350 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,9 @@ +# Examples + +## `agent/` — demo system under test + +This folder holds a **reference LLM agent** (tool loop + `ToolRegistry` + sample tools). It exists to drive the Playwright tests in `tests/` and to show how `AgentTrace` / `AgentOutput` are produced. + +It is **not** the reusable assertion library — that lives in `framework/` at the repo root and is re-exported from `index.ts`. + +To try your own agent: implement a runner that yields the same trace shapes (see `framework/types.ts`), wire it in `tests/fixtures/setup.ts`, and keep or replace the demo tools. 
diff --git a/agent/agent.ts b/examples/agent/agent.ts similarity index 94% rename from agent/agent.ts rename to examples/agent/agent.ts index b69e0cb..b89c91b 100644 --- a/agent/agent.ts +++ b/examples/agent/agent.ts @@ -1,7 +1,7 @@ /** - * agent/agent.ts + * examples/agent/agent.ts * - * THE AGENT — the System Under Test (SUT) + * Demo agent — reference system under test for this POC (not the assertion library). * * WHAT IT DOES: * 1. Takes a natural language task from the user @@ -12,7 +12,9 @@ * 6. The agent records EVERY step in a trace (AgentTrace) * * Provider selection: set `AgentConfig.provider`, or `LLM_PROVIDER=anthropic|openai|ollama`. - * Keys: `ANTHROPIC_API_KEY` (Anthropic), `OPENAI_API_KEY` (OpenAI cloud or Ollama dummy). + * Keys: `ANTHROPIC_API_KEY` (Anthropic), `OPENAI_API_KEY` (OpenAI). For **`LLM_PROVIDER=ollama`**, local models + * typically need only **`LLM_MODEL`**; the client uses the placeholder apiKey `ollama`. Optional **`OLLAMA_API_KEY`** + * if you use Ollama Cloud–hosted model tags (e.g. `*:cloud`). * Local Ollama: `LLM_PROVIDER=ollama` or `LLM_PROVIDER=openai` with `OPENAI_BASE_URL` * (e.g. `http://127.0.0.1:11434/v1`). See `.env.example`. */ @@ -26,7 +28,7 @@ import { AgentOutput, TraceStep, ToolResult, -} from './types.js'; +} from '../../framework/types.js'; /** Which vendor API backs the agent. `ollama` uses the OpenAI SDK against a local Ollama server. */ export type LlmProvider = 'anthropic' | 'openai' | 'ollama'; @@ -112,12 +114,20 @@ function resolveAgentConfig(config: AgentConfig): ResolvedAgentConfig { ? resolveOpenAIBaseURL(provider, config) : undefined; - let apiKey = - config.apiKey ?? - (provider === 'openai' || provider === 'ollama' - ? process.env.OPENAI_API_KEY - : process.env.ANTHROPIC_API_KEY) ?? - ''; + let apiKey = config.apiKey?.trim() ?? ''; + + if (!apiKey) { + if (provider === 'anthropic') { + apiKey = process.env.ANTHROPIC_API_KEY?.trim() ?? 
''; + } else if (provider === 'ollama') { + apiKey = + process.env.OLLAMA_API_KEY?.trim() || + process.env.OPENAI_API_KEY?.trim() || + ''; + } else { + apiKey = process.env.OPENAI_API_KEY?.trim() ?? ''; + } + } if ((provider === 'openai' || provider === 'ollama') && !apiKey && baseURL) { apiKey = 'ollama'; @@ -127,7 +137,7 @@ function resolveAgentConfig(config: AgentConfig): ResolvedAgentConfig { provider === 'openai' ? 'gpt-4o' : provider === 'ollama' - ? 'llama3:latest' + ? 'llama3.2:3b' : 'claude-sonnet-4-20250514'; return { diff --git a/agent/tools/api-caller.ts b/examples/agent/tools/api-caller.ts similarity index 98% rename from agent/tools/api-caller.ts rename to examples/agent/tools/api-caller.ts index 9b58426..1e2e8bd 100644 --- a/agent/tools/api-caller.ts +++ b/examples/agent/tools/api-caller.ts @@ -1,5 +1,5 @@ /** - * agent/tools/api-caller.ts + * examples/agent/tools/api-caller.ts * * MCP TOOL #2: API Caller * @@ -27,7 +27,7 @@ * - Connect to your actual Jira/ServiceNow/Datadog APIs */ -import { ToolDefinition, ToolResult } from '../types.js'; +import { ToolDefinition, ToolResult } from '../../../framework/types.js'; /** * Configuration for the API caller. diff --git a/agent/tools/file-reader.ts b/examples/agent/tools/file-reader.ts similarity index 86% rename from agent/tools/file-reader.ts rename to examples/agent/tools/file-reader.ts index 6a6b6e8..5c0cafe 100644 --- a/agent/tools/file-reader.ts +++ b/examples/agent/tools/file-reader.ts @@ -1,5 +1,5 @@ /** - * agent/tools/file-reader.ts + * examples/agent/tools/file-reader.ts * * MCP TOOL #1: File Reader * @@ -33,7 +33,15 @@ import * as fs from 'fs/promises'; import * as path from 'path'; -import { ToolDefinition, ToolResult } from '../types.js'; +import { ToolDefinition, ToolResult } from '../../../framework/types.js'; + +/** Small / weak models sometimes omit `filePath` or use snake_case — normalize before resolve(). 
*/ +function pickFilePath(input: Record): string | undefined { + const raw = input.filePath ?? input.file_path ?? input.path; + if (typeof raw === 'string' && raw.trim()) return raw.trim(); + if (raw != null && typeof raw !== 'object') return String(raw).trim() || undefined; + return undefined; +} /** * Creates a file-reader tool instance. @@ -96,7 +104,16 @@ export function createFileReaderTool(basePath: string = '/tmp/agent-files'): Too * The path.resolve + startsWith check below prevents this. */ execute: async (input: Record): Promise => { - const filePath = input.filePath as string; + const filePath = pickFilePath(input); + if (!filePath) { + return { + success: false, + data: null, + error: + 'Missing file path. Pass filePath as a string (e.g. "logs/test-results.log").', + }; + } + const encoding = (input.encoding as BufferEncoding) || 'utf-8'; // SECURITY: Resolve the full path and verify it's within basePath. diff --git a/agent/tools/registry.ts b/examples/agent/tools/registry.ts similarity index 90% rename from agent/tools/registry.ts rename to examples/agent/tools/registry.ts index 8f949d5..033428f 100644 --- a/agent/tools/registry.ts +++ b/examples/agent/tools/registry.ts @@ -1,5 +1,5 @@ /** - * agent/tools/registry.ts + * examples/agent/tools/registry.ts * * TOOL REGISTRY * @@ -22,7 +22,7 @@ */ import type OpenAI from 'openai'; -import { ToolDefinition, ToolResult } from '../types.js'; +import { ToolDefinition, ToolResult } from '../../../framework/types.js'; export class ToolRegistry { private tools: Map = new Map(); @@ -62,7 +62,16 @@ export class ToolRegistry { error: `Tool "${name}" is not registered. Available tools: ${this.listNames().join(', ')}`, }; } - return tool.execute(input); + try { + return await tool.execute(input); + } catch (err: unknown) { + const message = err instanceof Error ? 
err.message : String(err); + return { + success: false, + data: null, + error: `Tool "${name}" failed: ${message}`, + }; + } } /** diff --git a/framework/AgentAssert.ts b/framework/AgentAssert.ts index d1a5939..329a59b 100644 --- a/framework/AgentAssert.ts +++ b/framework/AgentAssert.ts @@ -39,8 +39,8 @@ */ import { expect } from '@playwright/test'; -import { AgentTrace, ContractDefinition, MatchResult } from '../agent/types.js'; -import { NonDeterministicMatcher } from './NonDeterministicMatcher.js'; +import { AgentTrace, ContractDefinition, MatchResult } from './types.js'; +import { HeuristicContractMatcher } from './HeuristicContractMatcher.js'; export class AgentAssert { @@ -141,8 +141,8 @@ export class AgentAssert { /** * ASSERTION 2: Does the output satisfy a behavior contract? * - * This is where the NonDeterministicMatcher does its work. - * Instead of checking exact values, we check semantic rules. + * This is where HeuristicContractMatcher does its work: fields, keyword overlap, + * forbidden regexes — not LLM-grade semantics (see HeuristicContractMatcher.ts). 
* * @param output - The agent's structured output (AgentOutput) * @param contract - The behavior contract to evaluate against @@ -169,7 +169,7 @@ export class AgentAssert { contract: ContractDefinition, minConfidence: number = 0.5 ): MatchResult { - const result = NonDeterministicMatcher.evaluate(output, contract); + const result = HeuristicContractMatcher.evaluate(output, contract); // Override the matched flag based on minConfidence return { diff --git a/framework/BehaviorContract.ts b/framework/BehaviorContract.ts index 85dc10f..1c001b3 100644 --- a/framework/BehaviorContract.ts +++ b/framework/BehaviorContract.ts @@ -1,7 +1,7 @@ /** * framework/BehaviorContract.ts * - * BEHAVIOR CONTRACTS — what "correct" means when outputs are non-deterministic + * BEHAVIOR CONTRACTS — what "correct" means when LLM wording varies run to run * * THE PROBLEM: * In traditional testing, you write: @@ -40,7 +40,7 @@ * - Be under 1000 chars (it's a creation confirmation, not a novel) */ -import { ContractDefinition, ValidationResult } from '../agent/types.js'; +import { ContractDefinition, ValidationResult } from './types.js'; /** * Pre-built contracts for common agent task types. diff --git a/framework/HeuristicContractMatcher.ts b/framework/HeuristicContractMatcher.ts new file mode 100644 index 0000000..f0f8155 --- /dev/null +++ b/framework/HeuristicContractMatcher.ts @@ -0,0 +1,236 @@ +/** + * framework/HeuristicContractMatcher.ts + * + * HEURISTIC CONTRACT EVALUATION — for LLM outputs that vary in wording + * + * WHAT THIS IS (honest scope): + * ──────────────────────────── + * This is **not** deep semantic understanding. It does **not** embed text, + * call an LLM-as-judge, or parse meaning the way a human does. 
It applies + * **cheap, deterministic heuristics**: + * + * - Required fields present (structural) + * - Substring / bag-of-words overlap with a keyword list (“intent keywords”) + * - **Regex** forbidden patterns + * - Optional custom validator + * + * The numeric **confidence** is a **weighted average of those heuristic + * scores** — a tuning aid and ranking signal, not a calibrated probability + * of semantic correctness. Treat it accordingly in assertions. + * + * WHY IT STILL EXISTS: + * ──────────────────── + * Exact `expect(output).toBe("...")` fails on every LLM run. Rules based on + * fields + keyword coverage + forbidden phrases often **do** catch wrong + * behavior cheaply. But phrasing that is **semantically equivalent** can + * still miss keywords (e.g. “unable to locate the file” vs “file not found”) + * unless your keyword lists and thresholds cover those variants — or you + * add synonyms / move to embeddings / LLM grading (see README). + * + * LAYERS (implementation): + * ───────────────────────── + * 1. STRUCTURE — required fields, optional length + * 2. KEYWORDS — case-insensitive substring checks; ratio vs `minKeywordMatchRatio` + * 3. FORBIDDEN — regex matches → hard failure path + * 4. CUSTOM — contract-supplied validator (optional) + * + * SCORING WEIGHTS: structural 40%, keyword 35% (or 25% + custom 10% if set), + * forbidden 25%. Forbidden violations force a failed match regardless of headline confidence. + */ + +import { ContractDefinition, MatchResult } from './types.js'; + +export class HeuristicContractMatcher { + /** + * Evaluate an output against a behavior contract using the heuristic layers above. 
+ * + * @example + * const result = HeuristicContractMatcher.evaluate(trace.output, BehaviorContract.SUMMARIZATION); + */ + static evaluate(output: unknown, contract: ContractDefinition): MatchResult { + const details: string[] = []; + const scores: { weight: number; score: number; label: string }[] = []; + + const structuralResult = this.checkStructure(output, contract); + scores.push({ weight: 0.4, score: structuralResult.score, label: 'structural' }); + details.push(...structuralResult.details); + + const keywordResult = this.checkKeywordOverlap(output, contract); + const keywordWeight = contract.customValidator ? 0.25 : 0.35; + scores.push({ weight: keywordWeight, score: keywordResult.score, label: 'keywords' }); + details.push(...keywordResult.details); + + const forbiddenResult = this.checkForbiddenPatterns(output, contract); + scores.push({ weight: 0.25, score: forbiddenResult.score, label: 'forbidden' }); + details.push(...forbiddenResult.details); + + if (contract.customValidator) { + const customResult = contract.customValidator(output); + scores.push({ weight: 0.1, score: customResult.score, label: 'custom' }); + details.push(`[custom] ${customResult.reason}`); + } + + const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0); + const confidence = + scores.reduce((sum, s) => sum + s.weight * s.score, 0) / totalWeight; + + const hasForbiddenViolation = forbiddenResult.score < 1.0; + + return { + matched: !hasForbiddenViolation && confidence >= 0.5, + confidence: hasForbiddenViolation ? 
Math.min(confidence, 0.3) : confidence, + details, + }; + } + + private static checkStructure( + output: unknown, + contract: ContractDefinition + ): { score: number; details: string[] } { + const details: string[] = []; + + if (typeof output !== 'object' || output === null) { + details.push('[structural] FAIL: output is not an object'); + return { score: 0, details }; + } + + const obj = output as Record; + let presentCount = 0; + + for (const field of contract.requiredFields) { + if (field in obj && obj[field] !== undefined && obj[field] !== null) { + presentCount++; + details.push(`[structural] PASS: field "${field}" present`); + } else { + details.push(`[structural] FAIL: field "${field}" missing`); + } + } + + if (contract.maxLengthChars) { + const outputStr = JSON.stringify(output); + if (outputStr.length > contract.maxLengthChars) { + details.push( + `[structural] FAIL: output length ${outputStr.length} exceeds max ${contract.maxLengthChars}` + ); + } else { + details.push(`[structural] PASS: output length ${outputStr.length} within limit`); + } + } + + const score = + contract.requiredFields.length > 0 + ? presentCount / contract.requiredFields.length + : 1.0; + + return { score, details }; + } + + /** + * Keyword layer: case-insensitive substring presence against `requiredIntentKeywords`. + * Not synonym-aware; expand keywords or lower thresholds if tests are brittle. 
+ */ + private static checkKeywordOverlap( + output: unknown, + contract: ContractDefinition + ): { score: number; details: string[] } { + const details: string[] = []; + const outputStr = JSON.stringify(output).toLowerCase(); + + let matchCount = 0; + const matchedKeywords: string[] = []; + const missedKeywords: string[] = []; + + for (const keyword of contract.requiredIntentKeywords) { + if (outputStr.includes(keyword.toLowerCase())) { + matchCount++; + matchedKeywords.push(keyword); + } else { + missedKeywords.push(keyword); + } + } + + const ratio = + contract.requiredIntentKeywords.length > 0 + ? matchCount / contract.requiredIntentKeywords.length + : 1.0; + + const passed = ratio >= contract.minKeywordMatchRatio; + + details.push( + `[keywords] ${passed ? 'PASS' : 'FAIL'}: ` + + `${matchCount}/${contract.requiredIntentKeywords.length} keywords matched ` + + `(${(ratio * 100).toFixed(0)}%, threshold: ${(contract.minKeywordMatchRatio * 100).toFixed(0)}%)` + ); + details.push(`[keywords] Matched: [${matchedKeywords.join(', ')}]`); + if (missedKeywords.length > 0 && missedKeywords.length <= 10) { + details.push(`[keywords] Missed: [${missedKeywords.join(', ')}]`); + } + + return { + score: Math.min(ratio / contract.minKeywordMatchRatio, 1.0), + details, + }; + } + + private static checkForbiddenPatterns( + output: unknown, + contract: ContractDefinition + ): { score: number; details: string[] } { + const details: string[] = []; + const outputStr = JSON.stringify(output); + + let violations = 0; + + for (const pattern of contract.forbiddenPatterns) { + if (pattern.test(outputStr)) { + violations++; + details.push(`[forbidden] FAIL: pattern ${pattern} matched in output`); + } + } + + if (violations === 0) { + details.push(`[forbidden] PASS: no forbidden patterns detected`); + } + + return { score: violations > 0 ? 0 : 1.0, details }; + } + + /** + * Substring search, or word-level overlap when `fuzzy` is true (still heuristic, not NLP). 
+ */ + static containsIntent( + output: unknown, + target: string, + fuzzy: boolean = false, + fuzzyThreshold: number = 0.5 + ): MatchResult { + const outputStr = JSON.stringify(output).toLowerCase(); + const targetLower = target.toLowerCase(); + + if (!fuzzy) { + const found = outputStr.includes(targetLower); + return { + matched: found, + confidence: found ? 1.0 : 0.0, + details: [ + found ? `Found "${target}" in output` : `"${target}" not found in output`, + ], + }; + } + + const words = targetLower.split(/\s+/).filter(w => w.length > 2); + let matchCount = 0; + for (const word of words) { + if (outputStr.includes(word)) matchCount++; + } + + const ratio = words.length > 0 ? matchCount / words.length : 0; + return { + matched: ratio >= fuzzyThreshold, + confidence: ratio, + details: [ + `Fuzzy word overlap: ${matchCount}/${words.length} words found (${(ratio * 100).toFixed(0)}%)`, + ], + }; + } +} diff --git a/framework/NonDeterministicMatcher.ts b/framework/NonDeterministicMatcher.ts deleted file mode 100644 index a7438fd..0000000 --- a/framework/NonDeterministicMatcher.ts +++ /dev/null @@ -1,326 +0,0 @@ -/** - * framework/NonDeterministicMatcher.ts - * - * THE KEY INNOVATION — assertion logic for non-deterministic outputs - * - * WHY THIS EXISTS: - * ──────────────── - * Traditional test matchers are binary: the output either equals the - * expected value or it doesn't. This is useless for LLM outputs because: - * - * - Same prompt → different wording every time - * - Same intent → different structure every time - * - Same facts → different ordering every time - * - * The NonDeterministicMatcher evaluates outputs against INTENT CONTRACTS - * rather than exact values. It returns a CONFIDENCE SCORE (0-1) instead - * of a binary pass/fail. - * - * HOW IT WORKS (three evaluation layers): - * ───────────────────────────────────────── - * - * LAYER 1: STRUCTURAL VALIDATION - * Does the output have the required fields? Is it valid JSON? 
- * Is it within length limits? This is deterministic — it either - * passes or doesn't. - * - * LAYER 2: SEMANTIC KEYWORD MATCHING - * Does the output text contain enough intent-related keywords? - * This is fuzzy — we count how many keywords appear and compare - * against a threshold. The keyword list is intentionally broad - * to accommodate phrasing variation. - * - * LAYER 3: FORBIDDEN PATTERN DETECTION - * Does the output contain anything it shouldn't? Hallucination - * markers, refusal language, fabricated data indicators. - * Any match here FAILS the output regardless of other scores. - * - * OPTIONAL LAYER 4: CUSTOM VALIDATION - * Contract-specific logic that can't be expressed as keywords - * or patterns. Example: "toolsUsed must have 2+ entries." - * - * THE SCORING MODEL: - * ────────────────── - * Each layer produces a score from 0 to 1. - * The final confidence is the WEIGHTED AVERAGE: - * - Structural: 40% weight (must have right shape) - * - Semantic: 35% weight (must express right intent) - * - Forbidden: 25% weight (must not contain bad patterns) - * - * If a custom validator exists, it replaces 10% of the semantic weight. - * - * WHY THESE WEIGHTS: - * Structure matters most because a broken JSON or missing field is - * unambiguously wrong. Semantics is next because keyword absence - * might just mean different phrasing. Forbidden patterns are weighted - * lowest because a single accidental match shouldn't tank the score — - * but they DO cause a hard failure if matched. - * - * HOW TO TUNE: - * If your tests are too flaky (passing sometimes, failing sometimes), - * you have two knobs: - * 1. Lower the minKeywordMatchRatio in the contract - * 2. Add more keywords to the contract (broader coverage) - * 3. Lower the confidence threshold in your test assertion - * - * If your tests are too permissive (passing when they shouldn't), - * do the opposite. Add more forbidden patterns. Raise thresholds. 
- */ - -import { ContractDefinition, MatchResult, ValidationResult } from '../agent/types.js'; - -export class NonDeterministicMatcher { - - /** - * Evaluate an output against a behavior contract. - * This is the main entry point. Every assertion in AgentAssert - * calls this method. - * - * @param output - The agent's output (AgentOutput object) - * @param contract - The behavior contract to evaluate against - * @returns MatchResult with confidence score and details - * - * EXAMPLE USAGE: - * const result = NonDeterministicMatcher.evaluate(trace.output, BehaviorContract.SUMMARIZATION); - * expect(result.confidence).toBeGreaterThan(0.7); - */ - static evaluate(output: unknown, contract: ContractDefinition): MatchResult { - const details: string[] = []; - const scores: { weight: number; score: number; label: string }[] = []; - - // ── LAYER 1: STRUCTURAL VALIDATION ────────────────── - const structuralResult = this.checkStructure(output, contract); - scores.push({ weight: 0.40, score: structuralResult.score, label: 'structural' }); - details.push(...structuralResult.details); - - // ── LAYER 2: SEMANTIC KEYWORD MATCHING ────────────── - const semanticResult = this.checkSemantics(output, contract); - const semanticWeight = contract.customValidator ? 
0.25 : 0.35; - scores.push({ weight: semanticWeight, score: semanticResult.score, label: 'semantic' }); - details.push(...semanticResult.details); - - // ── LAYER 3: FORBIDDEN PATTERN DETECTION ──────────── - const forbiddenResult = this.checkForbiddenPatterns(output, contract); - scores.push({ weight: 0.25, score: forbiddenResult.score, label: 'forbidden' }); - details.push(...forbiddenResult.details); - - // ── OPTIONAL LAYER 4: CUSTOM VALIDATION ───────────── - if (contract.customValidator) { - const customResult = contract.customValidator(output); - scores.push({ weight: 0.10, score: customResult.score, label: 'custom' }); - details.push(`[custom] ${customResult.reason}`); - } - - // ── COMPUTE WEIGHTED CONFIDENCE ───────────────────── - const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0); - const confidence = scores.reduce((sum, s) => sum + (s.weight * s.score), 0) / totalWeight; - - // Hard failure: if any forbidden pattern was found, override to fail - const hasForbiddenViolation = forbiddenResult.score < 1.0; - - return { - matched: !hasForbiddenViolation && confidence >= 0.5, - confidence: hasForbiddenViolation ? Math.min(confidence, 0.3) : confidence, - details, - }; - } - - /** - * LAYER 1: Check if output has the required structure. - * - * This is the simplest check — does the output object have - * the fields the contract requires? - * - * WHY 40% WEIGHT: - * If the output doesn't even have the right fields, nothing - * else matters. A summarization that's missing the 'summary' - * field is fundamentally broken regardless of what the text says. 
- */ - private static checkStructure( - output: unknown, - contract: ContractDefinition - ): { score: number; details: string[] } { - const details: string[] = []; - - if (typeof output !== 'object' || output === null) { - details.push('[structural] FAIL: output is not an object'); - return { score: 0, details }; - } - - const obj = output as Record; - let presentCount = 0; - - for (const field of contract.requiredFields) { - if (field in obj && obj[field] !== undefined && obj[field] !== null) { - presentCount++; - details.push(`[structural] PASS: field "${field}" present`); - } else { - details.push(`[structural] FAIL: field "${field}" missing`); - } - } - - // Check length constraint if specified - if (contract.maxLengthChars) { - const outputStr = JSON.stringify(output); - if (outputStr.length > contract.maxLengthChars) { - details.push( - `[structural] FAIL: output length ${outputStr.length} exceeds max ${contract.maxLengthChars}` - ); - // Don't count this as a field failure — it's a soft constraint - } else { - details.push(`[structural] PASS: output length ${outputStr.length} within limit`); - } - } - - const score = contract.requiredFields.length > 0 - ? presentCount / contract.requiredFields.length - : 1.0; - - return { score, details }; - } - - /** - * LAYER 2: Check if output text contains enough intent keywords. - * - * HOW THIS WORKS: - * 1. Serialize the entire output to a string (JSON.stringify) - * 2. Check each keyword in the contract's requiredIntentKeywords - * 3. Count how many keywords appear (case-insensitive) - * 4. Compare the match ratio against minKeywordMatchRatio - * - * WHY STRINGIFY THE WHOLE OBJECT: - * Keywords might appear in any field — the summary, the result, - * the taskType. By stringifying everything, we search everywhere - * at once. This is intentionally permissive. - * - * WHY CASE-INSENSITIVE: - * "Summary" and "summary" and "SUMMARY" are the same intent. - * Don't fail a test because the LLM capitalized differently. 
- */ - private static checkSemantics( - output: unknown, - contract: ContractDefinition - ): { score: number; details: string[] } { - const details: string[] = []; - const outputStr = JSON.stringify(output).toLowerCase(); - - let matchCount = 0; - const matchedKeywords: string[] = []; - const missedKeywords: string[] = []; - - for (const keyword of contract.requiredIntentKeywords) { - if (outputStr.includes(keyword.toLowerCase())) { - matchCount++; - matchedKeywords.push(keyword); - } else { - missedKeywords.push(keyword); - } - } - - const ratio = contract.requiredIntentKeywords.length > 0 - ? matchCount / contract.requiredIntentKeywords.length - : 1.0; - - const passed = ratio >= contract.minKeywordMatchRatio; - - details.push( - `[semantic] ${passed ? 'PASS' : 'FAIL'}: ` + - `${matchCount}/${contract.requiredIntentKeywords.length} keywords matched ` + - `(${(ratio * 100).toFixed(0)}%, threshold: ${(contract.minKeywordMatchRatio * 100).toFixed(0)}%)` - ); - details.push(`[semantic] Matched: [${matchedKeywords.join(', ')}]`); - if (missedKeywords.length > 0 && missedKeywords.length <= 10) { - details.push(`[semantic] Missed: [${missedKeywords.join(', ')}]`); - } - - // Score is the ratio itself, clamped to 0-1 - return { score: Math.min(ratio / contract.minKeywordMatchRatio, 1.0), details }; - } - - /** - * LAYER 3: Check that no forbidden patterns appear in the output. - * - * CRITICAL BEHAVIOR: This is the STRICTEST check. - * If ANY forbidden pattern matches, the score drops to 0. - * This is intentional — forbidden patterns represent hard failures: - * - "I cannot" → the agent refused the task - * - "I fabricated" → the agent hallucinated and admitted it - * - "I don't have access" → the agent couldn't use the tools - * - * These are not "slightly wrong" — they're categorically wrong. - * No amount of correct keywords should compensate for a refusal. 
- */ - private static checkForbiddenPatterns( - output: unknown, - contract: ContractDefinition - ): { score: number; details: string[] } { - const details: string[] = []; - const outputStr = JSON.stringify(output); - - let violations = 0; - - for (const pattern of contract.forbiddenPatterns) { - if (pattern.test(outputStr)) { - violations++; - details.push(`[forbidden] FAIL: pattern ${pattern} matched in output`); - } - } - - if (violations === 0) { - details.push(`[forbidden] PASS: no forbidden patterns detected`); - } - - // Binary: any violation → score 0 - return { score: violations > 0 ? 0 : 1.0, details }; - } - - /** - * UTILITY: Check if a specific string appears in the output, - * with fuzzy matching support. - * - * USE THIS WHEN: - * You need a one-off check that doesn't fit into a full contract. - * Example: "Does the output mention the file name 'test-results.log'?" - * - * @param output - The output to check (any type, gets stringified) - * @param target - The string to look for - * @param fuzzy - If true, splits target into words and checks each independently. - * "test results log" matches if any two of those words appear. - * @param fuzzyThreshold - What fraction of target words must appear (0-1) - */ - static containsIntent( - output: unknown, - target: string, - fuzzy: boolean = false, - fuzzyThreshold: number = 0.5 - ): MatchResult { - const outputStr = JSON.stringify(output).toLowerCase(); - const targetLower = target.toLowerCase(); - - if (!fuzzy) { - const found = outputStr.includes(targetLower); - return { - matched: found, - confidence: found ? 1.0 : 0.0, - details: [found - ? `Found "${target}" in output` - : `"${target}" not found in output` - ], - }; - } - - // Fuzzy: check individual words - const words = targetLower.split(/\s+/).filter(w => w.length > 2); - let matchCount = 0; - for (const word of words) { - if (outputStr.includes(word)) matchCount++; - } - - const ratio = words.length > 0 ? 
matchCount / words.length : 0; - return { - matched: ratio >= fuzzyThreshold, - confidence: ratio, - details: [`Fuzzy match: ${matchCount}/${words.length} words found (${(ratio * 100).toFixed(0)}%)`], - }; - } -} diff --git a/framework/index.ts b/framework/index.ts new file mode 100644 index 0000000..25f5cab --- /dev/null +++ b/framework/index.ts @@ -0,0 +1,18 @@ +/** + * Reusable assertion helpers and types (this POC’s “library” surface). + */ + +export type { + AgentOutput, + AgentTrace, + ContractDefinition, + MatchResult, + ToolDefinition, + ToolResult, + TraceStep, + ValidationResult, +} from './types.js'; + +export { AgentAssert } from './AgentAssert.js'; +export { BehaviorContract } from './BehaviorContract.js'; +export { HeuristicContractMatcher } from './HeuristicContractMatcher.js'; diff --git a/framework/types.ts b/framework/types.ts new file mode 100644 index 0000000..7f75d94 --- /dev/null +++ b/framework/types.ts @@ -0,0 +1,91 @@ +/** + * framework/types.ts + * + * Public types for the assertion helpers (`AgentAssert`, contracts, traces). + * The demo agent under `examples/agent/` imports these same types so traces + * and contracts stay aligned — this file is not part of the example SUT. + * + * `AgentTrace` is what assertions operate on: tool decisions and structured output. + */ + +// ───────────────────────────────────────────── +// TOOL DEFINITIONS (used by demo agent + registry) +// ───────────────────────────────────────────── + +/** + * Describes a tool the agent can call. Maps directly to + * Anthropic's tool_use schema (and by extension, MCP protocol). 
+ */ +export interface ToolDefinition { + name: string; + description: string; + inputSchema: Record; + execute: (input: Record) => Promise; +} + +export interface ToolResult { + success: boolean; + data: unknown; + error?: string; +} + +// ───────────────────────────────────────────── +// AGENT TRACE +// ───────────────────────────────────────────── + +export interface TraceStep { + type: 'tool_call' | 'tool_result' | 'reasoning' | 'output'; + toolName?: string; + toolInput?: Record; + toolOutput?: unknown; + content?: string; + timestamp: number; +} + +export interface AgentTrace { + input: string; + steps: TraceStep[]; + output: AgentOutput; + metadata: { + model: string; + provider?: 'anthropic' | 'openai' | 'ollama'; + durationMs: number; + toolCallCount: number; + retryCount: number; + }; +} + +export interface AgentOutput { + taskType: string; + result: unknown; + toolsUsed: string[]; + confidence: number; + summary: string; +} + +// ───────────────────────────────────────────── +// BEHAVIOR CONTRACTS +// ───────────────────────────────────────────── + +export interface ContractDefinition { + name: string; + description: string; + requiredFields: string[]; + requiredIntentKeywords: string[]; + minKeywordMatchRatio: number; + forbiddenPatterns: RegExp[]; + maxLengthChars?: number; + customValidator?: (output: unknown) => ValidationResult; +} + +export interface ValidationResult { + passed: boolean; + score: number; + reason: string; +} + +export interface MatchResult { + matched: boolean; + confidence: number; + details: string[]; +} diff --git a/index.ts b/index.ts new file mode 100644 index 0000000..5660754 --- /dev/null +++ b/index.ts @@ -0,0 +1,12 @@ +/** + * Public entry — re-exports the reusable assertion layer from `framework/`. + * + * This repo is a **proof-of-concept** (`"private": true`); it is not published + * to npm. The demo LLM agent lives under `examples/agent/`, not here. 
+ * + * In a TypeScript project that vendors this repo, use: + * import { AgentAssert, BehaviorContract } from 'agent-assert'; + * (with `package.json` dependencies pointing at this path or Git URL.) + */ + +export * from './framework/index.js'; diff --git a/package.json b/package.json index 3946790..2e041b4 100644 --- a/package.json +++ b/package.json @@ -1,12 +1,17 @@ { "name": "agent-assert", "version": "0.1.0", - "description": "A Playwright-based testing framework for agentic AI systems using MCP tool orchestration", + "private": true, + "description": "POC: Playwright tests + heuristic contracts for tool-calling LLM agents. Demo agent in examples/agent/. Not published to npm.", "author": "Biresh Patel", "license": "MIT", "type": "module", + "exports": { + ".": "./index.ts" + }, "scripts": { "test": "npx playwright test", + "test:smoke": "npx playwright test tests/behavioral/intent-routing.spec.ts -g \"routes file-read intent to file-reader tool\"", "test:behavioral": "npx playwright test tests/behavioral/", "test:boundary": "npx playwright test tests/boundary/", "test:ollama": "LLM_PROVIDER=ollama npx playwright test", diff --git a/playwright.config.ts b/playwright.config.ts index 12043c8..d88d78c 100644 --- a/playwright.config.ts +++ b/playwright.config.ts @@ -31,6 +31,12 @@ import { applyLlmVarsFromDotEnv } from './tests/env-llm.js'; /** Loads LLM + API keys from `.env` into `process.env` (see tests/env-llm.ts). */ applyLlmVarsFromDotEnv(); +/** GitHub Actions and similar CI set `CI=true`. Local Ollama on CPU there often exceeds 120s per test. */ +const isCi = process.env.CI === 'true'; + +/** Per-test cap for LLM + tool runs (`behavioral` project). CI gets a longer budget. */ +const behavioralTimeoutMs = isCi ? 300_000 : 120_000; + /** Label for the HTML report header: provider + resolved model (aligned with agent defaults). 
*/ function htmlReportTitle(): string { const p = process.env.LLM_PROVIDER?.toLowerCase(); @@ -40,7 +46,7 @@ function htmlReportTitle(): string { provider === 'openai' ? 'gpt-4o' : provider === 'ollama' - ? 'llama3:latest' + ? 'llama3.2:3b' : 'claude-sonnet-4-20250514'; const model = process.env.LLM_MODEL?.trim() || defaultModel; return `Playwright report · LLM: ${provider} · ${model}`; @@ -53,7 +59,7 @@ export default defineConfig({ // Match files ending in .spec.ts testMatch: '**/*.spec.ts', - // Default cap; behavioral project overrides to 60s (multi-step LLM runs often need it). + // Default cap; behavioral project overrides for LLM + tool runs (see projects below). timeout: 45_000, // NON-DETERMINISM STRATEGY: Retry each failed test once. @@ -93,8 +99,8 @@ export default defineConfig({ { name: 'behavioral', testDir: './tests/behavioral', - // Multi-tool / multi-round agent runs often exceed 30s (local LLMs, variance). - timeout: 60_000, + // Local: 2 min is usually enough. CI (Ollama on shared CPU): often 2–4+ min for first inference. + timeout: behavioralTimeoutMs, }, { name: 'boundary', diff --git a/tests/behavioral/output-contract.spec.ts b/tests/behavioral/output-contract.spec.ts index 3c0bd7c..92b6ab3 100644 --- a/tests/behavioral/output-contract.spec.ts +++ b/tests/behavioral/output-contract.spec.ts @@ -3,10 +3,10 @@ * * PATTERN 2: BEHAVIOR CONTRACT VALIDATION * ───────────────────────────────────────── - * Tests whether the agent's output satisfies SEMANTIC CONTRACTS - * rather than exact string matches. - * - * THIS IS THE CORE INNOVATION OF THE FRAMEWORK. + * Tests whether the agent's output satisfies BEHAVIOR CONTRACTS (heuristic: + * fields, keywords, forbidden patterns) rather than exact string matches. + * + * Core idea of this pattern in the framework: * * THE PROBLEM WITH expect(output).toBe("..."): * Run 1: "The file contains 2 test failures: payment timeout and card error" @@ -20,7 +20,7 @@ * 1. 
The output has the right structure (required fields) * 2. The output expresses the right intent (keyword matching) * 3. The output doesn't contain red flags (forbidden patterns) - * 4. The confidence score from NonDeterministicMatcher is above threshold + * 4. The heuristic confidence score from HeuristicContractMatcher is above threshold * * TUNING GUIDANCE: * If tests are too flaky → lower minKeywordMatchRatio or add more keywords @@ -30,7 +30,7 @@ import { test, expect } from '@playwright/test'; import { AgentAssert } from '../../framework/AgentAssert.js'; import { BehaviorContract } from '../../framework/BehaviorContract.js'; -import { NonDeterministicMatcher } from '../../framework/NonDeterministicMatcher.js'; +import { HeuristicContractMatcher } from '../../framework/HeuristicContractMatcher.js'; import { createTestAgent, registerAgentTraceForDiagnostics, @@ -130,7 +130,7 @@ test.describe('Behavior Contract Validation', () => { /** * TEST 2D: Output contains intent-specific content (fuzzy match) * - * Uses NonDeterministicMatcher.containsIntent directly for a + * Uses HeuristicContractMatcher.containsIntent directly for a * targeted check: does the output mention the payment failure * from the fixture file? * @@ -144,7 +144,7 @@ test.describe('Behavior Contract Validation', () => { registerAgentTraceForDiagnostics(testInfo, trace); // Check that the output mentions the payment failure - const result = NonDeterministicMatcher.containsIntent( + const result = HeuristicContractMatcher.containsIntent( trace.output, 'payment gateway timeout failure', true, // fuzzy matching enabled @@ -163,7 +163,7 @@ test.describe('Behavior Contract Validation', () => { * TEST 2E: Output has valid AgentOutput structure * * Structural check — the output must be a valid AgentOutput object - * with all required fields. This doesn't check semantics, just shape. + * with all required fields. This doesn't run contract/heuristic checks — only shape. 
* * If this fails, the agent's system prompt isn't working — * Claude isn't producing structured JSON output. diff --git a/tests/boundary/retry-behavior.spec.ts b/tests/boundary/retry-behavior.spec.ts index a08cad9..f08d8da 100644 --- a/tests/boundary/retry-behavior.spec.ts +++ b/tests/boundary/retry-behavior.spec.ts @@ -37,7 +37,6 @@ import { test, expect } from '@playwright/test'; import { AgentAssert } from '../../framework/AgentAssert.js'; import { BehaviorContract } from '../../framework/BehaviorContract.js'; -import { NonDeterministicMatcher } from '../../framework/NonDeterministicMatcher.js'; import { createFailingApiAgent, createFailingFileReaderAgent, @@ -46,7 +45,7 @@ import { teardownFixtureFiles, } from '../fixtures/setup.js'; import { FAILURE_PROMPTS } from '../fixtures/prompts.js'; -import type { AgentTrace } from '../../agent/types.js'; +import type { AgentTrace } from '../../framework/types.js'; /** True if the trace records a failed file-reader tool result (upstream failure actually happened). */ function hasFileReaderToolFailure(trace: AgentTrace): boolean { diff --git a/tests/env-llm.ts b/tests/env-llm.ts index 3e5d545..60f2b0a 100644 --- a/tests/env-llm.ts +++ b/tests/env-llm.ts @@ -17,6 +17,9 @@ const DOTENV_KEYS = new Set([ 'LLM_MODEL', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY', + /** Optional: Ollama Cloud / some `*:cloud` model tags. */ + 'OLLAMA_API_KEY', + 'OLLAMA_BASE_URL', ]); export function applyLlmVarsFromDotEnv(): void { diff --git a/tests/fixtures/expected-schemas.ts b/tests/fixtures/expected-schemas.ts index 9fca1ff..b9b6e43 100644 --- a/tests/fixtures/expected-schemas.ts +++ b/tests/fixtures/expected-schemas.ts @@ -4,11 +4,11 @@ * JSON SCHEMA CONTRACTS * * These schemas define the STRUCTURE of valid agent outputs. - * They complement the BehaviorContracts (which check semantics). + * They complement the BehaviorContracts (which apply keyword/heuristic checks on content). 
* * SCHEMA vs CONTRACT: * - Schema: "Does the output have the right fields and types?" - * - Contract: "Does the output express the right intent?" + * - Contract: "Does the output pass keyword / pattern heuristics for intent?" * * Both are needed. An output can have perfect structure but * wrong content (schema passes, contract fails). Or it can diff --git a/tests/fixtures/setup.ts b/tests/fixtures/setup.ts index a52f443..dfc9972 100644 --- a/tests/fixtures/setup.ts +++ b/tests/fixtures/setup.ts @@ -30,11 +30,11 @@ import * as fs from 'fs/promises'; import * as path from 'path'; import { test, type TestInfo } from '@playwright/test'; import { applyLlmVarsFromDotEnv } from '../env-llm.js'; -import type { AgentTrace } from '../../agent/types.js'; -import { Agent, type AgentConfig } from '../../agent/agent.js'; -import { ToolRegistry } from '../../agent/tools/registry.js'; -import { createFileReaderTool } from '../../agent/tools/file-reader.js'; -import { createApiCallerTool, MockResponse } from '../../agent/tools/api-caller.js'; +import type { AgentTrace } from '../../framework/types.js'; +import { Agent, type AgentConfig } from '../../examples/agent/agent.js'; +import { ToolRegistry } from '../../examples/agent/tools/registry.js'; +import { createFileReaderTool } from '../../examples/agent/tools/file-reader.js'; +import { createApiCallerTool, MockResponse } from '../../examples/agent/tools/api-caller.js'; applyLlmVarsFromDotEnv(); @@ -218,7 +218,7 @@ test.afterEach(async ({}, testInfo) => { /** * Optional env-driven overrides for which LLM to use in tests. * - `LLM_PROVIDER=openai` + `OPENAI_API_KEY` — OpenAI cloud (default model `gpt-4o` unless `LLM_MODEL`). - * - `LLM_PROVIDER=ollama` — local Ollama (OpenAI-compatible API); default model `llama3:latest` unless `LLM_MODEL`. + * - `LLM_PROVIDER=ollama` — local Ollama (OpenAI-compatible API); default model `llama3.2:3b` unless `LLM_MODEL`. * - `LLM_PROVIDER=openai` + `OPENAI_BASE_URL` (e.g. 
`http://127.0.0.1:11434/v1`) — same as Ollama without renaming provider. * - Default when unset: Anthropic with `claude-sonnet-4-20250514` unless `LLM_MODEL` overrides. */ diff --git a/tsconfig.json b/tsconfig.json index 527b7ce..6d5607a 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -7,6 +7,7 @@ "strict": true, "outDir": "./dist", "rootDir": ".", + "baseUrl": ".", "declaration": true, "sourceMap": true, "resolveJsonModule": true, @@ -14,9 +15,10 @@ "forceConsistentCasingInFileNames": true, "paths": { "@framework/*": ["./framework/*"], - "@agent/*": ["./agent/*"] + "@agent/*": ["./examples/agent/*"], + "agent-assert": ["./index.ts"] } }, - "include": ["agent/**/*.ts", "framework/**/*.ts", "tests/**/*.ts"], + "include": ["index.ts", "framework/**/*.ts", "examples/**/*.ts", "tests/**/*.ts"], "exclude": ["node_modules", "dist"] }