diff --git a/.env.example b/.env.example
index d96227c..8ef9281 100644
--- a/.env.example
+++ b/.env.example
@@ -17,3 +17,5 @@ OPENAI_API_KEY=sk-your-openai-key-here
 # OLLAMA_BASE_URL=http://127.0.0.1:11434/v1
 # OPENAI_BASE_URL=http://127.0.0.1:11434/v1
 # OPENAI_API_KEY is optional for local; defaults to a placeholder if OPENAI_BASE_URL/OLLAMA_BASE_URL is set (ollama provider only).
+# Optional — only if you use Ollama Cloud model tags (e.g. *:cloud); local models need LLM_PROVIDER + LLM_MODEL only.
+# OLLAMA_API_KEY=
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bacfc98..5a6edc1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,11 +1,17 @@
-# CI: TypeScript typecheck + Playwright agent tests against OpenAI (API key from repo secrets).
+# CI: typecheck + smoke test via local Ollama.
 #
-# Required secret (Settings → Secrets and variables → Actions):
-#   OPENAI_API_KEY   — your OpenAI API key
+# Configure without editing this file (repo → Settings → Secrets and variables → Actions):
+#   • Variables: LLM_PROVIDER, LLM_MODEL, optional OLLAMA_BASE_URL
+#   • Secret OLLAMA_API_KEY — only if you set LLM_MODEL to an Ollama *Cloud* tag (*:cloud).
 #
-# Optional: set repository variable LLM_MODEL (e.g. gpt-4o-mini) or edit the env block below.
+# Defaults if unset (all local, no API key): ollama + llama3.2:3b + http://127.0.0.1:11434/v1
+# llama3.2:3b is a good CI default: small/fast on CPU, much more reliable tool-calling than 1b.
 #
-# Fork PRs: the test job is skipped (secrets are not available to workflows from forks).
+# Fork PRs: the test job is skipped (secrets/vars from the base repo are not available to workflows from forks).
+#
+# Manual run (Actions → CI → Run workflow):
+#   • GitHub shows "Use workflow from" — pick the branch there (that version of the workflow runs).
+#   • Optional: set "Checkout ref" below only if you need a different ref than the branch picker.
 
 name: CI
 
@@ -15,6 +21,12 @@ on:
   pull_request:
     branches: [main, master]
   workflow_dispatch:
+    inputs:
+      checkout_ref:
+        description: 'Optional — branch name or refs/heads/... to checkout. Leave empty to use the branch selected in "Use workflow from" above.'
+        required: false
+        default: ''
+        type: string
 
 concurrency:
   group: ci-${{ github.workflow }}-${{ github.ref }}
@@ -26,6 +38,8 @@ jobs:
     timeout-minutes: 10
     steps:
       - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.checkout_ref != '' && github.event.inputs.checkout_ref || github.ref }}
 
       - uses: actions/setup-node@v4
         with:
@@ -46,13 +60,17 @@ jobs:
       (github.event_name == 'pull_request' &&
         github.event.pull_request.head.repo.full_name == github.repository)
     runs-on: ubuntu-latest
-    timeout-minutes: 60
+    timeout-minutes: 45
     env:
-      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-      LLM_PROVIDER: openai
-      LLM_MODEL: gpt-4o-mini
+      # secrets.* overrides vars.* (both optional); fallbacks keep CI working if nothing is configured.
+      LLM_PROVIDER: ${{ secrets.LLM_PROVIDER || vars.LLM_PROVIDER || 'ollama' }}
+      LLM_MODEL: ${{ secrets.LLM_MODEL || vars.LLM_MODEL || 'llama3.2:3b' }}
+      OLLAMA_BASE_URL: ${{ secrets.OLLAMA_BASE_URL || vars.OLLAMA_BASE_URL || 'http://127.0.0.1:11434/v1' }}
+      OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.checkout_ref != '' && github.event.inputs.checkout_ref || github.ref }}
 
       - uses: actions/setup-node@v4
         with:
@@ -65,8 +83,47 @@ jobs:
       - name: Install Playwright browsers
         run: npx playwright install --with-deps
 
-      - name: Run Playwright tests
-        run: npm test
+      - name: Ollama Cloud models need OLLAMA_API_KEY
+        run: |
+          case "$LLM_MODEL" in
+            *:cloud*)
+              if [ -z "${OLLAMA_API_KEY}" ]; then
+                echo "::error title=Missing OLLAMA_API_KEY::Models tagged *:cloud use Ollama Cloud. Add repository secret OLLAMA_API_KEY (https://ollama.com/settings/keys). Or use a local tag (e.g. llama3.2:3b) in Variables."
+                exit 1
+              fi
+              echo "OLLAMA_API_KEY is set for Cloud model."
+              ;;
+            *)
+              echo "LLM_MODEL=$LLM_MODEL (local tag — OLLAMA_API_KEY optional)"
+              ;;
+          esac
+
+      - name: Install Ollama
+        run: curl -fsSL https://ollama.com/install.sh | sh
+
+      - name: Start Ollama and wait for API
+        run: |
+          set -e
+          sudo systemctl stop ollama 2>/dev/null || true
+          nohup ollama serve > /tmp/ollama-serve.log 2>&1 &
+          echo "Waiting for http://127.0.0.1:11434 ..."
+          for i in $(seq 1 90); do
+            if curl -fsS http://127.0.0.1:11434/api/tags >/dev/null 2>&1; then
+              echo "Ollama is ready (after ${i}s)"
+              exit 0
+            fi
+            sleep 1
+          done
+          echo "--- ollama serve log ---"
+          cat /tmp/ollama-serve.log || true
+          exit 1
+
+      - name: Pull Ollama model
+        timeout-minutes: 30
+        run: ollama pull "$LLM_MODEL"
+
+      - name: Smoke test (single LLM case)
+        run: npm run test:smoke
 
       - name: Upload Playwright report (on failure)
         if: failure()
diff --git a/README.md b/README.md
index 9a126e6..e984bb7 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,27 @@
 # AgentAssert
 
-**A Playwright-based testing framework for agentic AI systems with MCP-compatible tool schemas and in-process tool orchestration.**
+**Proof-of-concept:** Playwright tests + reusable assertion helpers for tool-calling LLM agents (MCP-shaped tool schemas, in-process tools in the demo).
+
+This repository is **`"private": true`** in `package.json` — it is **not** published to npm and is **not** a productized “framework.” It is a **reference implementation** you can clone, read, and copy from. A clean entry point exists for imports (`index.ts` → `framework/`), but publishing a real package would add build steps, versioning, and semver guarantees — out of scope for this POC.
+
+---
+
+## Repository layout
+
+| Path | Role |
+|------|------|
+| **`framework/`** | Reusable assertions (`AgentAssert`, `HeuristicContractMatcher`, `BehaviorContract`) and **shared types** (`framework/types.ts` — traces, contracts, tool shapes). |
+| **`index.ts`** | Barrel export so you can `import { AgentAssert, … } from 'agent-assert'` when vendoring this repo. |
+| **`examples/agent/`** | **Demo system under test:** LLM loop, `ToolRegistry`, file-reader / api-caller tools. Replace with your own agent; keep compatible `AgentTrace` / `AgentOutput` shapes if you reuse the assertions. |
+| **`tests/`** | Playwright specs and fixtures wired to the demo agent via `tests/fixtures/setup.ts`. |
 
 ---
 
 ## About
 
-A working proof-of-concept that demonstrates five testing patterns for AI agents that call tools through a **`ToolRegistry`** (the same tool definitions map cleanly to Anthropic/OpenAI tool formats and to MCP-style schemas). The repo does not run a live MCP server by default; tools execute in-process so tests stay fast and deterministic. The framework introduces `NonDeterministicMatcher` — an assertion utility that evaluates LLM outputs against semantic intent contracts instead of exact string matches.
+A working proof-of-concept that demonstrates five testing patterns for AI agents that call tools through a **`ToolRegistry`** (the same tool definitions map cleanly to Anthropic/OpenAI tool formats and to MCP-style schemas). The repo does not run a live MCP server by default; tools execute in-process so tests stay fast and deterministic.
+
+**Behavior contracts** are checked by **`HeuristicContractMatcher`**: required fields, **case-insensitive keyword overlap**, **regex** forbidden phrases, and optional custom validators — *not* embedding similarity or LLM-as-judge semantics. The weighted **`confidence`** score is a tuning signal, not a calibrated measure of meaning. That avoids pretending the cheap matcher is “semantic” while still letting you reject exact-string tests for variable LLM wording. See **Heuristic matching: scope and limits** below.
 
 The deliberate use of Playwright (not Jest, not Vitest) as the test runner is itself a publishable insight.
 
@@ -17,7 +32,7 @@ The deliberate use of Playwright (not Jest, not Vitest) as the test runner is it
 ```
 ┌─────────────────────────────────────────────────────────┐
 │                    YOUR TEST FILE                       │
-│  import { AgentAssert } from '../../framework/AgentAssert.js' │
+│  import { AgentAssert } from 'agent-assert'  // or ../../framework/AgentAssert.js │
 │  const trace = await agent.run("some prompt")           │
 │  AgentAssert.toolWasInvoked(trace, 'file-reader')       │
 │  AgentAssert.satisfiesContract(trace.output, CONTRACT)    │
@@ -36,11 +51,11 @@ The deliberate use of Playwright (not Jest, not Vitest) as the test runner is it
       │ 5. Capture TRACE    │    └───────────┬─────────────┘
       └────────┬────────────┘                │
                │                  ┌──────────▼──────────────┐
-    ┌──────────▼───────────┐      │ NonDeterministicMatcher │
-    │   ToolRegistry       │      │                         │
-    │                      │      │ Layer 1: Structure      │
-    │ file-reader → exec() │      │ Layer 2: Semantics      │
-    │ api-caller  → exec() │      │ Layer 3: Forbidden      │
+    ┌──────────▼───────────┐      │ HeuristicContractMatcher │
+    │   ToolRegistry       │      │                          │
+    │                      │      │ Layer 1: Structure       │
+    │ file-reader → exec() │      │ Layer 2: Keywords (BoW)   │
+    │ api-caller  → exec() │      │ Layer 3: Forbidden (regex)│
     └──────────────────────┘      │ Layer 4: Custom         │
                                   │                         │
                                   │ Returns: MatchResult    │
@@ -58,8 +73,8 @@ The deliberate use of Playwright (not Jest, not Vitest) as the test runner is it
 6. **Loop continues** until the model produces a final text response
 7. **Agent builds `AgentTrace`** — captures EVERY step (tool calls, tool results, reasoning, output)
 8. **Test receives the trace** and passes it to AgentAssert methods
-9. **AgentAssert uses NonDeterministicMatcher** to evaluate output against BehaviorContracts
-10. **MatchResult returned** with confidence score and detailed breakdown
+9. **AgentAssert uses HeuristicContractMatcher** to score output against BehaviorContracts
+10. **MatchResult returned** with heuristic confidence and per-layer details
 
 ---
 
@@ -88,9 +103,9 @@ AgentAssert.expectMatched(result, 'file-reader should be invoked'); // embeds Ag
 ### Pattern 2: Behavior Contract Validation
 **File:** `tests/behavioral/output-contract.spec.ts`
 
-**What it tests:** Does the output satisfy a semantic contract (not exact string match)?
+**What it tests:** Does the output satisfy a **heuristic** contract (fields + keywords + patterns), not an exact string match?
 
-**Why it's unique:** `expect(output).toBe("...")` breaks on every LLM run. Contracts define rules that any correct output must satisfy, regardless of exact phrasing.
+**Why it's unique:** `expect(output).toBe("...")` breaks on every LLM run. Contracts define cheap rules that often track “good enough” outputs. Synonymous phrasing can still fail if keywords don’t align — widen keywords, lower thresholds, add a **customValidator**, or upgrade to embeddings / LLM-judge (see limits section below).
 
 **Key assertion:**
 ```typescript
@@ -100,9 +115,9 @@ AgentAssert.expectMatched(result, 'SUMMARIZATION contract should pass');
 
 **What to look at in the code:**
 - `BehaviorContract.ts` — pre-built contracts with required fields, keywords, forbidden patterns
-- `NonDeterministicMatcher.evaluate()` — the three-layer evaluation engine
-- `minKeywordMatchRatio` — controls how strict keyword matching is
-- `forbiddenPatterns` — hard-fail patterns that override the confidence score
+- `HeuristicContractMatcher.evaluate()` — structure + keyword overlap + forbidden regex (+ optional custom)
+- `minKeywordMatchRatio` — how much of the keyword list must appear as substrings
+- `forbiddenPatterns` — regex matches force a contract failure path
 
 ---
 
@@ -187,16 +202,14 @@ AgentAssert.expectMatched(
 
 ## Key Files Explained
 
-### agent/types.ts
-Every type definition. Read this first — everything else depends on these types.
+### framework/types.ts
+Shared types for traces, contracts, and (in the demo) tool definitions. Assertions and `examples/agent/` both import from here so the SUT and matchers stay aligned.
 
-- `AgentTrace` — the backbone. Every assertion operates on traces.
-- `TraceStep` — one decision the agent made (tool_call, tool_result, reasoning, output)
-- `ContractDefinition` — the rules that define "correct" for non-deterministic outputs
-- `MatchResult` — what assertions return (confidence score + details)
+- `AgentTrace` — what assertions operate on
+- `TraceStep`, `AgentOutput`, `ContractDefinition`, `MatchResult`, `ToolDefinition`, …
 
-### agent/agent.ts
-The System Under Test. The tool-calling loop is the core pattern:
+### examples/agent/agent.ts
+**Demo system under test** — not part of the reusable assertion layer. The tool-calling loop is the reference pattern:
 
 1. Send prompt + tool definitions to **Anthropic Messages API** or **OpenAI Chat Completions** (see `AgentConfig.provider`)
 2. The model responds with text and/or tool calls (`tool_use` vs `function` / `tool_calls` depending on provider)
@@ -209,27 +222,39 @@ The System Under Test. The tool-calling loop is the core pattern:
 
 **Important:** The system prompt in this file shapes agent behavior. If you change it, update the test contracts to match.
 
-### agent/tools/file-reader.ts and api-caller.ts
-Tools use MCP-aligned JSON schemas and register through **`ToolRegistry`**. In this POC they run locally (file-reader reads from disk, api-caller uses mock responses). To connect them to a real MCP server, replace the `execute` function with MCP transport calls — the schema stays the same.
+### examples/agent/tools/file-reader.ts and api-caller.ts
+Demo tools use MCP-aligned JSON schemas and register through **`ToolRegistry`**. In this POC they run locally (file-reader reads from disk, api-caller uses mock responses). To connect them to a real MCP server, replace the `execute` function with MCP transport calls — the schema stays the same.
 
 **Security note:** `file-reader.ts` includes path traversal protection. Read the comments.
 
-### agent/tools/registry.ts
+### examples/agent/tools/registry.ts
 Maps tool names to definitions. Provides `toAnthropicTools()` and `toOpenAITools()` so the same tool definitions work with either API. This is the bridge between your tool definitions and the LLM.
 
-### framework/NonDeterministicMatcher.ts
-**The core innovation.** Three evaluation layers:
+### framework/HeuristicContractMatcher.ts
+**Heuristic evaluation (not deep semantics).** Layers:
 
-1. **Structural** (40% weight) — are required fields present?
-2. **Semantic** (35% weight) — do enough intent keywords appear?
-3. **Forbidden** (25% weight) — do any red-flag patterns match?
+1. **Structural** (40% weight) — required fields (and optional length)
+2. **Keywords** (35% weight, or 25% + 10% custom when `customValidator` is set) — bag-of-words style: substring presence for each listed keyword
+3. **Forbidden** (25% weight) — regex patterns; any hit triggers the contract-failure path
+4. **Custom** (optional) — your own validator in the contract
 
-Forbidden patterns cause a hard failure regardless of other scores.
+The headline `confidence` is a **weighted average of those scores** — useful for ranking and thresholds, not as a semantic similarity score.
 
 **Tuning knobs:**
-- `minKeywordMatchRatio` in the contract — lower = more lenient
-- `confidence` threshold in the test — lower = fewer flaky tests
-- `forbiddenPatterns` — add patterns to catch more failure modes
+- `minKeywordMatchRatio` — lower = more lenient keyword layer
+- Assertion threshold on `result.confidence` — lower = fewer flaky tests
+- `forbiddenPatterns` — stricter guardrails (regex can be brittle; test them)
+- **Synonyms** — add alternate phrasings to `requiredIntentKeywords`, or use `customValidator` / external judges (below)
+
+### Heuristic matching: scope and limits
+
+| Approach | What this repo does | What would be “more semantic” |
+|----------|---------------------|--------------------------------|
+| Keyword list | Substring checks after lowercasing | LLM-as-judge, entailment models |
+| Confidence | Weighted heuristic blend | Calibrated metrics or judge scores |
+| Same meaning, different words | Can **fail** unless keywords or patterns cover both | Embeddings vs reference texts, synonym lists |
+
+**Possible upgrades (not implemented here):** call a second model to grade outputs against the contract; embed output and reference snippets and compare cosine similarity; use an NLP library for paraphrase / NLI. Those add latency, cost, and complexity — the heuristic matcher stays intentionally cheap and explicit.
 
 ### framework/BehaviorContract.ts
 Pre-built contracts for common task types. Each contract defines what "correct" means for that task type. The five contracts: SUMMARIZATION, API_ACTION, MULTI_STEP, SCOPE_BOUNDED, GRACEFUL_FAILURE.
@@ -252,14 +277,29 @@ Helpers:
 
 ### tests/env-llm.ts and `.env`
 
-Playwright loads **`tests/env-llm.ts`** from **`playwright.config.ts`** (`applyLlmVarsFromDotEnv()`). Selected keys from a project-root **`.env`** file are merged into `process.env` (`.env` wins over existing shell vars for those keys): `LLM_PROVIDER`, `LLM_MODEL`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`. Copy **`.env.example`** to **`.env`** and fill in keys so tests and IDE runs see the same configuration without exporting variables manually.
+Playwright loads **`tests/env-llm.ts`** from **`playwright.config.ts`** (`applyLlmVarsFromDotEnv()`). Selected keys from a project-root **`.env`** file are merged into `process.env` (`.env` wins over existing shell vars for those keys): `LLM_PROVIDER`, `LLM_MODEL`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `OLLAMA_API_KEY`, `OLLAMA_BASE_URL`. Copy **`.env.example`** to **`.env`** and fill in keys so tests and IDE runs see the same configuration without exporting variables manually.
+
+**GitHub Actions CI** (`.github/workflows/ci.yml`) installs Ollama, pulls the configured model, and runs the smoke test. **Provider, model, and optional base URL** are read from repository **Variables** or **Secrets** (no code change):
+
+| Name | Purpose |
+|------|--------|
+| `LLM_PROVIDER` | e.g. `ollama` |
+| `LLM_MODEL` | e.g. `llama3.2:3b` or `deepseek-v3.2:cloud` |
+| `OLLAMA_BASE_URL` | Optional; default `http://127.0.0.1:11434/v1` |
+| `OLLAMA_API_KEY` | **Repository secret** — required when `LLM_MODEL` is an Ollama **Cloud** tag (`*:cloud`). Without it, Ollama returns **HTTP 500**. [Create a key](https://ollama.com/settings/keys). |
+
+**Best practice:** put **non-sensitive** values under **Variables**. Use **Secrets** for **`OLLAMA_API_KEY`** and other keys. Storing `LLM_MODEL` as a *secret* works but **masks** it in logs—prefer **Variables** for model name unless needed. If neither Variable nor Secret is set for provider/model, CI defaults to **`ollama`** + **`llama3.2:3b`** (local, no key; stronger tool-calling than `1b`).
+
+**Run CI on a chosen branch:** **Actions** → **CI** → **Run workflow**. GitHub shows **Use workflow from** — that dropdown is the branch selector for manual runs (the workflow file and checkout use that branch unless you override). Optionally fill **Checkout ref** in the form to checkout a different branch or full ref (e.g. `refs/heads/feature/x`).
+
+**Local vs Cloud tags:** tags like `llama3.2:3b` run fully locally. Tags ending in **`:cloud`** need **`OLLAMA_API_KEY`** in Secrets. Override **`LLM_MODEL`** in Variables if you want another local model (e.g. newer builds when Ollama adds them).
 
 ---
 
 ### playwright.config.ts (high level)
 
 - **`retries: 1`** — each failed test runs one more time (LLM outputs vary)
-- **Timeouts:** default **45s**; **`behavioral`** project **60s** (multi-step runs); **`boundary`** **45s**
+- **Timeouts:** default **45s**; **`behavioral`** project **120s** locally, **300s** when **`CI=true`** (e.g. GitHub Actions + Ollama on CPU); **`boundary`** **45s**
 - **`trace: 'off'`** — browser-style Playwright traces are disabled (this suite does not use a browser). Failures still get rich attachments from **`registerAgentTraceForDiagnostics`** in `tests/fixtures/setup.ts` (see below)
 - **`workers: 3`** — tune for your API rate limits
 - **HTML report `title`** — includes resolved LLM provider and model for quick scanning
@@ -350,7 +390,7 @@ npx playwright show-report
 ## How to Extend
 
 ### Add a New Tool
-1. Create `agent/tools/your-tool.ts` following the same factory pattern as `file-reader.ts`
+1. Create `examples/agent/tools/your-tool.ts` following the same factory pattern as `file-reader.ts`
 2. Register it in the ToolRegistry in your test setup
 3. Add mock responses in `setup.ts`
 4. Write tests using `AgentAssert.toolWasInvoked(trace, 'your-tool')`
@@ -366,14 +406,14 @@ npx playwright show-report
 1. Add a static method to `AgentAssert.ts`
 2. Accept `AgentTrace` or `AgentOutput` as input
 3. Return `MatchResult`
-4. Use `NonDeterministicMatcher` methods internally if needed
+4. Use `HeuristicContractMatcher` methods internally if needed
 5. Include detailed reasons in the `details` array
 
 ### Adapt for Another LLM Provider (beyond Anthropic, OpenAI, and Ollama)
-1. Add a branch in `agent/agent.ts` alongside the existing Anthropic and OpenAI-compatible loops
+1. Add a branch in `examples/agent/agent.ts` alongside the existing Anthropic and OpenAI-compatible loops
 2. Add a `toYourProviderTools()` (or equivalent) on `ToolRegistry` if the tool schema differs
 3. Map that provider’s tool-call and tool-result messages into the same `TraceStep` shapes the framework already expects
-4. The framework layer (AgentAssert, NonDeterministicMatcher, BehaviorContract) stays UNCHANGED — it operates on `AgentTrace`, which is provider-agnostic
+4. The framework layer (AgentAssert, HeuristicContractMatcher, BehaviorContract) stays UNCHANGED — it operates on `AgentTrace`, which is provider-agnostic
 
 ### Connect to a Real MCP Server
 1. Replace the `execute` function in your tool with MCP client calls
@@ -420,8 +460,8 @@ Each test run calls a real LLM API. Costs depend on provider and model.
 
 ## Troubleshooting
 
-**Tests timeout (>60s):**
-LLM APIs can be slow. Increase `timeout` in `playwright.config.ts`. Check your API key is valid for the chosen provider (`LLM_PROVIDER` / `AgentConfig.provider`). Check rate limits.
+**Tests timeout (behavioral tests: 120s local, 300s on CI):**
+LLM APIs and local Ollama on CPU can be slow (especially in GitHub Actions — first inference after `ollama pull` can take minutes). The **`behavioral`** project uses a longer cap when **`CI=true`**. You can raise `behavioralTimeoutMs` in `playwright.config.ts` if needed. For Ollama in CI, ensure the model is pulled before tests and the runner has enough RAM.
 
 **Tests are flaky (pass sometimes, fail sometimes):**
 This is expected with LLM testing. Three strategies:
@@ -433,6 +473,9 @@ This is expected with LLM testing. Three strategies:
 **Wrong provider or API URL (401 / unexpected host):**  
 Confirm `LLM_PROVIDER` matches the key you set. For `openai`, set **`OPENAI_BASE_URL`** for a custom endpoint; **`OLLAMA_BASE_URL` is not read** for that provider. Use **`LLM_PROVIDER=ollama`** with Ollama’s `/v1` base if you intend local Ollama.
 
+**Ollama `500` / `internal service error` with `*:cloud` models:**  
+Cloud-tagged models may require an Ollama Cloud API key; use a **local** model tag for the same behavior without keys, or set optional **`OLLAMA_API_KEY`**.
+
 **Agent output is not JSON:**
 The system prompt tells Claude to respond in JSON, but it sometimes wraps it in markdown fences. The `parseOutput()` method in `agent.ts` handles this. If you see `taskType: "unknown"`, the JSON parsing failed entirely — check the raw text in the trace.
 
diff --git a/agent/types.ts b/agent/types.ts
deleted file mode 100644
index 77224a1..0000000
--- a/agent/types.ts
+++ /dev/null
@@ -1,170 +0,0 @@
-/**
- * agent/types.ts
- * 
- * ARCHITECTURE ROLE: Shared type definitions.
- * Every component in the system imports from here.
- * If you change a type here, the compiler tells you every file that breaks.
- * 
- * KEY DESIGN DECISION: The AgentTrace type is the backbone of the entire
- * testing framework. Every assertion method in AgentAssert operates on
- * traces, not on raw outputs. This is what makes the framework work —
- * you're testing the agent's BEHAVIOR (what tools it chose, what params
- * it passed, what path it took), not just its final answer.
- */
-
-// ─────────────────────────────────────────────
-// TOOL DEFINITIONS
-// ─────────────────────────────────────────────
-
-/**
- * Describes a tool the agent can call. Maps directly to
- * Anthropic's tool_use schema (and by extension, MCP protocol).
- * 
- * The `execute` function is what actually runs when the agent
- * decides to use this tool. In production, this calls real services.
- * In tests, you swap it with a mock.
- */
-export interface ToolDefinition {
-  name: string;
-  description: string;
-  inputSchema: Record<string, unknown>;  // JSON Schema for the tool's parameters
-  execute: (input: Record<string, unknown>) => Promise<ToolResult>;
-}
-
-/**
- * What a tool returns after execution.
- * `success` flag is critical — the retry-behavior tests
- * check how the agent responds when success=false.
- */
-export interface ToolResult {
-  success: boolean;
-  data: unknown;
-  error?: string;
-}
-
-// ─────────────────────────────────────────────
-// AGENT TRACE (the core testing data structure)
-// ─────────────────────────────────────────────
-
-/**
- * A single step in the agent's execution.
- * 
- * WHY THIS MATTERS:
- * Traditional testing checks input → output.
- * Agent testing checks input → [decision₁, decision₂, ... decisionₙ] → output.
- * 
- * Each TraceStep records ONE decision the agent made:
- * - 'tool_call': Agent decided to invoke a specific tool with specific params
- * - 'tool_result': The tool returned data (or failed)
- * - 'reasoning': Agent's internal reasoning (from Claude's response text)
- * - 'output': Agent's final answer
- * 
- * The sequence of steps IS the agent's behavior. Your tests assert
- * against this sequence, not against the final string.
- */
-export interface TraceStep {
-  type: 'tool_call' | 'tool_result' | 'reasoning' | 'output';
-  toolName?: string;
-  toolInput?: Record<string, unknown>;
-  toolOutput?: unknown;
-  content?: string;
-  timestamp: number;
-}
-
-/**
- * Complete record of an agent run.
- * This is what every test receives. This is what every assertion inspects.
- * 
- * EXTENDING THIS:
- * When you adapt this framework for other domains, you might add:
- * - `tokenUsage: { prompt: number, completion: number }` for cost tracking
- * - `parentTraceId: string` for multi-agent orchestration testing
- * - `guardrailResults: GuardrailCheck[]` for safety testing
- */
-export interface AgentTrace {
-  input: string;                  // The original natural language prompt
-  steps: TraceStep[];             // Ordered list of everything the agent did
-  output: AgentOutput;            // The final structured result
-  metadata: {
-    model: string;                // Which LLM model was used
-    /** Which API was used (`anthropic`, `openai`, or `ollama`). Omitted in older traces. */
-    provider?: 'anthropic' | 'openai' | 'ollama';
-    durationMs: number;           // Total wall-clock time
-    toolCallCount: number;        // How many tools were invoked
-    retryCount: number;           // How many retries happened
-  };
-}
-
-/**
- * The structured output the agent returns.
- * 
- * WHY STRUCTURED AND NOT JUST A STRING:
- * If the agent returns free text, you can't reliably assert on it.
- * By forcing structured output, you can check:
- * - Did the agent produce the right type of result?
- * - Did it include all required fields?
- * - Are the values within expected ranges?
- * 
- * The `toolsUsed` array is particularly important — it's the
- * agent's self-report of which tools it called. Your tests
- * cross-reference this against the actual trace to catch lies.
- */
-export interface AgentOutput {
-  taskType: string;               // Classification of what the agent did
-  result: unknown;                // The actual payload (varies by task)
-  toolsUsed: string[];            // Which tools the agent reports using
-  confidence: number;             // 0-1 confidence score
-  summary: string;                // Human-readable summary
-}
-
-// ─────────────────────────────────────────────
-// BEHAVIOR CONTRACTS
-// ─────────────────────────────────────────────
-
-/**
- * Defines what "correct" means for a specific type of agent output.
- * This replaces exact string matching with semantic rules.
- * 
- * EXAMPLE:
- * For a SUMMARIZATION contract:
- * - requiredFields: ['summary', 'sourceFile']
- * - requiredIntentKeywords: ['summary', 'key points', 'overview']
- * - maxLengthChars: 500
- * - forbiddenPatterns: [/I don't know/, /I cannot/]
- * 
- * The NonDeterministicMatcher evaluates output against these rules
- * and returns a confidence score, not a binary pass/fail.
- */
-export interface ContractDefinition {
-  name: string;
-  description: string;
-  requiredFields: string[];                // Fields that MUST exist in output
-  requiredIntentKeywords: string[];        // At least N of these must appear
-  minKeywordMatchRatio: number;            // What fraction of keywords must match (0-1)
-  forbiddenPatterns: RegExp[];             // Patterns that must NOT appear
-  maxLengthChars?: number;                 // Optional length constraint
-  customValidator?: (output: unknown) => ValidationResult;  // Escape hatch for complex rules
-}
-
-export interface ValidationResult {
-  passed: boolean;
-  score: number;        // 0-1 confidence
-  reason: string;       // Human-readable explanation of why it passed/failed
-}
-
-/**
- * What the NonDeterministicMatcher returns.
- * 
- * KEY INSIGHT: `confidence` is not binary.
- * An output might score 0.7 — it partially satisfies the contract.
- * Your test decides the threshold: strict tests require 0.9+,
- * exploratory tests might accept 0.5+.
- * 
- * `details` tells you exactly what matched and what didn't,
- * so when a test fails, you know WHY without re-reading the LLM output.
- */
-export interface MatchResult {
-  matched: boolean;
-  confidence: number;
-  details: string[];     // List of what passed and what failed
-}
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..2dc7350
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,9 @@
+# Examples
+
+## `agent/` — demo system under test
+
+This folder holds a **reference LLM agent** (tool loop + `ToolRegistry` + sample tools). It exists to drive the Playwright tests in `tests/` and to show how `AgentTrace` / `AgentOutput` are produced.
+
+It is **not** the reusable assertion library — that lives in `framework/` at the repo root and is re-exported from `index.ts`.
+
+To try your own agent: implement a runner that yields the same trace shapes (see `framework/types.ts`), wire it in `tests/fixtures/setup.ts`, and keep or replace the demo tools.
diff --git a/agent/agent.ts b/examples/agent/agent.ts
similarity index 94%
rename from agent/agent.ts
rename to examples/agent/agent.ts
index b69e0cb..b89c91b 100644
--- a/agent/agent.ts
+++ b/examples/agent/agent.ts
@@ -1,7 +1,7 @@
 /**
- * agent/agent.ts
+ * examples/agent/agent.ts
  *
- * THE AGENT — the System Under Test (SUT)
+ * Demo agent — reference system under test for this POC (not the assertion library).
  *
  * WHAT IT DOES:
  * 1. Takes a natural language task from the user
@@ -12,7 +12,9 @@
  * 6. The agent records EVERY step in a trace (AgentTrace)
  *
  * Provider selection: set `AgentConfig.provider`, or `LLM_PROVIDER=anthropic|openai|ollama`.
- * Keys: `ANTHROPIC_API_KEY` (Anthropic), `OPENAI_API_KEY` (OpenAI cloud or Ollama dummy).
+ * Keys: `ANTHROPIC_API_KEY` (Anthropic), `OPENAI_API_KEY` (OpenAI). For **`LLM_PROVIDER=ollama`**, local models
+ * typically need only **`LLM_MODEL`**; the client uses the placeholder apiKey `ollama`. Optional **`OLLAMA_API_KEY`**
+ * if you use Ollama Cloud–hosted model tags (e.g. `*:cloud`).
  * Local Ollama: `LLM_PROVIDER=ollama` or `LLM_PROVIDER=openai` with `OPENAI_BASE_URL`
  * (e.g. `http://127.0.0.1:11434/v1`). See `.env.example`.
  */
@@ -26,7 +28,7 @@ import {
   AgentOutput,
   TraceStep,
   ToolResult,
-} from './types.js';
+} from '../../framework/types.js';
 
 /** Which vendor API backs the agent. `ollama` uses the OpenAI SDK against a local Ollama server. */
 export type LlmProvider = 'anthropic' | 'openai' | 'ollama';
@@ -112,12 +114,20 @@ function resolveAgentConfig(config: AgentConfig): ResolvedAgentConfig {
       ? resolveOpenAIBaseURL(provider, config)
       : undefined;
 
-  let apiKey =
-    config.apiKey ??
-    (provider === 'openai' || provider === 'ollama'
-      ? process.env.OPENAI_API_KEY
-      : process.env.ANTHROPIC_API_KEY) ??
-    '';
+  let apiKey = config.apiKey?.trim() ?? '';
+
+  if (!apiKey) {
+    if (provider === 'anthropic') {
+      apiKey = process.env.ANTHROPIC_API_KEY?.trim() ?? '';
+    } else if (provider === 'ollama') {
+      apiKey =
+        process.env.OLLAMA_API_KEY?.trim() ||
+        process.env.OPENAI_API_KEY?.trim() ||
+        '';
+    } else {
+      apiKey = process.env.OPENAI_API_KEY?.trim() ?? '';
+    }
+  }
 
   if ((provider === 'openai' || provider === 'ollama') && !apiKey && baseURL) {
     apiKey = 'ollama';
@@ -127,7 +137,7 @@ function resolveAgentConfig(config: AgentConfig): ResolvedAgentConfig {
     provider === 'openai'
       ? 'gpt-4o'
       : provider === 'ollama'
-        ? 'llama3:latest'
+        ? 'llama3.2:3b'
         : 'claude-sonnet-4-20250514';
 
   return {
diff --git a/agent/tools/api-caller.ts b/examples/agent/tools/api-caller.ts
similarity index 98%
rename from agent/tools/api-caller.ts
rename to examples/agent/tools/api-caller.ts
index 9b58426..1e2e8bd 100644
--- a/agent/tools/api-caller.ts
+++ b/examples/agent/tools/api-caller.ts
@@ -1,5 +1,5 @@
 /**
- * agent/tools/api-caller.ts
+ * examples/agent/tools/api-caller.ts
  * 
  * MCP TOOL #2: API Caller
  * 
@@ -27,7 +27,7 @@
  * - Connect to your actual Jira/ServiceNow/Datadog APIs
  */
 
-import { ToolDefinition, ToolResult } from '../types.js';
+import { ToolDefinition, ToolResult } from '../../../framework/types.js';
 
 /**
  * Configuration for the API caller.
diff --git a/agent/tools/file-reader.ts b/examples/agent/tools/file-reader.ts
similarity index 86%
rename from agent/tools/file-reader.ts
rename to examples/agent/tools/file-reader.ts
index 6a6b6e8..5c0cafe 100644
--- a/agent/tools/file-reader.ts
+++ b/examples/agent/tools/file-reader.ts
@@ -1,5 +1,5 @@
 /**
- * agent/tools/file-reader.ts
+ * examples/agent/tools/file-reader.ts
  * 
  * MCP TOOL #1: File Reader
  * 
@@ -33,7 +33,15 @@
 
 import * as fs from 'fs/promises';
 import * as path from 'path';
-import { ToolDefinition, ToolResult } from '../types.js';
+import { ToolDefinition, ToolResult } from '../../../framework/types.js';
+
+/** Small / weak models sometimes omit `filePath` or use snake_case — normalize before resolve(). */
+function pickFilePath(input: Record<string, unknown>): string | undefined {
+  const raw = input.filePath ?? input.file_path ?? input.path;
+  if (typeof raw === 'string' && raw.trim()) return raw.trim();
+  if (raw != null && typeof raw !== 'object') return String(raw).trim() || undefined;
+  return undefined;
+}
 
 /**
  * Creates a file-reader tool instance.
@@ -96,7 +104,16 @@ export function createFileReaderTool(basePath: string = '/tmp/agent-files'): Too
      * The path.resolve + startsWith check below prevents this.
      */
     execute: async (input: Record<string, unknown>): Promise<ToolResult> => {
-      const filePath = input.filePath as string;
+      const filePath = pickFilePath(input);
+      if (!filePath) {
+        return {
+          success: false,
+          data: null,
+          error:
+            'Missing file path. Pass filePath as a string (e.g. "logs/test-results.log").',
+        };
+      }
+
       const encoding = (input.encoding as BufferEncoding) || 'utf-8';
 
       // SECURITY: Resolve the full path and verify it's within basePath.
diff --git a/agent/tools/registry.ts b/examples/agent/tools/registry.ts
similarity index 90%
rename from agent/tools/registry.ts
rename to examples/agent/tools/registry.ts
index 8f949d5..033428f 100644
--- a/agent/tools/registry.ts
+++ b/examples/agent/tools/registry.ts
@@ -1,5 +1,5 @@
 /**
- * agent/tools/registry.ts
+ * examples/agent/tools/registry.ts
  * 
  * TOOL REGISTRY
  * 
@@ -22,7 +22,7 @@
  */
 
 import type OpenAI from 'openai';
-import { ToolDefinition, ToolResult } from '../types.js';
+import { ToolDefinition, ToolResult } from '../../../framework/types.js';
 
 export class ToolRegistry {
   private tools: Map<string, ToolDefinition> = new Map();
@@ -62,7 +62,16 @@ export class ToolRegistry {
         error: `Tool "${name}" is not registered. Available tools: ${this.listNames().join(', ')}`,
       };
     }
-    return tool.execute(input);
+    try {
+      return await tool.execute(input);
+    } catch (err: unknown) {
+      const message = err instanceof Error ? err.message : String(err);
+      return {
+        success: false,
+        data: null,
+        error: `Tool "${name}" failed: ${message}`,
+      };
+    }
   }
 
   /**
diff --git a/framework/AgentAssert.ts b/framework/AgentAssert.ts
index d1a5939..329a59b 100644
--- a/framework/AgentAssert.ts
+++ b/framework/AgentAssert.ts
@@ -39,8 +39,8 @@
  */
 
 import { expect } from '@playwright/test';
-import { AgentTrace, ContractDefinition, MatchResult } from '../agent/types.js';
-import { NonDeterministicMatcher } from './NonDeterministicMatcher.js';
+import { AgentTrace, ContractDefinition, MatchResult } from './types.js';
+import { HeuristicContractMatcher } from './HeuristicContractMatcher.js';
 
 export class AgentAssert {
 
@@ -141,8 +141,8 @@ export class AgentAssert {
   /**
    * ASSERTION 2: Does the output satisfy a behavior contract?
    * 
-   * This is where the NonDeterministicMatcher does its work.
-   * Instead of checking exact values, we check semantic rules.
+   * This is where HeuristicContractMatcher does its work: fields, keyword overlap,
+   * forbidden regexes — not LLM-grade semantics (see HeuristicContractMatcher.ts).
    * 
    * @param output - The agent's structured output (AgentOutput)
    * @param contract - The behavior contract to evaluate against
@@ -169,7 +169,7 @@ export class AgentAssert {
     contract: ContractDefinition,
     minConfidence: number = 0.5
   ): MatchResult {
-    const result = NonDeterministicMatcher.evaluate(output, contract);
+    const result = HeuristicContractMatcher.evaluate(output, contract);
 
     // Override the matched flag based on minConfidence
     return {
diff --git a/framework/BehaviorContract.ts b/framework/BehaviorContract.ts
index 85dc10f..1c001b3 100644
--- a/framework/BehaviorContract.ts
+++ b/framework/BehaviorContract.ts
@@ -1,7 +1,7 @@
 /**
  * framework/BehaviorContract.ts
  * 
- * BEHAVIOR CONTRACTS — what "correct" means when outputs are non-deterministic
+ * BEHAVIOR CONTRACTS — what "correct" means when LLM wording varies run to run
  * 
  * THE PROBLEM:
  * In traditional testing, you write:
@@ -40,7 +40,7 @@
  * - Be under 1000 chars (it's a creation confirmation, not a novel)
  */
 
-import { ContractDefinition, ValidationResult } from '../agent/types.js';
+import { ContractDefinition, ValidationResult } from './types.js';
 
 /**
  * Pre-built contracts for common agent task types.
diff --git a/framework/HeuristicContractMatcher.ts b/framework/HeuristicContractMatcher.ts
new file mode 100644
index 0000000..f0f8155
--- /dev/null
+++ b/framework/HeuristicContractMatcher.ts
@@ -0,0 +1,236 @@
+/**
+ * framework/HeuristicContractMatcher.ts
+ *
+ * HEURISTIC CONTRACT EVALUATION — for LLM outputs that vary in wording
+ *
+ * WHAT THIS IS (honest scope):
+ * ────────────────────────────
+ * This is **not** deep semantic understanding. It does **not** embed text,
+ * call an LLM-as-judge, or parse meaning the way a human does. It applies
+ * **cheap, deterministic heuristics**:
+ *
+ * - Required fields present (structural)
+ * - Substring / bag-of-words overlap with a keyword list (“intent keywords”)
+ * - **Regex** forbidden patterns
+ * - Optional custom validator
+ *
+ * The numeric **confidence** is a **weighted average of those heuristic
+ * scores** — a tuning aid and ranking signal, not a calibrated probability
+ * of semantic correctness. Treat it accordingly in assertions.
+ *
+ * WHY IT STILL EXISTS:
+ * ────────────────────
+ * Exact `expect(output).toBe("...")` fails on every LLM run. Rules based on
+ * fields + keyword coverage + forbidden phrases often **do** catch wrong
+ * behavior cheaply. But phrasing that is **semantically equivalent** can
+ * still miss keywords (e.g. “unable to locate the file” vs “file not found”)
+ * unless your keyword lists and thresholds cover those variants — or you
+ * add synonyms / move to embeddings / LLM grading (see README).
+ *
+ * LAYERS (implementation):
+ * ─────────────────────────
+ * 1. STRUCTURE — required fields, optional length
+ * 2. KEYWORDS — case-insensitive substring checks; ratio vs `minKeywordMatchRatio`
+ * 3. FORBIDDEN — regex matches → hard failure path
+ * 4. CUSTOM — contract-supplied validator (optional)
+ *
+ * SCORING WEIGHTS: structural 40%, keyword 35% (or 25% + custom 10% if set),
+ * forbidden 25%. Forbidden violations force a failed match regardless of headline confidence.
+ */
+
+import { ContractDefinition, MatchResult } from './types.js';
+
+export class HeuristicContractMatcher {
+  /**
+   * Evaluate an output against a behavior contract using the heuristic layers above.
+   *
+   * @example
+   *   const result = HeuristicContractMatcher.evaluate(trace.output, BehaviorContract.SUMMARIZATION);
+   */
+  static evaluate(output: unknown, contract: ContractDefinition): MatchResult {
+    const details: string[] = [];
+    const scores: { weight: number; score: number; label: string }[] = [];
+
+    const structuralResult = this.checkStructure(output, contract);
+    scores.push({ weight: 0.4, score: structuralResult.score, label: 'structural' });
+    details.push(...structuralResult.details);
+
+    const keywordResult = this.checkKeywordOverlap(output, contract);
+    const keywordWeight = contract.customValidator ? 0.25 : 0.35;
+    scores.push({ weight: keywordWeight, score: keywordResult.score, label: 'keywords' });
+    details.push(...keywordResult.details);
+
+    const forbiddenResult = this.checkForbiddenPatterns(output, contract);
+    scores.push({ weight: 0.25, score: forbiddenResult.score, label: 'forbidden' });
+    details.push(...forbiddenResult.details);
+
+    if (contract.customValidator) {
+      const customResult = contract.customValidator(output);
+      scores.push({ weight: 0.1, score: customResult.score, label: 'custom' });
+      details.push(`[custom] ${customResult.reason}`);
+    }
+
+    const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
+    const confidence =
+      scores.reduce((sum, s) => sum + s.weight * s.score, 0) / totalWeight;
+
+    const hasForbiddenViolation = forbiddenResult.score < 1.0;
+
+    return {
+      matched: !hasForbiddenViolation && confidence >= 0.5,
+      confidence: hasForbiddenViolation ? Math.min(confidence, 0.3) : confidence,
+      details,
+    };
+  }
+
+  private static checkStructure(
+    output: unknown,
+    contract: ContractDefinition
+  ): { score: number; details: string[] } {
+    const details: string[] = [];
+
+    if (typeof output !== 'object' || output === null) {
+      details.push('[structural] FAIL: output is not an object');
+      return { score: 0, details };
+    }
+
+    const obj = output as Record<string, unknown>;
+    let presentCount = 0;
+
+    for (const field of contract.requiredFields) {
+      if (field in obj && obj[field] !== undefined && obj[field] !== null) {
+        presentCount++;
+        details.push(`[structural] PASS: field "${field}" present`);
+      } else {
+        details.push(`[structural] FAIL: field "${field}" missing`);
+      }
+    }
+
+    if (contract.maxLengthChars) {
+      const outputStr = JSON.stringify(output);
+      if (outputStr.length > contract.maxLengthChars) {
+        details.push(
+          `[structural] FAIL: output length ${outputStr.length} exceeds max ${contract.maxLengthChars}`
+        );
+      } else {
+        details.push(`[structural] PASS: output length ${outputStr.length} within limit`);
+      }
+    }
+
+    const score =
+      contract.requiredFields.length > 0
+        ? presentCount / contract.requiredFields.length
+        : 1.0;
+
+    return { score, details };
+  }
+
+  /**
+   * Keyword layer: case-insensitive substring presence against `requiredIntentKeywords`.
+   * Not synonym-aware; expand keywords or lower thresholds if tests are brittle.
+   */
+  private static checkKeywordOverlap(
+    output: unknown,
+    contract: ContractDefinition
+  ): { score: number; details: string[] } {
+    const details: string[] = [];
+    const outputStr = JSON.stringify(output).toLowerCase();
+
+    let matchCount = 0;
+    const matchedKeywords: string[] = [];
+    const missedKeywords: string[] = [];
+
+    for (const keyword of contract.requiredIntentKeywords) {
+      if (outputStr.includes(keyword.toLowerCase())) {
+        matchCount++;
+        matchedKeywords.push(keyword);
+      } else {
+        missedKeywords.push(keyword);
+      }
+    }
+
+    const ratio =
+      contract.requiredIntentKeywords.length > 0
+        ? matchCount / contract.requiredIntentKeywords.length
+        : 1.0;
+
+    const passed = ratio >= contract.minKeywordMatchRatio;
+
+    details.push(
+      `[keywords] ${passed ? 'PASS' : 'FAIL'}: ` +
+        `${matchCount}/${contract.requiredIntentKeywords.length} keywords matched ` +
+        `(${(ratio * 100).toFixed(0)}%, threshold: ${(contract.minKeywordMatchRatio * 100).toFixed(0)}%)`
+    );
+    details.push(`[keywords] Matched: [${matchedKeywords.join(', ')}]`);
+    if (missedKeywords.length > 0 && missedKeywords.length <= 10) {
+      details.push(`[keywords] Missed: [${missedKeywords.join(', ')}]`);
+    }
+
+    return {
+      score: Math.min(ratio / contract.minKeywordMatchRatio, 1.0),
+      details,
+    };
+  }
+
+  private static checkForbiddenPatterns(
+    output: unknown,
+    contract: ContractDefinition
+  ): { score: number; details: string[] } {
+    const details: string[] = [];
+    const outputStr = JSON.stringify(output);
+
+    let violations = 0;
+
+    for (const pattern of contract.forbiddenPatterns) {
+      if (pattern.test(outputStr)) {
+        violations++;
+        details.push(`[forbidden] FAIL: pattern ${pattern} matched in output`);
+      }
+    }
+
+    if (violations === 0) {
+      details.push(`[forbidden] PASS: no forbidden patterns detected`);
+    }
+
+    return { score: violations > 0 ? 0 : 1.0, details };
+  }
+
+  /**
+   * Substring search, or word-level overlap when `fuzzy` is true (still heuristic, not NLP).
+   */
+  static containsIntent(
+    output: unknown,
+    target: string,
+    fuzzy: boolean = false,
+    fuzzyThreshold: number = 0.5
+  ): MatchResult {
+    const outputStr = JSON.stringify(output).toLowerCase();
+    const targetLower = target.toLowerCase();
+
+    if (!fuzzy) {
+      const found = outputStr.includes(targetLower);
+      return {
+        matched: found,
+        confidence: found ? 1.0 : 0.0,
+        details: [
+          found ? `Found "${target}" in output` : `"${target}" not found in output`,
+        ],
+      };
+    }
+
+    const words = targetLower.split(/\s+/).filter(w => w.length > 2);
+    let matchCount = 0;
+    for (const word of words) {
+      if (outputStr.includes(word)) matchCount++;
+    }
+
+    const ratio = words.length > 0 ? matchCount / words.length : 0;
+    return {
+      matched: ratio >= fuzzyThreshold,
+      confidence: ratio,
+      details: [
+        `Fuzzy word overlap: ${matchCount}/${words.length} words found (${(ratio * 100).toFixed(0)}%)`,
+      ],
+    };
+  }
+}
diff --git a/framework/NonDeterministicMatcher.ts b/framework/NonDeterministicMatcher.ts
deleted file mode 100644
index a7438fd..0000000
--- a/framework/NonDeterministicMatcher.ts
+++ /dev/null
@@ -1,326 +0,0 @@
-/**
- * framework/NonDeterministicMatcher.ts
- * 
- * THE KEY INNOVATION — assertion logic for non-deterministic outputs
- * 
- * WHY THIS EXISTS:
- * ────────────────
- * Traditional test matchers are binary: the output either equals the
- * expected value or it doesn't. This is useless for LLM outputs because:
- * 
- * - Same prompt → different wording every time
- * - Same intent → different structure every time
- * - Same facts → different ordering every time
- * 
- * The NonDeterministicMatcher evaluates outputs against INTENT CONTRACTS
- * rather than exact values. It returns a CONFIDENCE SCORE (0-1) instead
- * of a binary pass/fail.
- * 
- * HOW IT WORKS (three evaluation layers):
- * ─────────────────────────────────────────
- * 
- * LAYER 1: STRUCTURAL VALIDATION
- * Does the output have the required fields? Is it valid JSON?
- * Is it within length limits? This is deterministic — it either
- * passes or doesn't.
- * 
- * LAYER 2: SEMANTIC KEYWORD MATCHING
- * Does the output text contain enough intent-related keywords?
- * This is fuzzy — we count how many keywords appear and compare
- * against a threshold. The keyword list is intentionally broad
- * to accommodate phrasing variation.
- * 
- * LAYER 3: FORBIDDEN PATTERN DETECTION
- * Does the output contain anything it shouldn't? Hallucination
- * markers, refusal language, fabricated data indicators.
- * Any match here FAILS the output regardless of other scores.
- * 
- * OPTIONAL LAYER 4: CUSTOM VALIDATION
- * Contract-specific logic that can't be expressed as keywords
- * or patterns. Example: "toolsUsed must have 2+ entries."
- * 
- * THE SCORING MODEL:
- * ──────────────────
- * Each layer produces a score from 0 to 1.
- * The final confidence is the WEIGHTED AVERAGE:
- * - Structural: 40% weight (must have right shape)
- * - Semantic:   35% weight (must express right intent)
- * - Forbidden:  25% weight (must not contain bad patterns)
- * 
- * If a custom validator exists, it replaces 10% of the semantic weight.
- * 
- * WHY THESE WEIGHTS:
- * Structure matters most because a broken JSON or missing field is
- * unambiguously wrong. Semantics is next because keyword absence
- * might just mean different phrasing. Forbidden patterns are weighted
- * lowest because a single accidental match shouldn't tank the score —
- * but they DO cause a hard failure if matched.
- * 
- * HOW TO TUNE:
- * If your tests are too flaky (passing sometimes, failing sometimes),
- * you have two knobs:
- * 1. Lower the minKeywordMatchRatio in the contract
- * 2. Add more keywords to the contract (broader coverage)
- * 3. Lower the confidence threshold in your test assertion
- * 
- * If your tests are too permissive (passing when they shouldn't),
- * do the opposite. Add more forbidden patterns. Raise thresholds.
- */
-
-import { ContractDefinition, MatchResult, ValidationResult } from '../agent/types.js';
-
-export class NonDeterministicMatcher {
-
-  /**
-   * Evaluate an output against a behavior contract.
-   * This is the main entry point. Every assertion in AgentAssert
-   * calls this method.
-   * 
-   * @param output - The agent's output (AgentOutput object)
-   * @param contract - The behavior contract to evaluate against
-   * @returns MatchResult with confidence score and details
-   * 
-   * EXAMPLE USAGE:
-   *   const result = NonDeterministicMatcher.evaluate(trace.output, BehaviorContract.SUMMARIZATION);
-   *   expect(result.confidence).toBeGreaterThan(0.7);
-   */
-  static evaluate(output: unknown, contract: ContractDefinition): MatchResult {
-    const details: string[] = [];
-    const scores: { weight: number; score: number; label: string }[] = [];
-
-    // ── LAYER 1: STRUCTURAL VALIDATION ──────────────────
-    const structuralResult = this.checkStructure(output, contract);
-    scores.push({ weight: 0.40, score: structuralResult.score, label: 'structural' });
-    details.push(...structuralResult.details);
-
-    // ── LAYER 2: SEMANTIC KEYWORD MATCHING ──────────────
-    const semanticResult = this.checkSemantics(output, contract);
-    const semanticWeight = contract.customValidator ? 0.25 : 0.35;
-    scores.push({ weight: semanticWeight, score: semanticResult.score, label: 'semantic' });
-    details.push(...semanticResult.details);
-
-    // ── LAYER 3: FORBIDDEN PATTERN DETECTION ────────────
-    const forbiddenResult = this.checkForbiddenPatterns(output, contract);
-    scores.push({ weight: 0.25, score: forbiddenResult.score, label: 'forbidden' });
-    details.push(...forbiddenResult.details);
-
-    // ── OPTIONAL LAYER 4: CUSTOM VALIDATION ─────────────
-    if (contract.customValidator) {
-      const customResult = contract.customValidator(output);
-      scores.push({ weight: 0.10, score: customResult.score, label: 'custom' });
-      details.push(`[custom] ${customResult.reason}`);
-    }
-
-    // ── COMPUTE WEIGHTED CONFIDENCE ─────────────────────
-    const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
-    const confidence = scores.reduce((sum, s) => sum + (s.weight * s.score), 0) / totalWeight;
-
-    // Hard failure: if any forbidden pattern was found, override to fail
-    const hasForbiddenViolation = forbiddenResult.score < 1.0;
-
-    return {
-      matched: !hasForbiddenViolation && confidence >= 0.5,
-      confidence: hasForbiddenViolation ? Math.min(confidence, 0.3) : confidence,
-      details,
-    };
-  }
-
-  /**
-   * LAYER 1: Check if output has the required structure.
-   * 
-   * This is the simplest check — does the output object have
-   * the fields the contract requires?
-   * 
-   * WHY 40% WEIGHT:
-   * If the output doesn't even have the right fields, nothing
-   * else matters. A summarization that's missing the 'summary'
-   * field is fundamentally broken regardless of what the text says.
-   */
-  private static checkStructure(
-    output: unknown,
-    contract: ContractDefinition
-  ): { score: number; details: string[] } {
-    const details: string[] = [];
-
-    if (typeof output !== 'object' || output === null) {
-      details.push('[structural] FAIL: output is not an object');
-      return { score: 0, details };
-    }
-
-    const obj = output as Record<string, unknown>;
-    let presentCount = 0;
-
-    for (const field of contract.requiredFields) {
-      if (field in obj && obj[field] !== undefined && obj[field] !== null) {
-        presentCount++;
-        details.push(`[structural] PASS: field "${field}" present`);
-      } else {
-        details.push(`[structural] FAIL: field "${field}" missing`);
-      }
-    }
-
-    // Check length constraint if specified
-    if (contract.maxLengthChars) {
-      const outputStr = JSON.stringify(output);
-      if (outputStr.length > contract.maxLengthChars) {
-        details.push(
-          `[structural] FAIL: output length ${outputStr.length} exceeds max ${contract.maxLengthChars}`
-        );
-        // Don't count this as a field failure — it's a soft constraint
-      } else {
-        details.push(`[structural] PASS: output length ${outputStr.length} within limit`);
-      }
-    }
-
-    const score = contract.requiredFields.length > 0
-      ? presentCount / contract.requiredFields.length
-      : 1.0;
-
-    return { score, details };
-  }
-
-  /**
-   * LAYER 2: Check if output text contains enough intent keywords.
-   * 
-   * HOW THIS WORKS:
-   * 1. Serialize the entire output to a string (JSON.stringify)
-   * 2. Check each keyword in the contract's requiredIntentKeywords
-   * 3. Count how many keywords appear (case-insensitive)
-   * 4. Compare the match ratio against minKeywordMatchRatio
-   * 
-   * WHY STRINGIFY THE WHOLE OBJECT:
-   * Keywords might appear in any field — the summary, the result,
-   * the taskType. By stringifying everything, we search everywhere
-   * at once. This is intentionally permissive.
-   * 
-   * WHY CASE-INSENSITIVE:
-   * "Summary" and "summary" and "SUMMARY" are the same intent.
-   * Don't fail a test because the LLM capitalized differently.
-   */
-  private static checkSemantics(
-    output: unknown,
-    contract: ContractDefinition
-  ): { score: number; details: string[] } {
-    const details: string[] = [];
-    const outputStr = JSON.stringify(output).toLowerCase();
-
-    let matchCount = 0;
-    const matchedKeywords: string[] = [];
-    const missedKeywords: string[] = [];
-
-    for (const keyword of contract.requiredIntentKeywords) {
-      if (outputStr.includes(keyword.toLowerCase())) {
-        matchCount++;
-        matchedKeywords.push(keyword);
-      } else {
-        missedKeywords.push(keyword);
-      }
-    }
-
-    const ratio = contract.requiredIntentKeywords.length > 0
-      ? matchCount / contract.requiredIntentKeywords.length
-      : 1.0;
-
-    const passed = ratio >= contract.minKeywordMatchRatio;
-
-    details.push(
-      `[semantic] ${passed ? 'PASS' : 'FAIL'}: ` +
-      `${matchCount}/${contract.requiredIntentKeywords.length} keywords matched ` +
-      `(${(ratio * 100).toFixed(0)}%, threshold: ${(contract.minKeywordMatchRatio * 100).toFixed(0)}%)`
-    );
-    details.push(`[semantic] Matched: [${matchedKeywords.join(', ')}]`);
-    if (missedKeywords.length > 0 && missedKeywords.length <= 10) {
-      details.push(`[semantic] Missed: [${missedKeywords.join(', ')}]`);
-    }
-
-    // Score is the ratio itself, clamped to 0-1
-    return { score: Math.min(ratio / contract.minKeywordMatchRatio, 1.0), details };
-  }
-
-  /**
-   * LAYER 3: Check that no forbidden patterns appear in the output.
-   * 
-   * CRITICAL BEHAVIOR: This is the STRICTEST check.
-   * If ANY forbidden pattern matches, the score drops to 0.
-   * This is intentional — forbidden patterns represent hard failures:
-   * - "I cannot" → the agent refused the task
-   * - "I fabricated" → the agent hallucinated and admitted it
-   * - "I don't have access" → the agent couldn't use the tools
-   * 
-   * These are not "slightly wrong" — they're categorically wrong.
-   * No amount of correct keywords should compensate for a refusal.
-   */
-  private static checkForbiddenPatterns(
-    output: unknown,
-    contract: ContractDefinition
-  ): { score: number; details: string[] } {
-    const details: string[] = [];
-    const outputStr = JSON.stringify(output);
-
-    let violations = 0;
-
-    for (const pattern of contract.forbiddenPatterns) {
-      if (pattern.test(outputStr)) {
-        violations++;
-        details.push(`[forbidden] FAIL: pattern ${pattern} matched in output`);
-      }
-    }
-
-    if (violations === 0) {
-      details.push(`[forbidden] PASS: no forbidden patterns detected`);
-    }
-
-    // Binary: any violation → score 0
-    return { score: violations > 0 ? 0 : 1.0, details };
-  }
-
-  /**
-   * UTILITY: Check if a specific string appears in the output,
-   * with fuzzy matching support.
-   * 
-   * USE THIS WHEN:
-   * You need a one-off check that doesn't fit into a full contract.
-   * Example: "Does the output mention the file name 'test-results.log'?"
-   * 
-   * @param output - The output to check (any type, gets stringified)
-   * @param target - The string to look for
-   * @param fuzzy - If true, splits target into words and checks each independently.
-   *                "test results log" matches if any two of those words appear.
-   * @param fuzzyThreshold - What fraction of target words must appear (0-1)
-   */
-  static containsIntent(
-    output: unknown,
-    target: string,
-    fuzzy: boolean = false,
-    fuzzyThreshold: number = 0.5
-  ): MatchResult {
-    const outputStr = JSON.stringify(output).toLowerCase();
-    const targetLower = target.toLowerCase();
-
-    if (!fuzzy) {
-      const found = outputStr.includes(targetLower);
-      return {
-        matched: found,
-        confidence: found ? 1.0 : 0.0,
-        details: [found
-          ? `Found "${target}" in output`
-          : `"${target}" not found in output`
-        ],
-      };
-    }
-
-    // Fuzzy: check individual words
-    const words = targetLower.split(/\s+/).filter(w => w.length > 2);
-    let matchCount = 0;
-    for (const word of words) {
-      if (outputStr.includes(word)) matchCount++;
-    }
-
-    const ratio = words.length > 0 ? matchCount / words.length : 0;
-    return {
-      matched: ratio >= fuzzyThreshold,
-      confidence: ratio,
-      details: [`Fuzzy match: ${matchCount}/${words.length} words found (${(ratio * 100).toFixed(0)}%)`],
-    };
-  }
-}
diff --git a/framework/index.ts b/framework/index.ts
new file mode 100644
index 0000000..25f5cab
--- /dev/null
+++ b/framework/index.ts
@@ -0,0 +1,18 @@
+/**
+ * Reusable assertion helpers and types (this POC’s “library” surface).
+ */
+
+export type {
+  AgentOutput,
+  AgentTrace,
+  ContractDefinition,
+  MatchResult,
+  ToolDefinition,
+  ToolResult,
+  TraceStep,
+  ValidationResult,
+} from './types.js';
+
+export { AgentAssert } from './AgentAssert.js';
+export { BehaviorContract } from './BehaviorContract.js';
+export { HeuristicContractMatcher } from './HeuristicContractMatcher.js';
diff --git a/framework/types.ts b/framework/types.ts
new file mode 100644
index 0000000..7f75d94
--- /dev/null
+++ b/framework/types.ts
@@ -0,0 +1,91 @@
+/**
+ * framework/types.ts
+ *
+ * Public types for the assertion helpers (`AgentAssert`, contracts, traces).
+ * The demo agent under `examples/agent/` imports these same types so traces
+ * and contracts stay aligned — this file is not part of the example SUT.
+ *
+ * `AgentTrace` is what assertions operate on: tool decisions and structured output.
+ */
+
+// ─────────────────────────────────────────────
+// TOOL DEFINITIONS (used by demo agent + registry)
+// ─────────────────────────────────────────────
+
+/**
+ * Describes a tool the agent can call. Maps directly to
+ * Anthropic's tool_use schema (and by extension, MCP protocol).
+ */
+export interface ToolDefinition {
+  name: string;
+  description: string;
+  inputSchema: Record<string, unknown>;
+  execute: (input: Record<string, unknown>) => Promise<ToolResult>;
+}
+
+export interface ToolResult {
+  success: boolean;
+  data: unknown;
+  error?: string;
+}
+
+// ─────────────────────────────────────────────
+// AGENT TRACE
+// ─────────────────────────────────────────────
+
+export interface TraceStep {
+  type: 'tool_call' | 'tool_result' | 'reasoning' | 'output';
+  toolName?: string;
+  toolInput?: Record<string, unknown>;
+  toolOutput?: unknown;
+  content?: string;
+  timestamp: number;
+}
+
+export interface AgentTrace {
+  input: string;
+  steps: TraceStep[];
+  output: AgentOutput;
+  metadata: {
+    model: string;
+    provider?: 'anthropic' | 'openai' | 'ollama';
+    durationMs: number;
+    toolCallCount: number;
+    retryCount: number;
+  };
+}
+
+export interface AgentOutput {
+  taskType: string;
+  result: unknown;
+  toolsUsed: string[];
+  confidence: number;
+  summary: string;
+}
+
+// ─────────────────────────────────────────────
+// BEHAVIOR CONTRACTS
+// ─────────────────────────────────────────────
+
+export interface ContractDefinition {
+  name: string;
+  description: string;
+  requiredFields: string[];
+  requiredIntentKeywords: string[];
+  minKeywordMatchRatio: number;
+  forbiddenPatterns: RegExp[];
+  maxLengthChars?: number;
+  customValidator?: (output: unknown) => ValidationResult;
+}
+
+export interface ValidationResult {
+  passed: boolean;
+  score: number;
+  reason: string;
+}
+
+export interface MatchResult {
+  matched: boolean;
+  confidence: number;
+  details: string[];
+}
diff --git a/index.ts b/index.ts
new file mode 100644
index 0000000..5660754
--- /dev/null
+++ b/index.ts
@@ -0,0 +1,12 @@
+/**
+ * Public entry — re-exports the reusable assertion layer from `framework/`.
+ *
+ * This repo is a **proof-of-concept** (`"private": true`); it is not published
+ * to npm. The demo LLM agent lives under `examples/agent/`, not here.
+ *
+ * In a TypeScript project that vendors this repo, use:
+ *   import { AgentAssert, BehaviorContract } from 'agent-assert';
+ * (with `package.json` dependencies pointing at this path or Git URL.)
+ */
+
+export * from './framework/index.js';
diff --git a/package.json b/package.json
index 3946790..2e041b4 100644
--- a/package.json
+++ b/package.json
@@ -1,12 +1,17 @@
 {
   "name": "agent-assert",
   "version": "0.1.0",
-  "description": "A Playwright-based testing framework for agentic AI systems using MCP tool orchestration",
+  "private": true,
+  "description": "POC: Playwright tests + heuristic contracts for tool-calling LLM agents. Demo agent in examples/agent/. Not published to npm.",
   "author": "Biresh Patel",
   "license": "MIT",
   "type": "module",
+  "exports": {
+    ".": "./index.ts"
+  },
   "scripts": {
     "test": "npx playwright test",
+    "test:smoke": "npx playwright test tests/behavioral/intent-routing.spec.ts -g \"routes file-read intent to file-reader tool\"",
     "test:behavioral": "npx playwright test tests/behavioral/",
     "test:boundary": "npx playwright test tests/boundary/",
     "test:ollama": "LLM_PROVIDER=ollama npx playwright test",
diff --git a/playwright.config.ts b/playwright.config.ts
index 12043c8..d88d78c 100644
--- a/playwright.config.ts
+++ b/playwright.config.ts
@@ -31,6 +31,12 @@ import { applyLlmVarsFromDotEnv } from './tests/env-llm.js';
 /** Loads LLM + API keys from `.env` into `process.env` (see tests/env-llm.ts). */
 applyLlmVarsFromDotEnv();
 
+/** GitHub Actions and similar CI set `CI=true`. Local Ollama on CPU there often exceeds 120s per test. */
+const isCi = process.env.CI === 'true';
+
+/** Per-test cap for LLM + tool runs (`behavioral` project). CI gets a longer budget. */
+const behavioralTimeoutMs = isCi ? 300_000 : 120_000;
+
 /** Label for the HTML report header: provider + resolved model (aligned with agent defaults). */
 function htmlReportTitle(): string {
   const p = process.env.LLM_PROVIDER?.toLowerCase();
@@ -40,7 +46,7 @@ function htmlReportTitle(): string {
     provider === 'openai'
       ? 'gpt-4o'
       : provider === 'ollama'
-        ? 'llama3:latest'
+        ? 'llama3.2:3b'
         : 'claude-sonnet-4-20250514';
   const model = process.env.LLM_MODEL?.trim() || defaultModel;
   return `Playwright report · LLM: ${provider} · ${model}`;
@@ -53,7 +59,7 @@ export default defineConfig({
   // Match files ending in .spec.ts
   testMatch: '**/*.spec.ts',
 
-  // Default cap; behavioral project overrides to 60s (multi-step LLM runs often need it).
+  // Default cap; behavioral project overrides for LLM + tool runs (see projects below).
   timeout: 45_000,
 
   // NON-DETERMINISM STRATEGY: Retry each failed test once.
@@ -93,8 +99,8 @@ export default defineConfig({
     {
       name: 'behavioral',
       testDir: './tests/behavioral',
-      // Multi-tool / multi-round agent runs often exceed 30s (local LLMs, variance).
-      timeout: 60_000,
+      // Local: 2 min is usually enough. CI (Ollama on shared CPU): often 2–4+ min for first inference.
+      timeout: behavioralTimeoutMs,
     },
     {
       name: 'boundary',
diff --git a/tests/behavioral/output-contract.spec.ts b/tests/behavioral/output-contract.spec.ts
index 3c0bd7c..92b6ab3 100644
--- a/tests/behavioral/output-contract.spec.ts
+++ b/tests/behavioral/output-contract.spec.ts
@@ -3,10 +3,10 @@
  * 
  * PATTERN 2: BEHAVIOR CONTRACT VALIDATION
  * ─────────────────────────────────────────
- * Tests whether the agent's output satisfies SEMANTIC CONTRACTS
- * rather than exact string matches.
- * 
- * THIS IS THE CORE INNOVATION OF THE FRAMEWORK.
+ * Tests whether the agent's output satisfies BEHAVIOR CONTRACTS (heuristic:
+ * fields, keywords, forbidden patterns) rather than exact string matches.
+ *
+ * Core idea of this pattern in the framework:
  * 
  * THE PROBLEM WITH expect(output).toBe("..."):
  * Run 1: "The file contains 2 test failures: payment timeout and card error"
@@ -20,7 +20,7 @@
  * 1. The output has the right structure (required fields)
  * 2. The output expresses the right intent (keyword matching)
  * 3. The output doesn't contain red flags (forbidden patterns)
- * 4. The confidence score from NonDeterministicMatcher is above threshold
+ * 4. The heuristic confidence score from HeuristicContractMatcher is above threshold
  * 
  * TUNING GUIDANCE:
  * If tests are too flaky → lower minKeywordMatchRatio or add more keywords
@@ -30,7 +30,7 @@
 import { test, expect } from '@playwright/test';
 import { AgentAssert } from '../../framework/AgentAssert.js';
 import { BehaviorContract } from '../../framework/BehaviorContract.js';
-import { NonDeterministicMatcher } from '../../framework/NonDeterministicMatcher.js';
+import { HeuristicContractMatcher } from '../../framework/HeuristicContractMatcher.js';
 import {
   createTestAgent,
   registerAgentTraceForDiagnostics,
@@ -130,7 +130,7 @@ test.describe('Behavior Contract Validation', () => {
   /**
    * TEST 2D: Output contains intent-specific content (fuzzy match)
    * 
-   * Uses NonDeterministicMatcher.containsIntent directly for a
+   * Uses HeuristicContractMatcher.containsIntent directly for a
    * targeted check: does the output mention the payment failure
    * from the fixture file?
    * 
@@ -144,7 +144,7 @@ test.describe('Behavior Contract Validation', () => {
     registerAgentTraceForDiagnostics(testInfo, trace);
 
     // Check that the output mentions the payment failure
-    const result = NonDeterministicMatcher.containsIntent(
+    const result = HeuristicContractMatcher.containsIntent(
       trace.output,
       'payment gateway timeout failure',
       true,   // fuzzy matching enabled
@@ -163,7 +163,7 @@ test.describe('Behavior Contract Validation', () => {
    * TEST 2E: Output has valid AgentOutput structure
    * 
    * Structural check — the output must be a valid AgentOutput object
-   * with all required fields. This doesn't check semantics, just shape.
+   * with all required fields. This doesn't run contract/heuristic checks — only shape.
    * 
    * If this fails, the agent's system prompt isn't working —
    * Claude isn't producing structured JSON output.
diff --git a/tests/boundary/retry-behavior.spec.ts b/tests/boundary/retry-behavior.spec.ts
index a08cad9..f08d8da 100644
--- a/tests/boundary/retry-behavior.spec.ts
+++ b/tests/boundary/retry-behavior.spec.ts
@@ -37,7 +37,6 @@
 import { test, expect } from '@playwright/test';
 import { AgentAssert } from '../../framework/AgentAssert.js';
 import { BehaviorContract } from '../../framework/BehaviorContract.js';
-import { NonDeterministicMatcher } from '../../framework/NonDeterministicMatcher.js';
 import {
   createFailingApiAgent,
   createFailingFileReaderAgent,
@@ -46,7 +45,7 @@ import {
   teardownFixtureFiles,
 } from '../fixtures/setup.js';
 import { FAILURE_PROMPTS } from '../fixtures/prompts.js';
-import type { AgentTrace } from '../../agent/types.js';
+import type { AgentTrace } from '../../framework/types.js';
 
 /** True if the trace records a failed file-reader tool result (upstream failure actually happened). */
 function hasFileReaderToolFailure(trace: AgentTrace): boolean {
diff --git a/tests/env-llm.ts b/tests/env-llm.ts
index 3e5d545..60f2b0a 100644
--- a/tests/env-llm.ts
+++ b/tests/env-llm.ts
@@ -17,6 +17,9 @@ const DOTENV_KEYS = new Set([
   'LLM_MODEL',
   'OPENAI_API_KEY',
   'ANTHROPIC_API_KEY',
+  /** Optional: Ollama Cloud / some `*:cloud` model tags. */
+  'OLLAMA_API_KEY',
+  'OLLAMA_BASE_URL',
 ]);
 
 export function applyLlmVarsFromDotEnv(): void {
diff --git a/tests/fixtures/expected-schemas.ts b/tests/fixtures/expected-schemas.ts
index 9fca1ff..b9b6e43 100644
--- a/tests/fixtures/expected-schemas.ts
+++ b/tests/fixtures/expected-schemas.ts
@@ -4,11 +4,11 @@
  * JSON SCHEMA CONTRACTS
  * 
  * These schemas define the STRUCTURE of valid agent outputs.
- * They complement the BehaviorContracts (which check semantics).
+ * They complement the BehaviorContracts (which apply keyword/heuristic checks on content).
  * 
  * SCHEMA vs CONTRACT:
  * - Schema: "Does the output have the right fields and types?"
- * - Contract: "Does the output express the right intent?"
+ * - Contract: "Does the output pass keyword / pattern heuristics for intent?"
  * 
  * Both are needed. An output can have perfect structure but
  * wrong content (schema passes, contract fails). Or it can
diff --git a/tests/fixtures/setup.ts b/tests/fixtures/setup.ts
index a52f443..dfc9972 100644
--- a/tests/fixtures/setup.ts
+++ b/tests/fixtures/setup.ts
@@ -30,11 +30,11 @@ import * as fs from 'fs/promises';
 import * as path from 'path';
 import { test, type TestInfo } from '@playwright/test';
 import { applyLlmVarsFromDotEnv } from '../env-llm.js';
-import type { AgentTrace } from '../../agent/types.js';
-import { Agent, type AgentConfig } from '../../agent/agent.js';
-import { ToolRegistry } from '../../agent/tools/registry.js';
-import { createFileReaderTool } from '../../agent/tools/file-reader.js';
-import { createApiCallerTool, MockResponse } from '../../agent/tools/api-caller.js';
+import type { AgentTrace } from '../../framework/types.js';
+import { Agent, type AgentConfig } from '../../examples/agent/agent.js';
+import { ToolRegistry } from '../../examples/agent/tools/registry.js';
+import { createFileReaderTool } from '../../examples/agent/tools/file-reader.js';
+import { createApiCallerTool, MockResponse } from '../../examples/agent/tools/api-caller.js';
 
 applyLlmVarsFromDotEnv();
 
@@ -218,7 +218,7 @@ test.afterEach(async ({}, testInfo) => {
 /**
  * Optional env-driven overrides for which LLM to use in tests.
  * - `LLM_PROVIDER=openai` + `OPENAI_API_KEY` — OpenAI cloud (default model `gpt-4o` unless `LLM_MODEL`).
- * - `LLM_PROVIDER=ollama` — local Ollama (OpenAI-compatible API); default model `llama3:latest` unless `LLM_MODEL`.
+ * - `LLM_PROVIDER=ollama` — local Ollama (OpenAI-compatible API); default model `llama3.2:3b` unless `LLM_MODEL`.
  * - `LLM_PROVIDER=openai` + `OPENAI_BASE_URL` (e.g. `http://127.0.0.1:11434/v1`) — same as Ollama without renaming provider.
  * - Default when unset: Anthropic with `claude-sonnet-4-20250514` unless `LLM_MODEL` overrides.
  */
diff --git a/tsconfig.json b/tsconfig.json
index 527b7ce..6d5607a 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -7,6 +7,7 @@
     "strict": true,
     "outDir": "./dist",
     "rootDir": ".",
+    "baseUrl": ".",
     "declaration": true,
     "sourceMap": true,
     "resolveJsonModule": true,
@@ -14,9 +15,10 @@
     "forceConsistentCasingInFileNames": true,
     "paths": {
       "@framework/*": ["./framework/*"],
-      "@agent/*": ["./agent/*"]
+      "@agent/*": ["./examples/agent/*"],
+      "agent-assert": ["./index.ts"]
     }
   },
-  "include": ["agent/**/*.ts", "framework/**/*.ts", "tests/**/*.ts"],
+  "include": ["index.ts", "framework/**/*.ts", "examples/**/*.ts", "tests/**/*.ts"],
   "exclude": ["node_modules", "dist"]
 }