Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,11 @@ packages/
judges/
legacy/
harness-ai-sdk/
harness-flue/
harness-pi-ai/
apps/
demo-ai-sdk/
demo-flue/
demo-pi/
docs/
```
Expand All @@ -62,6 +64,10 @@ Owns:

Owns the AI SDK adapter into `HarnessRun`.

### `packages/harness-flue`

Owns the Flue framework adapter into `HarnessRun`.

### `packages/harness-pi-ai`

Owns the `pi-ai` adapter, wrapped tool runtime, and tool replay behavior.
Expand Down Expand Up @@ -106,6 +112,7 @@ Prefer targeted verification when possible.
- Root API changes: test `packages/vitest-evals/src/*.test.ts`
- Reporter changes: test `packages/vitest-evals/src/reporter.test.ts`
- AI SDK harness changes: test `packages/harness-ai-sdk/src/index.test.ts`
- Flue harness changes: test `packages/harness-flue/src/index.test.ts`
- `pi-ai` harness changes: test `packages/harness-pi-ai/src/index.test.ts`
- Legacy changes: test `packages/vitest-evals/src/legacy/...`
- Demo behavior changes: run `pnpm evals` or a filtered demo command
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@ Monorepo for the explicit-run `vitest-evals` shape:
- `packages/vitest-evals`: core suite API, judges, normalized harness/session
types, and reporter
- `packages/harness-ai-sdk`: `ai-sdk`-focused harness adapter
- `packages/harness-flue`: Flue framework harness adapter
- `packages/harness-openai-agents`: `@openai/agents`-focused harness adapter
- `packages/harness-pi-ai`: `pi-ai`-focused harness adapter with tool replay
- `packages/github-reporter`: GitHub Actions summary, annotation, and optional
Check Run publishing from Vitest JSON output
- `apps/demo-pi`: end-to-end Pi Mono demo evals with an app-local refund agent
- `apps/demo-ai-sdk`: end-to-end AI SDK demo evals with app-local refund tools
- `apps/demo-flue`: end-to-end Flue demo evals with app-local refund tools
- `apps/demo-openai-agents`: end-to-end OpenAI Agents demo evals with
app-local refund tools
- `apps/demo-pi`: end-to-end Pi Mono demo evals with an app-local refund agent

## Reading Guide

Expand All @@ -34,11 +36,13 @@ Monorepo for the explicit-run `vitest-evals` shape:
packages/
vitest-evals/
harness-ai-sdk/
harness-flue/
harness-openai-agents/
harness-pi-ai/
github-reporter/
apps/
demo-ai-sdk/
demo-flue/
demo-openai-agents/
demo-pi/
```
Expand Down
49 changes: 49 additions & 0 deletions apps/demo-flue/evals/refund.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import { anthropic } from "@ai-sdk/anthropic";
import { aiSdkJudgeHarness } from "@vitest-evals/harness-ai-sdk";
import { describeEval, FactualityJudge } from "vitest-evals";
import { assertRefundCase, refundHarness, type RefundCase } from "./shared";

// Factuality judge backed by an AI SDK judge harness; temperature 0 is passed
// through to that harness.
const factualityJudge = FactualityJudge({
  judgeHarness: aiSdkJudgeHarness({
    model: anthropic("claude-sonnet-4-5"),
    temperature: 0,
  }),
});

/** One refund scenario: the eval title, the user prompt, and the case metadata. */
type RefundScenario = {
  title: string;
  prompt: string;
  metadata: Omit<RefundCase, "input">;
};

const refundScenarios: RefundScenario[] = [
  {
    title: "approves refundable invoice",
    prompt: "Refund invoice inv_123",
    metadata: {
      expected:
        "Invoice inv_123 should be approved and refunded for the full 4200 cents.",
      expectedStatus: "approved",
      expectedTools: ["lookupInvoice", "createRefund"],
    },
  },
  {
    title: "denies non-refundable invoice",
    prompt: "Refund invoice inv_404",
    metadata: {
      expected:
        "Invoice inv_404 should be denied because it is not refundable.",
      expectedStatus: "denied",
      expectedTools: ["lookupInvoice"],
    },
  },
];

// The suite is skipped when no Anthropic API key is present in the environment.
describeEval(
  "demo flue refund agent",
  {
    skipIf: () => !process.env.ANTHROPIC_API_KEY,
    harness: refundHarness,
    judges: [factualityJudge],
    judgeThreshold: 0.6,
  },
  (it) => {
    // Scenarios are registered in declaration order, one eval per entry.
    for (const { title, prompt, metadata } of refundScenarios) {
      it(title, async ({ run }) => {
        const result = await run(prompt, { metadata });
        await assertRefundCase(result, metadata);
      });
    }
  },
);
156 changes: 156 additions & 0 deletions apps/demo-flue/evals/shared.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import { flueHarness } from "@vitest-evals/harness-flue";
import { Type } from "@flue/runtime";
import type { ToolDef } from "@flue/runtime";
import {
createFlueContext,
InMemorySessionStore,
bashFactoryToSessionEnv,
resolveModel,
} from "@flue/runtime/internal";
import * as v from "valibot";
import { expect } from "vitest";
import { type HarnessRun, toolCalls } from "vitest-evals";

/** Decision shape for a refund that was granted. */
type ApprovedRefund = {
  status: "approved";
  invoiceId: string;
  refundId: string;
  amount: number;
};

/** Decision shape for a refund that was refused, with the stated reason. */
type DeniedRefund = {
  status: "denied";
  invoiceId: string;
  reason: string;
};

/** Structured agent output, discriminated by `status`. */
type RefundDecision = ApprovedRefund | DeniedRefund;

/**
 * A single refund eval case: the user input plus the expectations that
 * `assertRefundCase` checks (`expectedStatus`, `expectedTools`) and an
 * optional free-form `expected` value.
 */
export type RefundCase = {
  input: string;
  expected?: unknown;
  expectedStatus: RefundDecision["status"];
  expectedTools: string[];
};

/** Model identifier used for the refund agent. */
export const REFUND_MODEL = "anthropic/claude-sonnet-4-6";

/** One row of the in-memory demo billing data; `amount` is in cents. */
type InvoiceRecord = {
  invoiceId: string;
  amount: number;
  refundable: boolean;
  customer: string;
};

const DEMO_INVOICES: InvoiceRecord[] = [
  { invoiceId: "inv_123", amount: 4200, refundable: true, customer: "Acme Co" },
  { invoiceId: "inv_404", amount: 1700, refundable: false, customer: "Globex" },
];

/** Demo invoices keyed by invoice id; each key is taken from the row itself. */
const INVOICES: Record<string, InvoiceRecord> = Object.fromEntries(
  DEMO_INVOICES.map((invoice) => [invoice.invoiceId, invoice] as const),
);

/**
 * Tools handed to the demo refund agent. Both tools return their result as a
 * JSON string.
 *
 * - `lookupInvoice` reads from the in-memory `INVOICES` table and throws when
 *   the id is unknown.
 * - `createRefund` echoes the requested invoice id and amount back as a
 *   submitted refund; it performs no lookup or refundability check itself.
 */
const refundTools: ToolDef[] = [
  {
    name: "lookupInvoice",
    description: "Look up invoice details inside demo billing.",
    parameters: Type.Object({
      invoiceId: Type.String({ description: "The invoice id to inspect." }),
    }),
    execute: async (args) => {
      const invoiceId = args.invoiceId as string;
      // The id comes from the model. Require an own property so inherited
      // keys such as "constructor", "toString" or "__proto__" are reported as
      // not found instead of being serialized as if they were invoices.
      const invoice = Object.prototype.hasOwnProperty.call(INVOICES, invoiceId)
        ? INVOICES[invoiceId]
        : undefined;
      if (!invoice) throw new Error(`Invoice ${args.invoiceId} not found`);
      return JSON.stringify(invoice);
    },
  },
  {
    name: "createRefund",
    description: "Create a refund for a refundable invoice.",
    parameters: Type.Object({
      invoiceId: Type.String({ description: "The invoice id to refund." }),
      amount: Type.Number({ description: "The amount to refund in cents." }),
    }),
    execute: async (args) => {
      return JSON.stringify({
        refundId: `rf_${args.invoiceId}`,
        amount: args.amount,
        status: "submitted",
      });
    },
  },
];

// Valibot schema passed as the `result` option of `session.prompt` below.
// NOTE(review): this is looser than `RefundDecision` — `refundId`, `amount`
// and `reason` are all optional here regardless of `status`, while the
// TypeScript union requires them per variant. Confirm whether that is
// intentional before relying on the cast in `refundHarness.output`.
const refundResultSchema = v.object({
status: v.picklist(["approved", "denied"]),
invoiceId: v.string(),
refundId: v.optional(v.string()),
amount: v.optional(v.number()),
reason: v.optional(v.string()),
});

/**
 * Harness that runs the demo refund agent through Flue.
 *
 * Each run builds its own Flue context with a fresh in-memory session store
 * and a random run id, subscribes the supplied `eventHandler` to context
 * events, initializes the agent with the refund tools, and prompts a session
 * for a result matching `refundResultSchema`.
 *
 * `output` returns the `data` field of the response and throws when the
 * response has no `data` property.
 */
export const refundHarness = flueHarness<string, RefundDecision>({
  name: "flue-refund-agent",
  model: REFUND_MODEL,
  run: async (input, { signal, eventHandler }) => {
    // The instructions travel in the prompt itself; the agent config's
    // systemPrompt below is left empty.
    const prompt = [
      "You are the demo refund operations agent.",
      "You must decide whether a refund should be approved for the invoice in the user's request.",
      "Always call lookupInvoice before making a decision.",
      "If the invoice is refundable, call createRefund with the full invoice amount.",
      "If the invoice is not refundable, do not call createRefund.",
      "",
      input,
    ].join("\n");

    const sessionStore = new InMemorySessionStore();
    const runId = crypto.randomUUID();

    const flueContext = createFlueContext({
      id: `eval-${runId}`,
      runId,
      payload: input,
      env: process.env as Record<string, any>,
      agentConfig: {
        systemPrompt: "",
        skills: {},
        roles: {},
        model: resolveModel(REFUND_MODEL),
        resolveModel,
      },
      // just-bash is imported lazily, only when a default env is requested.
      createDefaultEnv: async () => {
        const justBash = await import("just-bash");
        return bashFactoryToSessionEnv(() => new justBash.Bash());
      },
      defaultStore: sessionStore,
    });

    // Subscribe before init so events emitted from init onward reach the handler.
    flueContext.subscribeEvent(eventHandler);

    const agent = await flueContext.init({
      model: REFUND_MODEL,
      tools: refundTools,
    });
    const session = await agent.session();

    return await session.prompt(prompt, {
      result: refundResultSchema,
      signal,
    });
  },
  output: (response) => {
    if (!("data" in response)) {
      throw new Error("Expected structured result from Flue agent");
    }
    return response.data as RefundDecision;
  },
});

/**
 * Asserts that a harness run matches a refund case.
 *
 * Checks, in order: the output's `status`, the exact ordered list of tool
 * names called during the session, and that usage reports the "anthropic"
 * provider, a model name containing "claude", and a positive token total.
 *
 * @param run - The completed harness run to inspect.
 * @param expected - Expected decision status and ordered tool names.
 */
export async function assertRefundCase(
  run: HarnessRun,
  expected: Pick<RefundCase, "expectedStatus" | "expectedTools">,
) {
  const { expectedStatus, expectedTools } = expected;

  expect(run.output).toMatchObject({ status: expectedStatus });

  const calledToolNames = toolCalls(run.session).map(({ name }) => name);
  expect(calledToolNames).toEqual(expectedTools);

  const { usage } = run;
  expect(usage.provider).toBe("anthropic");
  expect(usage.model).toContain("claude");
  expect(usage.totalTokens).toBeGreaterThan(0);
}
19 changes: 19 additions & 0 deletions apps/demo-flue/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"name": "@demo/demo-flue",
"private": true,
"version": "0.1.0",
"scripts": {
"evals": "node ./scripts/run-evals.mjs",
"evals:info": "node ./scripts/run-evals.mjs --info",
"evals:verbose": "node ./scripts/run-evals.mjs --info"
},
"dependencies": {
"@flue/runtime": "^0.7.0",
"@vitest-evals/harness-ai-sdk": "workspace:*",
"@vitest-evals/harness-flue": "workspace:*",
"ai": "^6.0.141",
"@ai-sdk/anthropic": "^3.0.71",
"valibot": "^1.1.0",
"vitest-evals": "workspace:*"
}
}
48 changes: 48 additions & 0 deletions apps/demo-flue/scripts/run-evals.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { spawnSync } from "node:child_process";
import { dirname, resolve } from "node:path";
import { fileURLToPath } from "node:url";
import { createEvalEnv, parseEvalCliArgs } from "../../../scripts/eval-cli.mjs";

// Workspace root, resolved three directories above this script's folder.
const WORKSPACE_ROOT = resolve(
  dirname(fileURLToPath(import.meta.url)),
  "../../..",
);

// CLI parsing and env construction are delegated to the shared eval-cli
// helpers; only their returned values are used here.
const { failMode, forwardedArgs, reportLevel } = parseEvalCliArgs(
  process.argv.slice(2),
);
const env = createEvalEnv(process.env, reportLevel, { failMode });

// The first forwarded argument that does not start with "-" is taken as the
// eval target and removed from the forwarded list; otherwise the demo's
// refund eval file is used.
const explicitTargetIndex = forwardedArgs.findIndex(
  (arg) => !arg.startsWith("-"),
);
const target =
  explicitTargetIndex >= 0
    ? forwardedArgs.splice(explicitTargetIndex, 1)[0]
    : "apps/demo-flue/evals/refund.eval.ts";

// `pnpm exec dotenv -e .env -e .env.local -- vitest run <target> ...`,
// with any remaining forwarded arguments appended last.
const command = [
  "exec",
  "dotenv",
  "-e",
  ".env",
  "-e",
  ".env.local",
  "--",
  "vitest",
  "run",
  target,
  "--config",
  "vitest.config.ts",
  "--reporter",
  "packages/vitest-evals/src/reporter.ts",
  ...forwardedArgs,
];

const result = spawnSync("pnpm", command, {
  cwd: WORKSPACE_ROOT,
  env,
  stdio: "inherit",
});

// `status` is null when the child could not be spawned or was killed by a
// signal. Say why before exiting so the failure is not a silent exit code 1.
if (result.error) {
  console.error(`Failed to run pnpm: ${result.error.message}`);
} else if (result.signal) {
  console.error(`Eval run was terminated by signal ${result.signal}`);
}

process.exit(result.status ?? 1);
8 changes: 8 additions & 0 deletions apps/demo-flue/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"module": "esnext",
"moduleResolution": "bundler"
},
"include": ["**/*.ts"]
}
4 changes: 4 additions & 0 deletions packages/docs/src/content/docs/docs/harnesses.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ full control over normalized run data.
<strong>AI SDK</strong>
<span>Use when your app calls <code>generateText</code>, <code>streamText</code>, or an AI SDK wrapper.</span>
</a>
<a class="api-link-card" href="/docs/harnesses/flue/">
<strong>Flue</strong>
<span>Use when your app builds agents with the Flue agent harness framework.</span>
</a>
<a class="api-link-card" href="/docs/harnesses/openai-agents/">
<strong>OpenAI Agents</strong>
<span>Use when your app owns an <code>Agent</code> and runs it with a <code>Runner</code>.</span>
Expand Down
Loading
Loading