getsentry · sergical · May 24, 2026 · May 24, 2026 · May 24, 2026 · May 24, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -39,9 +39,11 @@ packages/
       judges/
       legacy/
   harness-ai-sdk/
+  harness-flue/
   harness-pi-ai/
 apps/
   demo-ai-sdk/
+  demo-flue/
   demo-pi/
 docs/
 ```
@@ -62,6 +64,10 @@ Owns:
 
 Owns the AI SDK adapter into `HarnessRun`.
 
+### `packages/harness-flue`
+
+Owns the Flue framework adapter into `HarnessRun`.
+
 ### `packages/harness-pi-ai`
 
 Owns the `pi-ai` adapter, wrapped tool runtime, and tool replay behavior.
@@ -106,6 +112,7 @@ Prefer targeted verification when possible.
 - Root API changes: test `packages/vitest-evals/src/*.test.ts`
 - Reporter changes: test `packages/vitest-evals/src/reporter.test.ts`
 - AI SDK harness changes: test `packages/harness-ai-sdk/src/index.test.ts`
+- Flue harness changes: test `packages/harness-flue/src/index.test.ts`
 - `pi-ai` harness changes: test `packages/harness-pi-ai/src/index.test.ts`
 - Legacy changes: test `packages/vitest-evals/src/legacy/...`
 - Demo behavior changes: run `pnpm evals` or a filtered demo command

diff --git a/README.md b/README.md
@@ -5,14 +5,16 @@ Monorepo for the explicit-run `vitest-evals` shape:
 - `packages/vitest-evals`: core suite API, judges, normalized harness/session
   types, and reporter
 - `packages/harness-ai-sdk`: `ai-sdk`-focused harness adapter
+- `packages/harness-flue`: Flue framework harness adapter
 - `packages/harness-openai-agents`: `@openai/agents`-focused harness adapter
 - `packages/harness-pi-ai`: `pi-ai`-focused harness adapter with tool replay
 - `packages/github-reporter`: GitHub Actions summary, annotation, and optional
   Check Run publishing from Vitest JSON output
-- `apps/demo-pi`: end-to-end Pi Mono demo evals with an app-local refund agent
 - `apps/demo-ai-sdk`: end-to-end AI SDK demo evals with app-local refund tools
+- `apps/demo-flue`: end-to-end Flue demo evals with app-local refund tools
 - `apps/demo-openai-agents`: end-to-end OpenAI Agents demo evals with
   app-local refund tools
+- `apps/demo-pi`: end-to-end Pi Mono demo evals with an app-local refund agent
 
 ## Reading Guide
 
@@ -34,11 +36,13 @@ Monorepo for the explicit-run `vitest-evals` shape:
 packages/
   vitest-evals/
   harness-ai-sdk/
+  harness-flue/
   harness-openai-agents/
   harness-pi-ai/
   github-reporter/
 apps/
   demo-ai-sdk/
+  demo-flue/
   demo-openai-agents/
   demo-pi/
 ```

diff --git a/apps/demo-flue/evals/refund.eval.ts b/apps/demo-flue/evals/refund.eval.ts
@@ -0,0 +1,49 @@
+import { anthropic } from "@ai-sdk/anthropic";
+import { aiSdkJudgeHarness } from "@vitest-evals/harness-ai-sdk";
+import { describeEval, FactualityJudge } from "vitest-evals";
+import { assertRefundCase, refundHarness, type RefundCase } from "./shared";
+
+// LLM-judge wiring: an AI SDK-backed judge harness (temperature 0) is handed to
+// FactualityJudge, which is registered as the suite's only judge below.
+// NOTE(review): the judge runs on "claude-sonnet-4-5" while the agent under
+// test uses REFUND_MODEL ("anthropic/claude-sonnet-4-6") from ./shared —
+// confirm the version difference is intentional.
+const judgeHarness = aiSdkJudgeHarness({
+  model: anthropic("claude-sonnet-4-5"),
+  temperature: 0,
+});
+const factualityJudge = FactualityJudge({ judgeHarness });
+
+// End-to-end refund evals for the Flue demo agent. Each case sends one prompt
+// through `refundHarness`, then checks the structured decision, the tool-call
+// sequence, and usage data via `assertRefundCase`.
+describeEval(
+  "demo flue refund agent",
+  {
+    // Skip the whole suite when no Anthropic key is configured.
+    skipIf: () => !process.env.ANTHROPIC_API_KEY,
+    harness: refundHarness,
+    judges: [factualityJudge],
+    // presumably the minimum judge score required to pass — confirm against
+    // the vitest-evals `describeEval` options.
+    judgeThreshold: 0.6,
+  },
+  (it) => {
+    it("approves refundable invoice", async ({ run }) => {
+      // The same metadata object is attached to the run and reused for the
+      // local assertions (`expected` is presumably what the judge reads —
+      // TODO confirm).
+      const metadata: Omit<RefundCase, "input"> = {
+        expected:
+          "Invoice inv_123 should be approved and refunded for the full 4200 cents.",
+        expectedStatus: "approved",
+        expectedTools: ["lookupInvoice", "createRefund"],
+      };
+
+      await assertRefundCase(
+        await run("Refund invoice inv_123", { metadata }),
+        metadata,
+      );
+    });
+
+    it("denies non-refundable invoice", async ({ run }) => {
+      // A denial must stop after the lookup: createRefund is not expected.
+      const metadata: Omit<RefundCase, "input"> = {
+        expected:
+          "Invoice inv_404 should be denied because it is not refundable.",
+        expectedStatus: "denied",
+        expectedTools: ["lookupInvoice"],
+      };
+
+      await assertRefundCase(
+        await run("Refund invoice inv_404", { metadata }),
+        metadata,
+      );
+    });
+  },
+);
diff --git a/apps/demo-flue/evals/shared.ts b/apps/demo-flue/evals/shared.ts
@@ -0,0 +1,156 @@
+import { flueHarness } from "@vitest-evals/harness-flue";
+import { Type } from "@flue/runtime";
+import type { ToolDef } from "@flue/runtime";
+import {
+  createFlueContext,
+  InMemorySessionStore,
+  bashFactoryToSessionEnv,
+  resolveModel,
+} from "@flue/runtime/internal";
+import * as v from "valibot";
+import { expect } from "vitest";
+import { type HarnessRun, toolCalls } from "vitest-evals";
+
+/**
+ * Structured decision produced by the refund agent, discriminated on `status`:
+ * an approval carries the refund id and amount, a denial carries a reason.
+ */
+type RefundDecision =
+  | {
+      status: "approved";
+      invoiceId: string;
+      refundId: string;
+      amount: number;
+    }
+  | {
+      status: "denied";
+      invoiceId: string;
+      reason: string;
+    };
+
+/**
+ * One eval case: the prompt input plus the expectations checked for it.
+ * `expectedStatus` and `expectedTools` are asserted by `assertRefundCase`;
+ * `expectedTools` is compared against the called tool names with `toEqual`,
+ * so order and length both matter.
+ */
+export type RefundCase = {
+  input: string;
+  expected?: unknown;
+  expectedStatus: RefundDecision["status"];
+  expectedTools: string[];
+};
+
+/** Model identifier used for the harness, the Flue agent config, and `ctx.init`. */
+export const REFUND_MODEL = "anthropic/claude-sonnet-4-6";
+
+// In-memory billing fixture keyed by invoice id and read by the lookupInvoice
+// tool. inv_123 is refundable (the approval case); inv_404 is not (the denial
+// case).
+const INVOICES: Record<
+  string,
+  { invoiceId: string; amount: number; refundable: boolean; customer: string }
+> = {
+  inv_123: {
+    invoiceId: "inv_123",
+    amount: 4200,
+    refundable: true,
+    customer: "Acme Co",
+  },
+  inv_404: {
+    invoiceId: "inv_404",
+    amount: 1700,
+    refundable: false,
+    customer: "Globex",
+  },
+};
+
+/**
+ * Tools exposed to the refund agent.
+ *
+ * - `lookupInvoice` returns the JSON-serialized fixture entry for an invoice id
+ *   and throws when the id is unknown.
+ * - `createRefund` does not consult the fixture; it returns a synthetic
+ *   submitted refund (`rf_<invoiceId>`) echoing the requested amount.
+ */
+const refundTools: ToolDef[] = [
+  {
+    name: "lookupInvoice",
+    description: "Look up invoice details inside demo billing.",
+    parameters: Type.Object({
+      invoiceId: Type.String({ description: "The invoice id to inspect." }),
+    }),
+    execute: async (args) => {
+      const invoiceId = args.invoiceId as string;
+      // Own-property check: a plain bracket lookup also resolves inherited keys
+      // such as "constructor" or "toString", which are truthy and would skip
+      // the not-found error (and then serialize to `undefined`).
+      const invoice = Object.prototype.hasOwnProperty.call(INVOICES, invoiceId)
+        ? INVOICES[invoiceId]
+        : undefined;
+      if (!invoice) throw new Error(`Invoice ${invoiceId} not found`);
+      return JSON.stringify(invoice);
+    },
+  },
+  {
+    name: "createRefund",
+    description: "Create a refund for a refundable invoice.",
+    parameters: Type.Object({
+      invoiceId: Type.String({ description: "The invoice id to refund." }),
+      amount: Type.Number({ description: "The amount to refund in cents." }),
+    }),
+    execute: async (args) => {
+      return JSON.stringify({
+        refundId: `rf_${args.invoiceId}`,
+        amount: args.amount,
+        status: "submitted",
+      });
+    },
+  },
+];
+
+// Valibot schema passed to `session.prompt` as the requested result shape.
+// NOTE(review): this is looser than `RefundDecision` — refundId, amount and
+// reason are optional regardless of status — while the harness `output` casts
+// the returned data to `RefundDecision` without a further check.
+const refundResultSchema = v.object({
+  status: v.picklist(["approved", "denied"]),
+  invoiceId: v.string(),
+  refundId: v.optional(v.string()),
+  amount: v.optional(v.number()),
+  reason: v.optional(v.string()),
+});
+
+/**
+ * Flue-backed refund agent harness used by the demo evals.
+ *
+ * `run` builds a new Flue context and in-memory session store on every call,
+ * initializes it with the refund tools, and sends a single prompt requesting a
+ * result that matches `refundResultSchema`. `output` extracts the structured
+ * `data` field from the prompt response and throws when it is absent.
+ */
+export const refundHarness = flueHarness<string, RefundDecision>({
+  name: "flue-refund-agent",
+  model: REFUND_MODEL,
+  run: async (input, { signal, eventHandler }) => {
+    // New store and run id per invocation (presumably so eval cases do not
+    // share session state — confirm against the Flue runtime).
+    const store = new InMemorySessionStore();
+    const runId = crypto.randomUUID();
+
+    const ctx = createFlueContext({
+      id: `eval-${runId}`,
+      runId,
+      payload: input,
+      env: process.env as Record<string, any>,
+      agentConfig: {
+        // Left empty: the agent instructions are sent inline with the user
+        // input in the prompt below.
+        systemPrompt: "",
+        skills: {},
+        roles: {},
+        model: resolveModel(REFUND_MODEL),
+        resolveModel,
+      },
+      createDefaultEnv: async () => {
+        // Loaded lazily, only when this factory is invoked.
+        // NOTE(review): "just-bash" is not listed in apps/demo-flue/package.json
+        // — presumably it resolves through @flue/runtime; confirm it is
+        // resolvable from this app under pnpm.
+        const { Bash } = await import("just-bash");
+        return bashFactoryToSessionEnv(() => new Bash());
+      },
+      defaultStore: store,
+    });
+
+    // Attach the harness-provided handler before init/prompt (presumably so no
+    // events are missed — TODO confirm).
+    ctx.subscribeEvent(eventHandler);
+
+    const harness = await ctx.init({
+      model: REFUND_MODEL,
+      tools: refundTools,
+    });
+    const session = await harness.session();
+
+    // Instructions and the user's request go out as one newline-joined prompt;
+    // `signal` is forwarded to the prompt call.
+    return await session.prompt(
+      [
+        "You are the demo refund operations agent.",
+        "You must decide whether a refund should be approved for the invoice in the user's request.",
+        "Always call lookupInvoice before making a decision.",
+        "If the invoice is refundable, call createRefund with the full invoice amount.",
+        "If the invoice is not refundable, do not call createRefund.",
+        "",
+        input,
+      ].join("\n"),
+      { result: refundResultSchema, signal },
+    );
+  },
+  output: (response) => {
+    // The cast is unchecked: only the presence of `data` is verified here.
+    if ("data" in response) return response.data as RefundDecision;
+    throw new Error("Expected structured result from Flue agent");
+  },
+});
+
+/**
+ * Shared assertions for one refund eval run.
+ *
+ * Verifies that the run output carries the expected decision status, that the
+ * tools were called in exactly the expected sequence, and that usage data
+ * reports the Anthropic provider, a Claude model, and a positive token total.
+ *
+ * @param run - The completed harness run to inspect.
+ * @param expected - The expected status and ordered tool names for the case.
+ */
+export async function assertRefundCase(
+  run: HarnessRun,
+  expected: Pick<RefundCase, "expectedStatus" | "expectedTools">,
+) {
+  const { expectedStatus, expectedTools } = expected;
+  const { output, session, usage } = run;
+
+  // Decision status.
+  expect(output).toMatchObject({ status: expectedStatus });
+
+  // Tool-call sequence (order-sensitive).
+  const calledToolNames = toolCalls(session).map(({ name }) => name);
+  expect(calledToolNames).toEqual(expectedTools);
+
+  // Usage data.
+  expect(usage.provider).toBe("anthropic");
+  expect(usage.model).toContain("claude");
+  expect(usage.totalTokens).toBeGreaterThan(0);
+}
diff --git a/apps/demo-flue/package.json b/apps/demo-flue/package.json
@@ -0,0 +1,19 @@
+{
+  "name": "@demo/demo-flue",
+  "private": true,
+  "version": "0.1.0",
+  "scripts": {
+    "evals": "node ./scripts/run-evals.mjs",
+    "evals:info": "node ./scripts/run-evals.mjs --info",
+    "evals:verbose": "node ./scripts/run-evals.mjs --info"
+  },
+  "dependencies": {
+    "@flue/runtime": "^0.7.0",
+    "@vitest-evals/harness-ai-sdk": "workspace:*",
+    "@vitest-evals/harness-flue": "workspace:*",
+    "ai": "^6.0.141",
+    "@ai-sdk/anthropic": "^3.0.71",
+    "valibot": "^1.1.0",
+    "vitest-evals": "workspace:*"
+  }
+}
diff --git a/apps/demo-flue/scripts/run-evals.mjs b/apps/demo-flue/scripts/run-evals.mjs
@@ -0,0 +1,48 @@
+import { spawnSync } from "node:child_process";
+import { dirname, resolve } from "node:path";
+import { fileURLToPath } from "node:url";
+import { createEvalEnv, parseEvalCliArgs } from "../../../scripts/eval-cli.mjs";
+
+const WORKSPACE_ROOT = resolve(
+  dirname(fileURLToPath(import.meta.url)),
+  "../../..",
+);
+
+const { failMode, forwardedArgs, reportLevel } = parseEvalCliArgs(
+  process.argv.slice(2),
+);
+const env = createEvalEnv(process.env, reportLevel, { failMode });
+
+const explicitTargetIndex = forwardedArgs.findIndex(
+  (arg) => !arg.startsWith("-"),
+);
+const target =
+  explicitTargetIndex >= 0
+    ? forwardedArgs.splice(explicitTargetIndex, 1)[0]
+    : "apps/demo-flue/evals/refund.eval.ts";
+
+const command = [
+  "exec",
+  "dotenv",
+  "-e",
+  ".env",
+  "-e",
+  ".env.local",
+  "--",
+  "vitest",
+  "run",
+  target,
+  "--config",
+  "vitest.config.ts",
+  "--reporter",
+  "packages/vitest-evals/src/reporter.ts",
+  ...forwardedArgs,
+];
+
+const result = spawnSync("pnpm", command, {
+  cwd: WORKSPACE_ROOT,
+  env,
+  stdio: "inherit",
+});
+
+process.exit(result.status ?? 1);
diff --git a/apps/demo-flue/tsconfig.json b/apps/demo-flue/tsconfig.json
@@ -0,0 +1,8 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": {
+    "module": "esnext",
+    "moduleResolution": "bundler"
+  },
+  "include": ["**/*.ts"]
+}
diff --git a/packages/docs/src/content/docs/docs/harnesses.mdx b/packages/docs/src/content/docs/docs/harnesses.mdx
@@ -20,6 +20,10 @@ full control over normalized run data.
     <strong>AI SDK</strong>
     <span>Use when your app calls <code>generateText</code>, <code>streamText</code>, or an AI SDK wrapper.</span>
   </a>
+  <a class="api-link-card" href="/docs/harnesses/flue/">
+    <strong>Flue</strong>
+    <span>Use when your app builds agents with the Flue agent harness framework.</span>
+  </a>
   <a class="api-link-card" href="/docs/harnesses/openai-agents/">
     <strong>OpenAI Agents</strong>
     <span>Use when your app owns an <code>Agent</code> and runs it with a <code>Runner</code>.</span>