diff --git a/src/ai-sdk-integration.test.ts b/src/ai-sdk-integration.test.ts
index 82443ba..d1fa3b2 100644
--- a/src/ai-sdk-integration.test.ts
+++ b/src/ai-sdk-integration.test.ts
@@ -72,7 +72,12 @@ describeEval("@ai/sdk ToolCallScorer", {
     });
 
     return {
-      result: text,
+      transcript: [
+        {
+          role: "assistant",
+          parts: [{ type: "text", text }],
+        },
+      ],
       toolCalls: steps
         .flatMap((step) => step.toolCalls)
         .map((call) => ({
@@ -112,10 +117,7 @@ describeEval("@ai/sdk StructuredOutputScorer", {
       }),
     });
 
-    return {
-      result: JSON.stringify(object),
-      toolCalls: [],
-    };
+    return JSON.stringify(object);
   },
   scorers: [
     StructuredOutputScorer({
@@ -148,7 +150,12 @@ describeEval("@ai/sdk ToolCallScorer (No stopWhen)", {
     });
 
     return {
-      result: text,
+      transcript: [
+        {
+          role: "assistant",
+          parts: [{ type: "text", text }],
+        },
+      ],
       toolCalls: steps
         .flatMap((step) => step.toolCalls)
         .map((call) => ({
diff --git a/src/evaluate/index.test.ts b/src/evaluate/index.test.ts
index bfd954c..def56db 100644
--- a/src/evaluate/index.test.ts
+++ b/src/evaluate/index.test.ts
@@ -152,8 +152,95 @@ describe("evaluate", () => {
     });
 
     const call = mockGenerateObject.mock.calls[0][0];
-    expect(call.prompt).toContain("the task output");
-    expect(call.prompt).toContain("must mention specific details");
+    expect(call.messages[0].content[0].text).toContain("[ASSISTANT]");
+    expect(call.messages[0].content[0].text).toContain("the task output");
+    expect(call.messages[1].content).toContain("must mention specific details");
+  });
+
+  test("passes multimodal transcripts to the judge", async () => {
+    mockGenerateObject.mockResolvedValueOnce({
+      object: { answer: "A", rationale: "Handled the transcript correctly" },
+    } as any);
+    // The task returns a user turn (text + image) followed by the assistant's answer.
+    const ctx = makeContext();
+    await _evaluate(ctx, {
+      task: async () => ({
+        transcript: [
+          {
+            role: "user",
+            parts: [
+              { type: "text", text: "What is shown here?" },
+              {
+                type: "image",
+                image: "data:image/png;base64,abc123",
+                mediaType: "image/png",
+              },
+            ],
+          },
+          {
+            role: "assistant",
+            parts: [{ type: "text", text: "It is a cat." }],
+          },
+        ],
+      }),
+      criteria: "The answer should identify the subject of the image",
+      threshold: 1,
+    });
+    // The first judge message must carry role-labelled text plus the original image part.
+    const call = mockGenerateObject.mock.calls[0][0];
+    const transcriptText = call.messages[0].content
+      .filter((part: any) => part.type === "text")
+      .map((part: any) => part.text)
+      .join("");
+    expect(transcriptText).toContain("[USER]\nWhat is shown here?");
+    expect(transcriptText).toContain("[image image/png]");
+    expect(transcriptText).toContain("[ASSISTANT]\nIt is a cat.");
+    expect(call.messages[0].content).toContainEqual({
+      type: "image",
+      image: "data:image/png;base64,abc123",
+      mediaType: "image/png",
+    });
+  });
+
+  test("does not pass tool metadata to the judge by default", async () => {
+    mockGenerateObject.mockResolvedValueOnce({
+      object: { answer: "A", rationale: "Focused on the visible transcript" },
+    } as any);
+    // Tool calls are returned next to the transcript, not as transcript parts.
+    const ctx = makeContext();
+    await _evaluate(ctx, {
+      task: async () => ({
+        transcript: [
+          {
+            role: "user",
+            parts: [{ type: "text", text: "What is the weather?" }],
+          },
+          {
+            role: "assistant",
+            parts: [{ type: "text", text: "It is 72F in Seattle." }],
+          },
+        ],
+        toolCalls: [
+          {
+            name: "getWeather",
+            arguments: { location: "Seattle" },
+            result: { temperature: 72 },
+          },
+        ],
+      }),
+      criteria: "The answer should report the weather to the user",
+      threshold: 1,
+    });
+    // Only the transcript text should reach the judge: no tool-call/tool-result markers.
+    const call = mockGenerateObject.mock.calls[0][0];
+    const transcriptText = call.messages[0].content
+      .filter((part: any) => part.type === "text")
+      .map((part: any) => part.text)
+      .join("");
+    expect(transcriptText).toContain("[USER]\nWhat is the weather?");
+    expect(transcriptText).toContain("[ASSISTANT]\nIt is 72F in Seattle.");
+    expect(transcriptText).not.toContain("tool-call");
+    expect(transcriptText).not.toContain("tool-result");
   });
 
   test("maps all answer choices to correct scores", async () => {
diff --git a/src/evaluate/index.ts b/src/evaluate/index.ts
index b48088e..0c8244f 100644
--- a/src/evaluate/index.ts
+++ b/src/evaluate/index.ts
@@ -1,7 +1,12 @@
 import { generateObject } from "ai";
 import { z } from "zod";
 import { assert, test } from "vitest";
-import { wrapText } from "../wrapText";
+import {
+  type Transcript,
+  formatEvalValue,
+  normalizeEvaluateOutput,
+  toJudgeUserMessage,
+} from "../messages";
 
 type LanguageModel = Parameters<typeof generateObject>[0]["model"];
 
@@ -13,15 +18,11 @@ export function configure(opts: { model: LanguageModel }) {
 
 const EVAL_SYSTEM = `You are assessing a submitted output based on a given criterion. Ignore differences in style, grammar, punctuation, or length. Focus only on whether the criterion is met.`;
 
-const EVAL_PROMPT = (output: string, criteria: string) => `<submission>
-${output}
-</submission>
-
-<criteria>
+const EVAL_PROMPT = (criteria: string) => `<criteria>
 ${criteria}
 </criteria>
 
-Does the submission meet the criteria? Select one option:
+Does the conversation transcript meet the criteria? Select one option:
 (A) The criteria is fully met with no issues
 (B) The criteria is mostly met with minor gaps
 (C) The criteria is partially met with notable gaps
@@ -37,7 +38,7 @@ const CHOICE_SCORES: Record<string, number> = {
 };
 
 interface EvaluateOptions {
-  task: () => Promise<string>;
+  task: () => Promise<string | { transcript: Transcript }>;
   criteria: string;
   threshold?: number;
 }
@@ -57,9 +58,11 @@ export async function _evaluate(
     );
   }
 
-  let output: string;
+  let taskOutput: string | { transcript: Transcript };
+  let evaluationOutput: ReturnType<typeof normalizeEvaluateOutput>;
   try {
-    output = await opts.task();
+    taskOutput = await opts.task();
+    evaluationOutput = normalizeEvaluateOutput(taskOutput);
   } catch (error) {
     const errorMessage = error instanceof Error ? error.message : String(error);
     ctx.task.meta.eval = {
@@ -84,7 +87,13 @@ export async function _evaluate(
         rationale: z.string(),
       }),
       system: EVAL_SYSTEM,
-      prompt: EVAL_PROMPT(output, opts.criteria),
+      messages: [
+        toJudgeUserMessage(evaluationOutput.transcript),
+        {
+          role: "user",
+          content: EVAL_PROMPT(opts.criteria),
+        },
+      ],
     }));
   } catch (error) {
     const errorMessage = error instanceof Error ? error.message : String(error);
@@ -118,7 +127,9 @@ export async function _evaluate(
   if (score < threshold) {
     assert(
       false,
-      `Score: ${score} (${object.answer}) below threshold: ${threshold}\n\n## Output:\n${wrapText(output)}\n\n## Rationale:\n${wrapText(object.rationale)}`,
+      `Score: ${score} (${object.answer}) below threshold: ${threshold}\n\n## Output:\n${formatEvalValue(
+        typeof taskOutput === "string" ? taskOutput : taskOutput.transcript,
+      )}\n\n## Rationale:\n${formatEvalValue(object.rationale)}`,
     );
   }
 }
diff --git a/src/formatScores.test.ts b/src/formatScores.test.ts
index 81d69fc..a3ebd4a 100644
--- a/src/formatScores.test.ts
+++ b/src/formatScores.test.ts
@@ -72,4 +72,47 @@ describe("formatScores", () => {
       # Scorer B [0.8]"
     `);
   });
+
+  it("should format transcript outputs", () => {
+    const scores = [
+      {
+        name: "Scorer A",
+        score: 0.2,
+        metadata: {
+          rationale: "Image description was incorrect",
+          output: [
+            {
+              role: "assistant",
+              parts: [
+                { type: "text", text: "A dog on a sofa." },
+                {
+                  type: "image",
+                  image: "data:image/png;base64,abc",
+                  mediaType: "image/png",
+                },
+              ],
+            },
+          ],
+        },
+      },
+    ];
+    // Transcript-shaped `metadata.output` should render as a role heading plus a media placeholder.
+    const result = formatScores(scores);
+
+    expect(result).toMatchInlineSnapshot(`
+      "# Scorer A [0.2]
+
+      ## Rationale
+
+      Image description was incorrect
+
+      ## Response
+
+      ## assistant
+
+      A dog on a sofa.
+
+      [image image/png]"
+    `);
+  });
 });
diff --git a/src/index.ts b/src/index.ts
index d77425d..b8c85ed 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -7,44 +7,27 @@ import {
   test,
 } from "vitest";
 import "vitest";
+import {
+  type EvalDataInput,
+  type Transcript,
+  type TranscriptMessage,
+  type TranscriptPart,
+  type TaskInput,
+  type TaskResult,
+  type ToolCall,
+  formatEvalValue,
+  getDefaultTestName,
+  getTaskInput,
+  normalizeScorerPayload,
+} from "./messages";
 import { wrapText } from "./wrapText";
 
-/**
- * Represents a tool/function call made during task execution.
- * Supports various LLM provider formats and use cases.
- */
-export type ToolCall = {
-  // Core fields (required for basic usage)
-  name: string;
-  arguments?: Record<string, any>;
-
-  // Additional metadata
-  [key: string]: any; // Allow provider-specific fields
-};
-
-export type TaskResult = {
-  result: string;
-  toolCalls?: ToolCall[];
-};
-
 /**
  * Task function that processes an input and returns either a string result
- * or a TaskResult object containing the result and any tool calls made.
- *
- * @param input - The input string to process
- * @returns Promise resolving to either a string or TaskResult object
- *
- * @example
- * // Simple tasks can just return a string
- * const simpleTask: TaskFn = async (input) => "The answer is 42";
- *
- * // Tasks that use tools should return TaskResult
- * const taskWithTools: TaskFn = async (input) => ({
- *   result: "The answer is 42",
- *   toolCalls: [{ name: "calculate", arguments: { expr: "6*7" }, result: 42 }]
- * });
+ * or a TaskResult object containing a multimodal response transcript and any
+ * tool calls made.
  */
-export type TaskFn = (input: string) => Promise<string | TaskResult>;
+export type TaskFn = (input: TaskInput) => Promise<string | TaskResult>;
 
 export type Score = {
   score: number | null;
@@ -57,6 +40,7 @@ export type Score = {
 export interface BaseScorerOptions {
   input: string;
   output: string;
+  transcript: Transcript;
   toolCalls?: ToolCall[];
 }
 
@@ -91,53 +75,33 @@ declare module "vitest" {
   }
 }
 
+/** Formats a task's raw return value for the threshold-failure message. */
+function formatEvaluationOutputForDisplay(
+  taskOutput: string | TaskResult,
+): string {
+  // For structured results only the transcript is shown.
+  const displayValue =
+    typeof taskOutput === "string" ? taskOutput : taskOutput.transcript;
+  return formatEvalValue(displayValue);
+}
+
 expect.extend({
   /**
    * Evaluates a language model output against an expected answer using a scoring function.
    *
    * @deprecated Use describeEval() instead for better test organization and multiple scorers support
-   * @param expected - The expected (ground truth) answer, can be any type depending on the scorer
-   * @param taskFn - Async function that processes the input and returns the model output
-   *                 Can return either a string or TaskResult object with result and optional toolCalls
-   * @param scoreFn - Function that evaluates the model output against the expected answer
-   * @param threshold - Minimum acceptable score (0-1), defaults to 1.0
-   *
-   * @example
-   * ```javascript
-   * test("checks capital of France", async () => {
-   *   expect("What is the capital of France?").toEval(
-   *     "Paris",
-   *     async (input) => {
-   *       const response = await queryLLM(input);
-   *       // Recommended: return TaskResult
-   *       return {
-   *         result: response.text,
-   *         toolCalls: response.toolCalls || []
-   *       };
-   *     },
-   *     checkFactuality,
-   *     0.8
-   *   );
-   * });
-   * ```
    */
-  // TODO: this needs to be support true extensibility with Eval scorers
   toEval: async function toEval(
-    input: string,
+    input: TaskInput,
     expected: any,
     taskFn: TaskFn,
     scoreFn: ScoreFn<any>,
     threshold = 1.0,
   ) {
-    const { isNot } = this;
-
     const taskOutput = await taskFn(input);
-    const output =
-      typeof taskOutput === "string" ? taskOutput : taskOutput.result;
-    const toolCalls =
-      typeof taskOutput === "object" ? taskOutput.toolCalls : undefined;
+    const normalized = normalizeScorerPayload(input, taskOutput);
 
-    let result = scoreFn({ input, expected, output, toolCalls });
+    let result = scoreFn({ expected, ...normalized });
     if (result instanceof Promise) {
       result = await result;
     }
@@ -149,59 +113,6 @@ expect.extend({
   },
 });
 
-/**
- * Creates a test suite for evaluating language model outputs.
- *
- * @param name - The name of the test suite
- * @param options - Configuration options
- * @param options.data - Async function that returns an array of test cases with input and any additional fields
- * @param options.task - Function that processes the input and returns the model output
- *                       Can return either a string or TaskResult object with result and optional toolCalls
- * @param options.skipIf - Optional function that determines if tests should be skipped
- * @param options.scorers - Array of scoring functions that evaluate model outputs
- * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
- * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)
- *
- * @example
- * ```javascript
- * // Recommended: TaskResult format with tool tracking
- * describeEval("capital cities test", {
- *   data: async () => [{
- *     input: "What is the capital of France?",
- *     expected: "Paris"
- *   }],
- *   task: async (input) => {
- *     const response = await queryLLM(input);
- *     return {
- *       result: response.text,
- *       toolCalls: response.toolCalls || []
- *     };
- *   },
- *   scorers: [checkFactuality],
- *   threshold: 0.8
- * });
- *
- * // Example with tool usage evaluation
- * describeEval("tool usage test", {
- *   data: async () => [{
- *     input: "Search for weather in Seattle",
- *     expectedTools: [{ name: "weather_api", arguments: { location: "Seattle" } }]
- *   }],
- *   task: async (input) => {
- *     return {
- *       result: "The weather in Seattle is 65°F",
- *       toolCalls: [{
- *         name: "weather_api",
- *         arguments: { location: "Seattle" },
- *         result: { temp: 65, condition: "partly cloudy" }
- *       }]
- *     };
- *   },
- *   scorers: [ToolCallScorer()],
- *   threshold: 1.0
- * });
- * ```
- */
 export function describeEval(
   name: string,
   {
@@ -210,14 +121,12 @@ export function describeEval(
     skipIf,
     scorers,
     threshold = 1.0,
-    // increase default test timeout as 5s is usually not enough for
-    // a single factuality check
     timeout = 60000,
     beforeEach: beforeEachHook,
     afterEach: afterEachHook,
   }: {
     data: () => Promise<
-      Array<{ input: string; name?: string } & Record<string, any>>
+      Array<{ name?: string } & EvalDataInput & Record<string, any>>
     >;
     task: TaskFn;
     skipIf?: () => boolean;
@@ -237,49 +146,60 @@ export function describeEval(
     }
 
     const testFn = skipIf ? test.skipIf(skipIf()) : test;
-    // TODO: should data just be a generator?
-    for (const { input, name: testName, ...params } of await data()) {
+    for (const testCase of await data()) {
+      const {
+        input,
+        transcript,
+        name: testName,
+        ...params
+      } = testCase as {
+        input?: string;
+        transcript?: Transcript;
+        name?: string;
+      } & Record<string, any>;
+
+      const taskInput = getTaskInput(input, transcript);
+
       testFn(
-        testName ?? input,
+        testName ?? getDefaultTestName(taskInput),
         {
           timeout,
         },
         async ({ task: testTask }) => {
-          const taskOutput = await task(input);
-          const output =
-            typeof taskOutput === "string" ? taskOutput : taskOutput.result;
-          const toolCalls =
-            typeof taskOutput === "object" ? taskOutput.toolCalls : undefined;
+          const taskOutput = await task(taskInput);
+          const normalized = normalizeScorerPayload(taskInput, taskOutput);
 
           const scores = await Promise.all(
             scorers.map((scorer) => {
-              const result = scorer({ input, ...params, output, toolCalls });
+              const result = scorer({ ...params, ...normalized });
               if (result instanceof Promise) {
                 return result;
               }
-              return new Promise<Score>((resolve) => resolve(result));
+              return Promise.resolve(result);
             }),
           );
-          const scoresWithName = scores.map((s, i) => ({
-            ...s,
-            name: scorers[i].name,
+
+          const scoresWithName = scores.map((score, index) => ({
+            ...score,
+            name: scorers[index].name,
           }));
 
           const avgScore =
-            scores.reduce((acc, s) => acc + (s.score ?? 0), 0) / scores.length;
+            scores.reduce((acc, score) => acc + (score.score ?? 0), 0) /
+            scores.length;
 
           testTask.meta.eval = {
             scores: scoresWithName,
             avgScore,
-            ...(toolCalls && { toolCalls }),
+            ...(normalized.toolCalls && { toolCalls: normalized.toolCalls }),
           };
 
           if (threshold) {
             assert(
               avgScore >= threshold,
-              `Score: ${avgScore} below threshold: ${threshold}\n\n## Output:\n${wrapText(output)}\n\n${formatScores(
-                scoresWithName,
-              )}`,
+              `Score: ${avgScore} below threshold: ${threshold}\n\n## Output:\n${formatEvaluationOutputForDisplay(
+                taskOutput,
+              )}\n\n${formatScores(scoresWithName)}`,
             );
           }
         },
@@ -291,27 +211,20 @@ export function describeEval(
 export function formatScores(scores: (Score & { name: string })[]) {
   return scores
     .sort((a, b) => (a.score ?? 0) - (b.score ?? 0))
-    .map((s) => {
-      const scoreLine = `# ${s.name || "Unknown"} [${(s.score ?? 0).toFixed(1)}]`;
+    .map((score) => {
+      const scoreLine = `# ${score.name || "Unknown"} [${(score.score ?? 0).toFixed(1)}]`;
       if (
-        ((s.score ?? 0) < 1.0 && s.metadata?.rationale) ||
-        s.metadata?.output
+        ((score.score ?? 0) < 1.0 && score.metadata?.rationale) ||
+        score.metadata?.output !== undefined
       ) {
-        // Format output - handle both strings and objects
-        let formattedOutput = "";
-        if (s.metadata?.output !== undefined) {
-          const output = s.metadata.output;
-          if (typeof output === "string") {
-            formattedOutput = `\n\n## Response\n\n${wrapText(output)}`;
-          } else {
-            // For objects, stringify with proper formatting
-            formattedOutput = `\n\n## Response\n\n${wrapText(JSON.stringify(output, null, 2))}`;
-          }
-        }
+        const formattedOutput =
+          score.metadata?.output !== undefined
+            ? `\n\n## Response\n\n${formatEvalValue(score.metadata.output)}`
+            : "";
 
         return `${scoreLine}${
-          s.metadata?.rationale
-            ? `\n\n## Rationale\n\n${wrapText(s.metadata.rationale)}`
+          score.metadata?.rationale
+            ? `\n\n## Rationale\n\n${wrapText(score.metadata.rationale)}`
             : ""
         }${formattedOutput}`;
       }
@@ -321,8 +234,16 @@ export function formatScores(scores: (Score & { name: string })[]) {
 }
 
 export { wrapText } from "./wrapText";
+export type {
+  EvalDataInput,
+  Transcript,
+  TranscriptMessage,
+  TranscriptPart,
+  TaskInput,
+  TaskResult,
+  ToolCall,
+} from "./messages";
 
-// Export built-in scorers
 export {
   ToolCallScorer,
   type ToolCallScorerOptions,
diff --git a/src/messages.test.ts b/src/messages.test.ts
new file mode 100644
index 0000000..8857ca8
--- /dev/null
+++ b/src/messages.test.ts
@@ -0,0 +1,137 @@
+import { describe, expect, test, vi } from "vitest";
+import { describeEval, ToolCallScorer } from "./index";
+import {
+  formatEvalValue,
+  getTaskInput,
+  normalizeScorerPayload,
+  type Transcript,
+} from "./messages";
+// Shared input fixture: one user turn mixing text with an inline PNG data URL.
+const multimodalInput: Transcript = [
+  {
+    role: "user",
+    parts: [
+      { type: "text", text: "Describe this image" },
+      {
+        type: "image",
+        image: "data:image/png;base64,abc123",
+        mediaType: "image/png",
+      },
+    ],
+  },
+];
+// Shared task-output fixture: a single text-only assistant reply.
+const multimodalOutput: Transcript = [
+  {
+    role: "assistant",
+    parts: [{ type: "text", text: "A cat sitting on a chair." }],
+  },
+];
+// Covers the ./messages helpers directly and the payload `toEval` hands to scorers.
+describe("transcript normalization", () => {
+  test("toEval passes a combined transcript to scorers", async () => {
+    const scorer = vi.fn(async (opts) => {
+      expect(opts.input).toBe("Describe this image");
+      expect(opts.output).toBe("A cat sitting on a chair.");
+      expect(opts.transcript).toEqual([
+        ...multimodalInput,
+        ...multimodalOutput,
+      ]);
+      return { score: 1 };
+    });
+    // The task must receive the transcript input unchanged.
+    const task = vi.fn(async (input) => {
+      expect(input).toEqual(multimodalInput);
+      return { transcript: multimodalOutput };
+    });
+    // The scorer above never reads `expected`, so its value here is arbitrary.
+    await expect(multimodalInput).toEval(
+      { expected: "cat" },
+      task,
+      scorer,
+      1.0,
+    );
+
+    expect(task).toHaveBeenCalledOnce();
+    expect(scorer).toHaveBeenCalledOnce();
+  });
+
+  test("rejects eval cases that define both input and transcript", () => {
+    expect(() =>
+      getTaskInput("hello", [
+        { role: "user", parts: [{ type: "text", text: "world" }] },
+      ]),
+    ).toThrow(
+      "Each eval case must define exactly one of `input` or `transcript`.",
+    );
+  });
+
+  test("rejects task outputs without a transcript", () => {
+    expect(() =>
+      normalizeScorerPayload("hello", { messages: [] } as any),
+    ).toThrow(
+      "Task output must be either a string or an object with `transcript`.",
+    );
+  });
+
+  test("formats transcripts safely for debug output", () => {
+    expect(formatEvalValue(multimodalInput)).toMatchInlineSnapshot(`
+      "## user
+
+      Describe this image
+
+      [image image/png]"
+    `);
+  });
+});
+// End-to-end: a `transcript` eval case reaches the task as-is; scorers see input + output messages.
+describeEval("transcript scorer payload", {
+  data: async () => [
+    {
+      name: "passes transcript through describeEval",
+      transcript: multimodalInput,
+    },
+  ],
+  task: async (input) => {
+    expect(input).toEqual(multimodalInput);
+    return { transcript: multimodalOutput };
+  },
+  scorers: [
+    async (opts) => {
+      expect(opts.transcript).toEqual([
+        ...multimodalInput,
+        ...multimodalOutput,
+      ]);
+      expect(opts.output).toBe("A cat sitting on a chair.");
+      return { score: 1 };
+    },
+  ],
+});
+// End-to-end: the task returns `toolCalls` beside its transcript; ToolCallScorer is the only scorer.
+describeEval("explicit tool call metadata", {
+  data: async () => [
+    {
+      name: "tool calls are passed separately from the transcript",
+      input: "What is the weather in Seattle?",
+      expectedTools: [
+        { name: "getWeather", arguments: { location: "Seattle" } },
+      ],
+    },
+  ],
+  task: async () => ({
+    transcript: [
+      {
+        role: "assistant",
+        parts: [{ type: "text", text: "It is 72F in Seattle." }],
+      },
+    ],
+    toolCalls: [
+      {
+        name: "getWeather",
+        arguments: { location: "Seattle" },
+        result: { temperature: 72 },
+      },
+    ],
+  }),
+  scorers: [ToolCallScorer()],
+});
diff --git a/src/messages.ts b/src/messages.ts
new file mode 100644
index 0000000..1b60351
--- /dev/null
+++ b/src/messages.ts
@@ -0,0 +1,371 @@
+import { wrapText } from "./wrapText";
+/** A tool/function call reported by a task: a `name`, optional `arguments`, plus any extra fields. */
+export type ToolCall = {
+  name: string;
+  arguments?: unknown;
+  [key: string]: unknown;
+};
+/** Transcript part holding plain text. */
+export type TranscriptTextPart = {
+  type: "text";
+  text: string;
+};
+/** Transcript part holding an image payload (opaque to this module) and an optional media type. */
+export type TranscriptImagePart = {
+  type: "image";
+  image: unknown;
+  mediaType?: string;
+};
+/** Transcript part holding a file payload (opaque to this module), its media type and an optional filename. */
+export type TranscriptFilePart = {
+  type: "file";
+  data: unknown;
+  mediaType: string;
+  filename?: string;
+};
+/** Any part a transcript message may contain, discriminated by `type`. */
+export type TranscriptPart =
+  | TranscriptTextPart
+  | TranscriptImagePart
+  | TranscriptFilePart;
+/** One conversation turn: the speaker's role and its ordered content parts. */
+export type TranscriptMessage = {
+  role: "user" | "assistant";
+  parts: TranscriptPart[];
+};
+/** A conversation as an ordered list of messages. */
+export type Transcript = TranscriptMessage[];
+/** What a task receives: a plain prompt string or a transcript. */
+export type TaskInput = string | Transcript;
+/** Structured task return value: the response transcript plus optional tool-call metadata. */
+export type TaskResult = {
+  transcript: Transcript;
+  toolCalls?: ToolCall[];
+};
+/** Everything a task may return: plain text or a structured `TaskResult`. */
+export type TaskOutput = string | TaskResult;
+/** Input fields of an eval case: either `input` or `transcript`; the `never` members forbid setting both. */
+export type EvalDataInput =
+  | {
+      input: string;
+      transcript?: never;
+    }
+  | {
+      transcript: Transcript;
+      input?: never;
+    };
+/** An eval input in both forms: flattened text and transcript. */
+interface NormalizedInput {
+  input: string;
+  inputTranscript: Transcript;
+}
+/** A task output in both forms (flattened text and transcript), plus any tool calls it reported. */
+interface NormalizedOutput {
+  output: string;
+  outputTranscript: Transcript;
+  toolCalls?: ToolCall[];
+}
+/** Fields handed to scorers: flattened input/output text, the combined transcript and optional tool calls. */
+export interface NormalizedScorerPayload {
+  input: string;
+  output: string;
+  transcript: Transcript;
+  toolCalls?: ToolCall[];
+}
+/** True for non-null, non-array objects, i.e. values whose string keys can be read safely. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === "object" && !Array.isArray(value);
+}
+/** Shape check: an array of user/assistant messages whose `parts` are all objects tagged with a string `type`. */
+export function isTranscript(value: unknown): value is Transcript {
+  // `part.type` is read unguarded downstream; reject null/primitive parts here, not there.
+  const isPart = (part: unknown): boolean =>
+    isRecord(part) && typeof part.type === "string";
+  const isMessage = (message: unknown): boolean =>
+    isRecord(message) &&
+    (message.role === "user" || message.role === "assistant") &&
+    Array.isArray(message.parts) &&
+    message.parts.every(isPart);
+  return Array.isArray(value) && value.every(isMessage);
+}
+/** Builds a message for `role` whose single part is the given text. */
+function textMessage(
+  role: TranscriptMessage["role"],
+  text: string,
+): TranscriptMessage {
+  // The whole string becomes the message's only part.
+  const onlyPart: TranscriptTextPart = { type: "text", text };
+  const message: TranscriptMessage = { role, parts: [onlyPart] };
+  return message;
+}
+/** Throws `"<fieldName> must be an array of transcript messages."` unless `transcript` passes `isTranscript`. */
+function assertValidTranscript(
+  transcript: unknown,
+  fieldName: string,
+): asserts transcript is Transcript {
+  if (isTranscript(transcript)) return;
+  // Name the offending field so the failure points at the caller's data.
+  throw new Error(`${fieldName} must be an array of transcript messages.`);
+}
+/** Throws `"<fieldName> must be a string."` unless `value` is a string. */
+function assertString(
+  value: unknown,
+  fieldName: string,
+): asserts value is string {
+  if (typeof value === "string") return;
+  // `fieldName` is interpolated verbatim, so callers control any quoting.
+  throw new Error(`${fieldName} must be a string.`);
+}
+/**
+ * Resolves the task input for one eval case from its `input`/`transcript` fields.
+ * @returns The supplied string or transcript, after a runtime shape check.
+ * @throws Error if both or neither field is defined, or the supplied one is malformed.
+ */
+export function getTaskInput(
+  input: string | undefined,
+  transcript: Transcript | undefined,
+): TaskInput {
+  const transcriptMissing = transcript === undefined;
+  if ((input === undefined) === transcriptMissing) {
+    throw new Error(
+      "Each eval case must define exactly one of `input` or `transcript`.",
+    );
+  }
+  if (input === undefined) {
+    assertValidTranscript(transcript, "`transcript`");
+    return transcript;
+  }
+  assertString(input, "`input`");
+  return input;
+}
+/**
+ * Derives both views of an eval input: flattened text and transcript.
+ * A string becomes a single user text message; a transcript is validated and
+ * its text parts are joined to form `input`.
+ */
+export function normalizeEvalInput(input: TaskInput): NormalizedInput {
+  if (typeof input !== "string") {
+    assertValidTranscript(input, "Eval input");
+    return {
+      input: extractTextFromTranscript(input),
+      inputTranscript: input,
+    };
+  }
+  const inputTranscript = [textMessage("user", input)];
+  return { input, inputTranscript };
+}
+/**
+ * Derives flattened text, transcript and tool calls from a task's return value.
+ *
+ * @throws Error when the value is neither a string nor an object carrying a
+ *   valid `transcript`.
+ */
+export function normalizeTaskOutput(taskOutput: TaskOutput): NormalizedOutput {
+  if (typeof taskOutput === "string") {
+    const outputTranscript = [textMessage("assistant", taskOutput)];
+    return { output: taskOutput, outputTranscript };
+  }
+  // Runtime guard for untyped callers; the static type already requires this shape.
+  const hasTranscript = isRecord(taskOutput) && "transcript" in taskOutput;
+  if (!hasTranscript) {
+    throw new Error(
+      "Task output must be either a string or an object with `transcript`.",
+    );
+  }
+  const { transcript, toolCalls } = taskOutput;
+  assertValidTranscript(transcript, "`transcript`");
+  const output = extractTextFromTranscript(transcript);
+  return { output, outputTranscript: transcript, toolCalls };
+}
+/**
+ * Builds the scorer payload: input and output flattened to text, the combined
+ * transcript (input messages, then output messages), and the task's tool calls.
+ */
+export function normalizeScorerPayload(
+  input: TaskInput,
+  taskOutput: TaskOutput,
+): NormalizedScorerPayload {
+  const { input: inputText, inputTranscript } = normalizeEvalInput(input);
+  const { output, outputTranscript, toolCalls } =
+    normalizeTaskOutput(taskOutput);
+  return {
+    input: inputText,
+    output,
+    transcript: inputTranscript.concat(outputTranscript),
+    toolCalls,
+  };
+}
+/**
+ * Reduces a task's return value to its transcript and flattened text;
+ * tool calls are not part of the result.
+ */
+export function normalizeEvaluateOutput(taskOutput: TaskOutput): {
+  transcript: Transcript;
+  output: string;
+} {
+  // Validation and string handling are delegated to `normalizeTaskOutput`.
+  const { outputTranscript, output } = normalizeTaskOutput(taskOutput);
+  return { transcript: outputTranscript, output };
+}
+/** Returns a text part's text; every other part type contributes an empty string. */
+function extractTextFromPart(part: TranscriptPart): string {
+  return part.type !== "text" ? "" : part.text;
+}
+/** Joins every non-empty text part across all messages with newlines; non-text parts are skipped. */
+export function extractTextFromTranscript(transcript: Transcript): string {
+  const partTexts = transcript.flatMap(({ parts }) =>
+    parts.map((part) => extractTextFromPart(part)),
+  );
+  return partTexts.filter((text) => Boolean(text)).join("\n");
+}
+/**
+ * Best-effort string form of an arbitrary value for debug output.
+ * Strings pass through, URLs use `toString()`, errors use their message, and
+ * everything else is pretty-printed JSON, falling back to `String(value)`
+ * when serialization throws or produces `undefined`.
+ */
+function summarizeUnknown(value: unknown): string {
+  if (typeof value === "string") return value;
+  if (value instanceof URL) return value.toString();
+  if (value instanceof Error) return value.message;
+
+  let json: string | undefined;
+  try {
+    json = JSON.stringify(value, null, 2);
+  } catch {
+    // JSON.stringify throws on e.g. circular structures and bigint values.
+    return String(value);
+  }
+  // `undefined`, functions and symbols serialize to `undefined`, not a string.
+  return json ?? String(value);
+}
+/** One-line display form of a part: text verbatim, `[image …]` / `[file …]` placeholders for media. */
+function formatPartForDisplay(part: TranscriptPart): string {
+  if (part.type === "text") return part.text;
+  const mediaType = part.mediaType ? ` ${part.mediaType}` : "";
+  if (part.type === "image") return `[image${mediaType}]`;
+  if (part.type === "file") {
+    return `[file${part.filename ? ` ${part.filename}` : ""}${mediaType}]`;
+  }
+  // Untyped callers can pass part types the union forbids; never return `undefined`.
+  return "[unsupported part]";
+}
+/**
+ * Renders each message as a `## <role>` heading followed by its parts, all
+ * separated by blank lines; empty transcripts and messages get placeholders.
+ */
+export function formatTranscript(transcript: Transcript): string {
+  if (transcript.length === 0) return "(empty transcript)";
+  const sections: string[] = [];
+  for (const { role, parts } of transcript) {
+    const body =
+      parts.length > 0
+        ? parts.map(formatPartForDisplay).join("\n\n")
+        : "(empty message)";
+    sections.push(`## ${role}\n\n${body}`);
+  }
+  return sections.join("\n\n");
+}
+/**
+ * Formats any value for failure output: strings are wrapped, transcripts are
+ * rendered via `formatTranscript`, anything else is summarized then wrapped.
+ */
+export function formatEvalValue(value: unknown): string {
+  if (isTranscript(value)) {
+    return formatTranscript(value);
+  }
+
+  // `summarizeUnknown` returns strings unchanged, so strings and other values share this path.
+  return wrapText(summarizeUnknown(value));
+}
+/**
+ * Appends `text` to `content`, merging it into the trailing text part when
+ * there is one so adjacent text never forms separate parts. Empty text is a no-op.
+ */
+function pushJudgeText(content: Array<any>, text: string) {
+  if (text.length === 0) return;
+
+  const tail = content[content.length - 1];
+  if (tail?.type !== "text") {
+    content.push({ type: "text", text });
+    return;
+  }
+  tail.text += text;
+}
+/**
+ * Flattens a transcript into a single `user` message for the judge model.
+ *
+ * Every message starts with an upper-cased role marker such as `[USER]`.
+ * Text parts join the running text; image and file parts are announced by a
+ * textual placeholder and also forwarded as structured content parts.
+ * Trailing whitespace is trimmed when the content ends with a text part.
+ *
+ * @param transcript - Messages to present, in order.
+ * @returns `{ role: "user", content }` where `content` interleaves text,
+ *   image and file parts.
+ */
+export function toJudgeUserMessage(transcript: Transcript) {
+  const content: Array<any> = [];
+
+  if (transcript.length === 0) {
+    content.push({ type: "text", text: "(empty transcript)" });
+    return { role: "user" as const, content };
+  }
+
+  for (const [messageIndex, { role, parts }] of transcript.entries()) {
+    // Separator before every message but the first (empty text is a no-op).
+    pushJudgeText(content, messageIndex > 0 ? "\n\n" : "");
+    pushJudgeText(content, `[${role.toUpperCase()}]\n`);
+    if (parts.length === 0) {
+      pushJudgeText(content, "(empty message)");
+      continue;
+    }
+
+    for (const [partIndex, part] of parts.entries()) {
+      if (partIndex > 0) {
+        pushJudgeText(content, "\n");
+      }
+
+      if (part.type === "text") {
+        pushJudgeText(content, `${part.text}\n`);
+      } else if (part.type === "image") {
+        const mediaSuffix = part.mediaType ? ` ${part.mediaType}` : "";
+        pushJudgeText(content, `[image${mediaSuffix}]\n`);
+        content.push({
+          type: "image",
+          image: part.image,
+          ...(part.mediaType ? { mediaType: part.mediaType } : {}),
+        });
+        pushJudgeText(content, "\n");
+      } else if (part.type === "file") {
+        const nameSuffix = part.filename ? ` ${part.filename}` : "";
+        const mediaSuffix = part.mediaType ? ` ${part.mediaType}` : "";
+        pushJudgeText(content, `[file${nameSuffix}${mediaSuffix}]\n`);
+        content.push({
+          type: "file",
+          data: part.data,
+          mediaType: part.mediaType,
+          ...(part.filename ? { filename: part.filename } : {}),
+        });
+        pushJudgeText(content, "\n");
+      }
+    }
+  }
+
+  // Drop the newline(s) left behind by the last part.
+  const finalPart = content[content.length - 1];
+  if (finalPart?.type === "text") {
+    finalPart.text = finalPart.text.trimEnd();
+  }
+
+  return { role: "user" as const, content };
+}
+/** Default test title: the string input itself, or the transcript's trimmed text (`"transcript"` if it has none). */
+export function getDefaultTestName(input: TaskInput): string {
+  if (typeof input === "string") {
+    return input;
+  }
+  // Transcripts without any text fall back to a fixed label.
+  const transcriptText = extractTextFromTranscript(input).trim();
+  return transcriptText.length === 0 ? "transcript" : transcriptText;
+}