EntityProcess · christso · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/packages/core/src/evaluation/evaluators/code-evaluator.ts b/packages/core/src/evaluation/evaluators/code-evaluator.ts
@@ -9,13 +9,91 @@ import {
   createTargetProxy,
 } from '../../runtime/target-proxy.js';
 import { toSnakeCaseDeep } from '../case-conversion.js';
+import { type ContentImage, isContentArray } from '../content.js';
 import type { AssertionEntry, JsonObject, TargetAccessConfig } from '../types.js';
 import { clampScore, isNonEmptyString, parseJsonSafe, scoreToVerdict } from './scoring.js';
 import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
 
 /** Threshold in bytes above which output is written to a temp file instead of inlined. */
 const FILE_BACKED_OUTPUT_THRESHOLD = 50_000;
 
+/** Regex matching `data:<mediaType>;base64,<data>` URIs. */
+const DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
+
+/**
+ * Convert ContentImage blocks in message arrays for code grader consumption.
+ *
+ * - Data URI images (`data:image/png;base64,...`) → decoded, written to a temp file, and
+ *   replaced with a block whose `path` points at that file.
+ * - Non-data-URI images (already a path or URL) → `source` carried through as `path`.
+ * - ContentText, ContentFile blocks → passed through unchanged.
+ * - Messages with plain string content → passed through unchanged.
+ *
+ * The file extension is derived from the data URI's media type. That value is untrusted, so
+ * only a plain alphanumeric subtype is used (`svg+xml` → `svg`, `jpeg` → `jpg`); anything
+ * else falls back to `bin`, keeping path separators and `..` out of the file name.
+ *
+ * @param messages - Messages to convert; `null`, `undefined`, or empty input is returned as-is
+ *   (`undefined` becomes `null`).
+ * @param getWorkDir - Resolves the directory for decoded images; only called when a data URI
+ *   image is actually written.
+ * @returns The original array when no image blocks exist (zero-copy fast path), otherwise a
+ *   new array in which only the messages containing images are rebuilt.
+ */
+export async function materializeContentForGrader(
+  messages: readonly Record<string, unknown>[] | null | undefined,
+  getWorkDir: () => Promise<string>,
+): Promise<readonly Record<string, unknown>[] | null> {
+  if (!messages || messages.length === 0) return messages ?? null;
+
+  // Fast path: hand back the input untouched when no message carries an image block.
+  const hasAnyImage = messages.some(
+    (msg) => isContentArray(msg.content) && msg.content.some((b) => b.type === 'image'),
+  );
+  if (!hasAnyImage) return messages;
+
+  let counter = 0;
+  const result: Record<string, unknown>[] = [];
+
+  for (const msg of messages) {
+    // Messages without image blocks keep their original object identity.
+    if (!isContentArray(msg.content) || !msg.content.some((b) => b.type === 'image')) {
+      result.push(msg);
+      continue;
+    }
+
+    const blocks: Record<string, unknown>[] = [];
+    for (const block of msg.content) {
+      if (block.type !== 'image') {
+        blocks.push({ ...block });
+        continue;
+      }
+
+      const img = block as ContentImage;
+      const match = DATA_URI_RE.exec(img.source);
+      if (!match) {
+        // Already a path or URL → carry through as path
+        blocks.push({ type: 'image', media_type: img.media_type, path: img.source });
+        continue;
+      }
+
+      const [, mediaType = '', base64Data = ''] = match;
+      // Untrusted media type: keep only a plain alphanumeric subtype for the extension.
+      const subtype = (mediaType.split('/')[1] ?? '').split('+')[0]?.toLowerCase() ?? '';
+      const safeSubtype = /^[a-z0-9]+$/.test(subtype) ? subtype : 'bin';
+      const ext = safeSubtype === 'jpeg' ? 'jpg' : safeSubtype;
+      const dir = await getWorkDir();
+      const filePath = join(dir, `img-${counter++}.${ext}`);
+      await writeFile(filePath, Buffer.from(base64Data, 'base64'));
+      blocks.push({ type: 'image', media_type: img.media_type, path: filePath });
+    }
+
+    result.push({ ...msg, content: blocks });
+  }
+
+  return result;
+}
+
 export interface CodeEvaluatorOptions {
   readonly command: readonly string[];
   /** @deprecated Use `command` instead */
@@ -46,8 +124,23 @@ export class CodeEvaluator implements Evaluator {
   }
 
   async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
+    // Lazy temp dir for materialized image files
+    let imageTmpDir: string | undefined;
+    const getImageDir = async () => {
+      if (!imageTmpDir) {
+        imageTmpDir = await mkdtemp(join(tmpdir(), 'agentv-img-'));
+      }
+      return imageTmpDir;
+    };
+
+    // Materialize multimodal content (data URIs → temp files, source → path)
+    const materializedOutput = await materializeContentForGrader(
+      context.output as readonly Record<string, unknown>[] | undefined,
+      getImageDir,
+    );
+
     // Determine whether to use file-backed output for large payloads
-    let outputForPayload = context.output ?? null;
+    let outputForPayload: readonly Record<string, unknown>[] | null = materializedOutput;
     let outputPath: string | undefined;
 
     if (outputForPayload) {
@@ -63,11 +156,17 @@ export class CodeEvaluator implements Evaluator {
     // Build payload (camelCase internally, converted to snake_case for graders)
     const payload = {
       criteria: context.evalCase.criteria,
-      expectedOutput: context.evalCase.expected_output,
+      expectedOutput: await materializeContentForGrader(
+        context.evalCase.expected_output as readonly Record<string, unknown>[],
+        getImageDir,
+      ),
       output: outputForPayload,
       outputPath,
       inputFiles: context.evalCase.file_paths,
-      input: context.evalCase.input,
+      input: await materializeContentForGrader(
+        context.evalCase.input as readonly Record<string, unknown>[],
+        getImageDir,
+      ),
       trace: context.trace ?? null,
       tokenUsage: context.tokenUsage ?? null,
       costUsd: context.costUsd ?? null,
@@ -196,6 +295,10 @@ export class CodeEvaluator implements Evaluator {
       if (outputPath) {
         await rm(dirname(outputPath), { recursive: true, force: true }).catch(() => {});
       }
+      // Clean up temp dir for materialized images
+      if (imageTmpDir) {
+        await rm(imageTmpDir, { recursive: true, force: true }).catch(() => {});
+      }
     }
   }
 }

diff --git a/packages/core/src/evaluation/providers/claude-content.ts b/packages/core/src/evaluation/providers/claude-content.ts
@@ -52,15 +52,13 @@ export function toContentArray(content: unknown): Content[] | undefined {
             ? src.media_type
             : 'application/octet-stream';
       const data =
-        typeof src.data === 'string' && src.data.length > 0
+        typeof src.data === 'string'
           ? `data:${mediaType};base64,${src.data}`
-          : typeof p.url === 'string' && (p.url as string).length > 0
+          : typeof p.url === 'string'
             ? (p.url as string)
             : '';
-      if (data) {
-        blocks.push({ type: 'image', media_type: mediaType, source: data });
-        hasNonText = true;
-      }
+      blocks.push({ type: 'image', media_type: mediaType, source: data });
+      hasNonText = true;
     } else if (p.type === 'tool_use') {
       // tool_use blocks are handled separately as ToolCall — skip
     } else if (p.type === 'tool_result') {

diff --git a/packages/core/src/evaluation/providers/pi-utils.ts b/packages/core/src/evaluation/providers/pi-utils.ts
@@ -59,8 +59,12 @@ export function toPiContentArray(content: unknown): Content[] | undefined {
       let source = '';
       if (typeof p.source === 'object' && p.source !== null) {
         const src = p.source as Record<string, unknown>;
-        const srcMediaType = typeof src.media_type === 'string' ? src.media_type : mediaType;
-        source = typeof src.data === 'string' ? `data:${srcMediaType};base64,${src.data}` : '';
+        const srcMediaType =
+          typeof src.media_type === 'string' ? src.media_type : mediaType;
+        source =
+          typeof src.data === 'string'
+            ? `data:${srcMediaType};base64,${src.data}`
+            : '';
       }
       if (!source && typeof p.url === 'string') {
         source = p.url;

diff --git a/packages/core/src/evaluation/validation/prompt-validator.ts b/packages/core/src/evaluation/validation/prompt-validator.ts
@@ -45,9 +45,7 @@ export function validateTemplateVariables(content: string, source: string): void
   const hasCandidateAnswer =
     foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) ||
     foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
-  const hasExpectedOutput =
-    foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) ||
-    foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT);
+  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
   const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
 
   // ERROR: Missing required fields - throw error to skip this evaluator/eval case