Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 106 additions & 3 deletions packages/core/src/evaluation/evaluators/code-evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,91 @@ import {
createTargetProxy,
} from '../../runtime/target-proxy.js';
import { toSnakeCaseDeep } from '../case-conversion.js';
import { type ContentImage, isContentArray } from '../content.js';
import type { AssertionEntry, JsonObject, TargetAccessConfig } from '../types.js';
import { clampScore, isNonEmptyString, parseJsonSafe, scoreToVerdict } from './scoring.js';
import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';

/** Threshold in bytes above which output is written to a temp file instead of inlined. */
const FILE_BACKED_OUTPUT_THRESHOLD = 50_000;

/**
 * Regex matching `data:<mediaType>;base64,<data>` URIs.
 *
 * - Capture group 1: the media type, i.e. everything between `data:` and the first `;`.
 * - Capture group 2: the base64 payload following `;base64,`.
 *
 * The `s` (dotAll) flag lets `.` match line breaks, so a payload that contains
 * newlines still matches. A URI carrying extra parameters before `;base64`
 * (for example `data:image/png;charset=utf-8;base64,...`) does NOT match, because
 * group 1 stops at the first `;` and `base64,` must follow immediately.
 */
const DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;

/**
 * Derive a file extension that is safe to embed in a file name from a media type.
 *
 * - Uses the subtype (the part after the first `/`), lower-cased.
 * - Drops a structured-syntax suffix, so `image/svg+xml` yields `svg`.
 * - Removes every character other than `a-z`, `0-9` and `-`.
 * - Maps `jpeg` to `jpg`.
 * - Falls back to `bin` when nothing usable remains (no `/`, or an empty subtype).
 */
function fileExtensionForMediaType(mediaType: string): string {
  const subtype = mediaType.split('/')[1] ?? '';
  const cleaned = (subtype.split('+')[0] ?? '').toLowerCase().replace(/[^a-z0-9-]/g, '');
  // `??` alone would let an empty subtype through and produce a file named "img-0.".
  if (cleaned === '') return 'bin';
  return cleaned === 'jpeg' ? 'jpg' : cleaned;
}

/**
 * Convert ContentImage blocks in message arrays for code grader consumption.
 *
 * - Data URI images (`data:image/png;base64,...`) → decoded, written to a file inside
 *   the directory returned by `getWorkDir`, and replaced with `{ type, media_type, path }`
 *   where `path` is the written file.
 * - Images whose `source` is not a base64 data URI (already a path or URL) → `source`
 *   carried through as `path`.
 * - Non-image blocks → shallow-copied unchanged.
 * - Messages whose content is not a content array, or that hold no image block →
 *   passed through as the same object.
 *
 * Returns the original array when no image blocks exist (zero-copy fast path); in that
 * case `getWorkDir` is never called.
 *
 * @param messages Messages to convert; `null`, `undefined` or an empty array short-circuit.
 * @param getWorkDir Lazily provides the directory image files are written to. It is
 *   awaited once per data-URI image, so it should return the same directory each time.
 * @returns `null` for `null`/`undefined` input, the input array itself when nothing needs
 *   converting, otherwise a new array with image-bearing messages rewritten.
 */
export async function materializeContentForGrader(
  messages: readonly Record<string, unknown>[] | null | undefined,
  getWorkDir: () => Promise<string>,
): Promise<readonly Record<string, unknown>[] | null> {
  if (!messages || messages.length === 0) return messages ?? null;

  const hasImageBlock = (msg: Record<string, unknown>): boolean =>
    isContentArray(msg.content) && msg.content.some((b) => b.type === 'image');

  // Fast path: hand back the caller's array untouched when there is nothing to convert.
  if (!messages.some(hasImageBlock)) return messages;

  // File names are numbered across the whole call so images from different messages
  // never collide inside the shared work directory.
  let counter = 0;
  const result: Record<string, unknown>[] = [];

  for (const msg of messages) {
    if (!isContentArray(msg.content) || !msg.content.some((b) => b.type === 'image')) {
      result.push(msg);
      continue;
    }

    const blocks: Record<string, unknown>[] = [];
    for (const block of msg.content) {
      if (block.type !== 'image') {
        blocks.push({ ...block });
        continue;
      }

      const img = block as ContentImage;
      const match = DATA_URI_RE.exec(img.source);

      if (match) {
        const [, mediaType, base64Data] = match;
        const ext = fileExtensionForMediaType(mediaType);
        const dir = await getWorkDir();
        const filePath = join(dir, `img-${counter++}.${ext}`);
        await writeFile(filePath, Buffer.from(base64Data, 'base64'));
        blocks.push({ type: 'image', media_type: img.media_type, path: filePath });
      } else {
        // Already a path or URL → carry through as path
        blocks.push({ type: 'image', media_type: img.media_type, path: img.source });
      }
    }

    result.push({ ...msg, content: blocks });
  }

  return result;
}

export interface CodeEvaluatorOptions {
readonly command: readonly string[];
/** @deprecated Use `command` instead */
Expand Down Expand Up @@ -46,8 +124,23 @@ export class CodeEvaluator implements Evaluator {
}

async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
// Lazy temp dir for materialized image files
let imageTmpDir: string | undefined;
const getImageDir = async () => {
if (!imageTmpDir) {
imageTmpDir = await mkdtemp(join(tmpdir(), 'agentv-img-'));
}
return imageTmpDir;
};

// Materialize multimodal content (data URIs → temp files, source → path)
const materializedOutput = await materializeContentForGrader(
context.output as readonly Record<string, unknown>[] | undefined,
getImageDir,
);

// Determine whether to use file-backed output for large payloads
let outputForPayload = context.output ?? null;
let outputForPayload: readonly Record<string, unknown>[] | null = materializedOutput;
let outputPath: string | undefined;

if (outputForPayload) {
Expand All @@ -63,11 +156,17 @@ export class CodeEvaluator implements Evaluator {
// Build payload (camelCase internally, converted to snake_case for graders)
const payload = {
criteria: context.evalCase.criteria,
expectedOutput: context.evalCase.expected_output,
expectedOutput: await materializeContentForGrader(
context.evalCase.expected_output as readonly Record<string, unknown>[],
getImageDir,
),
output: outputForPayload,
outputPath,
inputFiles: context.evalCase.file_paths,
input: context.evalCase.input,
input: await materializeContentForGrader(
context.evalCase.input as readonly Record<string, unknown>[],
getImageDir,
),
trace: context.trace ?? null,
tokenUsage: context.tokenUsage ?? null,
costUsd: context.costUsd ?? null,
Expand Down Expand Up @@ -196,6 +295,10 @@ export class CodeEvaluator implements Evaluator {
if (outputPath) {
await rm(dirname(outputPath), { recursive: true, force: true }).catch(() => {});
}
// Clean up temp dir for materialized images
if (imageTmpDir) {
await rm(imageTmpDir, { recursive: true, force: true }).catch(() => {});
}
}
}
}
Expand Down
10 changes: 4 additions & 6 deletions packages/core/src/evaluation/providers/claude-content.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,13 @@ export function toContentArray(content: unknown): Content[] | undefined {
? src.media_type
: 'application/octet-stream';
const data =
typeof src.data === 'string' && src.data.length > 0
typeof src.data === 'string'
? `data:${mediaType};base64,${src.data}`
: typeof p.url === 'string' && (p.url as string).length > 0
: typeof p.url === 'string'
? (p.url as string)
: '';
if (data) {
blocks.push({ type: 'image', media_type: mediaType, source: data });
hasNonText = true;
}
blocks.push({ type: 'image', media_type: mediaType, source: data });
hasNonText = true;
} else if (p.type === 'tool_use') {
// tool_use blocks are handled separately as ToolCall — skip
} else if (p.type === 'tool_result') {
Expand Down
8 changes: 6 additions & 2 deletions packages/core/src/evaluation/providers/pi-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,12 @@ export function toPiContentArray(content: unknown): Content[] | undefined {
let source = '';
if (typeof p.source === 'object' && p.source !== null) {
const src = p.source as Record<string, unknown>;
const srcMediaType = typeof src.media_type === 'string' ? src.media_type : mediaType;
source = typeof src.data === 'string' ? `data:${srcMediaType};base64,${src.data}` : '';
const srcMediaType =
typeof src.media_type === 'string' ? src.media_type : mediaType;
source =
typeof src.data === 'string'
? `data:${srcMediaType};base64,${src.data}`
: '';
}
if (!source && typeof p.url === 'string') {
source = p.url;
Expand Down
4 changes: 1 addition & 3 deletions packages/core/src/evaluation/validation/prompt-validator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,7 @@ export function validateTemplateVariables(content: string, source: string): void
const hasCandidateAnswer =
foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) ||
foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
const hasExpectedOutput =
foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) ||
foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT);
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;

// ERROR: Missing required fields - throw error to skip this evaluator/eval case
Expand Down
Loading
Loading