Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 37 additions & 18 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1593,6 +1593,8 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
let attempt = 0;
let providerResponse: ProviderResponse | undefined = cachedResponse;
let lastError: unknown;
/** Set when a fallback target actually served the response. */
let targetUsed: string | undefined;

while (!providerResponse && attempt < attemptBudget) {
try {
Expand All @@ -1616,25 +1618,36 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
attempt += 1;
continue;
}
// On error, keep workspace for debugging (unless forceCleanup is set)
const errorResult = buildErrorResult(
evalCase,
target.name,
nowFn(),
error,
promptInputs,
provider,
'agent',
'provider_error',
verbose,
);
if (workspacePath) {
if (forceCleanup) {
await cleanupWorkspace(workspacePath).catch(() => {});
}
return { ...errorResult, workspacePath };
break; // Exhausted retries on primary — try fallback targets below
}
}

// Try fallback targets in order after exhausting retries on the primary
if (!providerResponse && target.fallbackTargets?.length && targetResolver) {
for (const fallbackName of target.fallbackTargets) {
const fallbackProvider = targetResolver(fallbackName);
if (!fallbackProvider) {
continue;
}
try {
providerResponse = await invokeProvider(fallbackProvider, {
evalCase: evalCase,
target,
promptInputs,
attempt: 0,
agentTimeoutMs,
signal,
cwd: workspacePath,
workspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
captureFileChanges: !!baselineCommit,
streamCallbacks: options.streamCallbacks,
});
targetUsed = fallbackName;
break; // Fallback succeeded
} catch (error) {
lastError = error;
// Continue to next fallback
}
return errorResult;
}
}

Expand Down Expand Up @@ -1812,9 +1825,13 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
? 'execution_error'
: classifyQualityStatus(result.score, caseThreshold);

// Include targetUsed only when a fallback target served the response
const targetUsedField = targetUsed ? { targetUsed } : {};

const finalResult = providerError
? {
...result,
...targetUsedField,
evalRun,
error: providerError,
executionStatus,
Expand All @@ -1828,6 +1845,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
: skippedEvaluatorError
? {
...result,
...targetUsedField,
score: 0,
evalRun,
error: skippedEvaluatorError,
Expand All @@ -1841,6 +1859,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
}
: {
...result,
...targetUsedField,
evalRun,
executionStatus,
beforeAllOutput,
Expand Down
11 changes: 11 additions & 0 deletions packages/core/src/evaluation/providers/targets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,11 @@ interface ResolvedTargetBase {
* to force CLI invocation even in subagent mode.
*/
readonly subagentModeAllowed?: boolean;
/**
* Ordered list of target names to try when the primary target fails after
* exhausting retries. Each fallback is attempted in order.
*/
readonly fallbackTargets?: readonly string[];
}

export type ResolvedTarget =
Expand Down Expand Up @@ -642,6 +647,8 @@ export const COMMON_TARGET_SETTINGS = [
'providerBatching',
'subagent_mode_allowed',
'subagentModeAllowed',
'fallback_targets',
'fallbackTargets',
] as const;

const BASE_TARGET_SCHEMA = z
Expand All @@ -654,6 +661,8 @@ const BASE_TARGET_SCHEMA = z
workspace_template: z.string().optional(),
workspaceTemplate: z.string().optional(),
subagent_mode_allowed: z.boolean().optional(),
fallback_targets: z.array(z.string().min(1)).optional(),
fallbackTargets: z.array(z.string().min(1)).optional(),
})
.passthrough();

Expand Down Expand Up @@ -741,12 +750,14 @@ export function resolveTargetDefinition(
);

// Shared base fields for all resolved targets
const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
const base = {
name: parsed.name,
graderTarget: parsed.grader_target ?? parsed.judge_target,
workers: parsed.workers,
providerBatching,
subagentModeAllowed,
...(fallbackTargets ? { fallbackTargets } : {}),
} as const;

switch (provider) {
Expand Down
3 changes: 3 additions & 0 deletions packages/core/src/evaluation/providers/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -382,4 +382,7 @@ export interface TargetDefinition {
readonly retryBackoffFactor?: number | unknown | undefined;
readonly retry_status_codes?: unknown | undefined;
readonly retryStatusCodes?: unknown | undefined;
// Fallback targets for provider errors
readonly fallback_targets?: readonly string[] | unknown | undefined;
readonly fallbackTargets?: readonly string[] | unknown | undefined;
}
5 changes: 5 additions & 0 deletions packages/core/src/evaluation/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -900,6 +900,11 @@ export interface EvaluationResult {
readonly score: number;
readonly assertions: readonly AssertionEntry[];
readonly target: string;
/**
* The target that actually served the response, when different from the
* primary target. Present only when a fallback target was used.
*/
readonly targetUsed?: string;
/** Token usage metrics from provider (optional) */
readonly tokenUsage?: TokenUsage;
/** Total cost in USD (optional, from provider) */
Expand Down
Loading
Loading