@@ -45,6 +45,7 @@ export async function runSingleEval(
4545 const startTime = new Date ( )
4646 const trace : CodebuffTrace [ ] = [ ]
4747 let error : string | undefined
48+ let totalCostUsd = 0
4849
4950 // Add process-level error handlers for this eval
5051 const originalUncaughtHandler = process . listeners ( 'uncaughtException' )
@@ -173,6 +174,8 @@ Explain your reasoning in detail.`,
173174 60_000 * 30 ,
174175 )
175176
177+ // Add this result's cost (in USD) to the running total
178+ totalCostUsd += codebuffResult . totalCostUsd
176179 trace . push ( { prompt, steps : codebuffResult . steps } )
177180 }
178181
@@ -223,19 +226,26 @@ Explain your reasoning in detail.`,
223226 error,
224227 gitDiff : fileStates ,
225228 durationMs,
229+ costUsd : totalCostUsd ,
226230 }
227231
228232 // Add judging results even for failed runs
229233 try {
230234 const judgingResults = await judgeEvalRun ( evalRun )
231235 console . log ( 'Judging results:' , judgingResults )
236+
232237 return {
233238 ...evalRun ,
234239 judging_results : judgingResults ,
240+ computed_metrics : {
241+ runtime_sec : durationMs / 1000 ,
242+ cost_usd : totalCostUsd ,
243+ } ,
235244 }
236245 } catch ( judgingError ) {
237246 console . error ( 'Error in judging:' , judgingError )
238247 // Return without judging results if judging fails
248+
239249 return {
240250 ...evalRun ,
241251 judging_results : {
@@ -249,6 +259,10 @@ Explain your reasoning in detail.`,
249259 overallScore : 0 ,
250260 } ,
251261 } ,
262+ computed_metrics : {
263+ runtime_sec : durationMs / 1000 ,
264+ cost_usd : totalCostUsd ,
265+ } ,
252266 }
253267 }
254268}
@@ -591,6 +605,16 @@ export async function runGitEvals(
591605
592606function calculateOverallMetrics ( evalRuns : EvalRunJudged [ ] ) {
593607 return {
608+ average_runtime_sec :
609+ evalRuns . reduce (
610+ ( sum , run ) => sum + ( run . computed_metrics ?. runtime_sec || 0 ) ,
611+ 0 ,
612+ ) / evalRuns . length ,
613+ average_cost_usd :
614+ evalRuns . reduce (
615+ ( sum , run ) => sum + ( run . computed_metrics ?. cost_usd || 0 ) ,
616+ 0 ,
617+ ) / evalRuns . length ,
594618 average_completion :
595619 evalRuns . reduce (
596620 ( sum , run ) => sum + ( run . judging_results . metrics . completionScore || 0 ) ,
0 commit comments