@@ -248,6 +248,95 @@ async function reqJSON<T>(method: string, path: string, body?: unknown): Promise
248248 return r . json ( ) as Promise < T > ;
249249}
250250
251+ // ── Evals ──────────────────────────────────────────────────────────────────
/**
 * Golden-output expectation that can be attached to an {@link EvalCase}.
 *
 * NOTE(review): how `mode` is applied to `value` is decided by the scorer,
 * which is not part of this file — confirm the exact matching semantics there.
 */
export interface GoldenExpectation {
  /** Matching strategy; exactly one of the three literal modes. */
  mode: "exact" | "contains" | "regex";
  /** Reference text for the chosen mode (presumably a pattern source for "regex" — confirm). */
  value: string;
}
/**
 * One case of an eval suite: a prompt plus optional expectations and limits.
 *
 * Only `id` and `prompt` are required; every expectation/limit is optional.
 */
export interface EvalCase {
  /** Case identifier (surfaced again as `caseId` on a case result). */
  id: string;
  /** Prompt text for this case. */
  prompt: string;
  /** Free-text criteria — presumably consumed by a judge/scorer; confirm against the backend. */
  criteria?: string;
  /** Optional golden-output expectation. */
  golden?: GoldenExpectation;
  /** Tool names the case expects to see used. */
  expectedTools?: string[];
  /** Tool names the case expects NOT to see used. */
  forbiddenTools?: string[];
  /** Cost limit; per the field name, expressed in USD. */
  maxCostUsd?: number;
  /** Latency limit; per the field name, expressed in milliseconds. */
  maxLatencyMs?: number;
}
/**
 * Per-scorer boolean switches for a suite.
 *
 * The four keys are the same names used by the `scorer` discriminator on a
 * score result. NOTE(review): presumably `true` enables the scorer — confirm.
 */
export interface ScorerConfig {
  taskSuccess: boolean;
  toolCompliance: boolean;
  golden: boolean;
  /** Abbreviation kept as-is; presumably "non-functional requirements" (cost/latency) — confirm. */
  nfr: boolean;
}
/**
 * An eval suite as returned by the `/evals/suites` endpoints: a named set of
 * cases targeting one agent, plus scorer switches and optional judges.
 */
export interface EvalSuite {
  /** Suite identifier; used as the `:id` path segment by the suite endpoints. */
  _id: string;
  name: string;
  description?: string;
  /** Name of the agent this suite targets. */
  agentName: string;
  cases: EvalCase[];
  scorers: ScorerConfig;
  /** Optional judge definitions. */
  judges?: JudgeDef[];
  // legacy single-judge fields (read-only back-compat)
  judgeModel?: string;
  judgePrompt?: string;
  judgePassThreshold?: number;
  /** NOTE(review): presumably the suite-level gate threshold; range/unit not visible here — confirm. */
  passThreshold?: number;
  /** Timestamp as a string; format not visible here. */
  createdAt?: string;
  /** Timestamp as a string; format not visible here. */
  updatedAt?: string;
}
/**
 * Definition of one judge attached to a suite (see `EvalSuite.judges`).
 * Only `id` and `name` are required.
 */
export interface JudgeDef {
  id: string;
  name: string;
  /** Free-text rubric for the judge. */
  rubric?: string;
  /** Model identifier for the judge; accepted values are not visible here. */
  model?: string;
  /** NOTE(review): per-judge pass threshold; range/unit not visible here — confirm. */
  passThreshold?: number;
}
/**
 * Suite payload for create/update requests: an {@link EvalSuite} without its
 * `_id`, `createdAt` and `updatedAt` fields.
 *
 * NOTE(review): the legacy single-judge fields of `EvalSuite` are still part of
 * this type even though they are described as read-only — confirm whether the
 * backend ignores them on write.
 */
export type EvalSuiteInput = Omit<EvalSuite, "_id" | "createdAt" | "updatedAt">;
/**
 * Outcome of one scorer for one case.
 */
export interface ScoreResult {
  /** Which scorer produced this result; same names as the keys of the scorer config. */
  scorer: "taskSuccess" | "toolCompliance" | "golden" | "nfr";
  /** Optional display label. */
  label?: string;
  passed: boolean;
  /** Optional numeric score; scale not visible here. */
  score?: number;
  /** Optional free-text explanation. */
  detail?: string;
}
/**
 * A denied tool use recorded on a case result: which tool, and the stated reason.
 */
export interface PolicyDenial {
  /** Name of the tool that was denied. */
  tool: string;
  /** Reason text for the denial. */
  reason: string;
}
/**
 * One entry of a case transcript.
 *
 * `type` is the only required field; which of the optional fields accompany
 * each entry type is not enforced by this shape.
 */
export interface EvalTraceEntry {
  type: "thinking" | "text" | "tool_use" | "tool_result";
  text?: string;
  /** Tool name, when the entry concerns a tool. */
  tool?: string;
  /** Tool input of unspecified shape; narrow before use. */
  input?: unknown;
  isError?: boolean;
}
/**
 * Result of running a single eval case within a run.
 */
export interface CaseResult {
  /** Identifier of the case this result belongs to. */
  caseId: string;
  prompt: string;
  /** Output text recorded for the case. */
  output: string;
  /** Tool calls as strings (presumably tool names — confirm). */
  toolCalls: string[];
  policyDenials: PolicyDenial[];
  transcript: EvalTraceEntry[];
  /** Cost; per the field name, expressed in USD. */
  costUsd: number;
  /** Latency; per the field name, expressed in milliseconds. */
  latencyMs: number;
  /** Individual scorer outcomes. */
  scores: ScoreResult[];
  /** Overall verdict for the case; how it is derived from `scores` is not visible here. */
  passed: boolean;
  /** Error text, when present. */
  error?: string;
}
/**
 * One execution of an eval suite, as returned by the `/evals/runs` endpoints.
 */
export interface EvalRun {
  /** Run identifier; used as the `:id` path segment of `/evals/runs/:id`. */
  _id: string;
  suiteId: string;
  suiteName: string;
  agentName: string;
  /** Lifecycle state of the run. */
  status: "running" | "completed" | "failed";
  /** Timestamp as a string; format not visible here. */
  startedAt: string;
  /** Timestamp as a string; optional. */
  completedAt?: string;
  /** Per-case results. */
  results: CaseResult[];
  /**
   * Aggregate counts. NOTE(review): whether `passRate` is a 0–1 fraction or a
   * percentage, and what `gatePassed` is compared against, is not visible here.
   */
  summary: { total: number; passed: number; passRate: number; gatePassed?: boolean };
  /** Error text, when present. */
  error?: string;
}
339+
251340export const api = {
252341 agents : ( ) => getJSON < { agents : Agent [ ] } > ( "/agents" ) . then ( ( d ) => d . agents ) ,
253342 registerAgent : ( input : RegisterAgentInput ) =>
@@ -307,4 +396,20 @@ export const api = {
307396 reqJSON < OPAPolicyDoc | { success ?: boolean } > ( "PUT" , `/opa-policies/${ encodeURIComponent ( id ) } ` , body ) ,
308397 deleteOpaPolicy : ( id : string ) =>
309398 reqJSON < { success ?: boolean } > ( "DELETE" , `/opa-policies/${ encodeURIComponent ( id ) } ` ) ,
399+
400+ // Evals — suite CRUD + run trigger + run readback.
evals: {
  /** Reads `/evals/suites` via `getJSON` and unwraps the `suites` array from the response. */
  listSuites: () => getJSON<{ suites: EvalSuite[] }>("/evals/suites").then((d) => d.suites),

  /** Reads one suite; `id` is URI-encoded before being placed in the path. */
  getSuite: (id: string) => getJSON<EvalSuite>(`/evals/suites/${encodeURIComponent(id)}`),

  /** Sends `body` to `/evals/suites` via `postJSON`; resolves to the suite in the response. */
  createSuite: (body: EvalSuiteInput) => postJSON<EvalSuite>("/evals/suites", body),

  /** Sends `body` with method PUT to the suite's path; resolves to the suite in the response. */
  updateSuite: (id: string, body: EvalSuiteInput) =>
    reqJSON<EvalSuite>("PUT", `/evals/suites/${encodeURIComponent(id)}`, body),

  /** Issues a DELETE for the suite's path. */
  deleteSuite: (id: string) =>
    reqJSON<{ ok: boolean }>("DELETE", `/evals/suites/${encodeURIComponent(id)}`),

  /** Posts an empty object to the suite's `/run` path; the response carries the new `runId`. */
  runSuite: (id: string) =>
    postJSON<{ runId: string }>(`/evals/suites/${encodeURIComponent(id)}/run`, {}),

  /**
   * Posts `{ agentName, count, focus }` to `/evals/generate` and unwraps the
   * `cases` array. `focus` is optional and is passed through as given.
   */
  generateCases: (agentName: string, count: number, focus?: string) =>
    postJSON<{ cases: EvalCase[] }>("/evals/generate", { agentName, count, focus }).then(
      (d) => d.cases,
    ),

  /**
   * Reads `/evals/runs` and unwraps the `runs` array. A truthy `suiteId` is
   * appended as a URI-encoded `?suite=` query; an omitted or empty `suiteId`
   * sends no query string.
   */
  listRuns: (suiteId?: string) =>
    getJSON<{ runs: EvalRun[] }>(
      `/evals/runs${suiteId ? `?suite=${encodeURIComponent(suiteId)}` : ""}`,
    ).then((d) => d.runs),

  /** Reads one run; `id` is URI-encoded before being placed in the path. */
  getRun: (id: string) => getJSON<EvalRun>(`/evals/runs/${encodeURIComponent(id)}`),
},
310415} ;
0 commit comments