Skip to content

Commit bf783c5

Browse files
Merge remote-tracking branch 'origin/deploy' into feat/agentos-overhaul
# Conflicts: # agentos/src/App.tsx # packages/agentos-server/src/mongo.ts
2 parents f1ead75 + 65c4afd commit bf783c5

11 files changed

Lines changed: 2215 additions & 1 deletion

File tree

agentos/src/App.tsx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
import { Home as HomeIcon, Activity, Shield, Boxes } from "lucide-react";
1+
import { Home as HomeIcon, Activity, Shield, Boxes, FlaskConical } from "lucide-react";
22
import { NavLink, Navigate, Outlet, Route, Routes, useLocation, useNavigate } from "react-router-dom";
33
import { HomePage } from "./components/HomePage.tsx";
44
import { PoliciesPage } from "./components/PoliciesPage.tsx";
5+
import { EvalsPage } from "./components/EvalsPage.tsx";
56
import { ObservabilityTab } from "./components/observability/ObservabilityTab.tsx";
67
import { RegistryPage } from "./components/RegistryPage.tsx";
78
import { AgentDashboard } from "./components/AgentDashboard.tsx";
@@ -18,6 +19,7 @@ export default function App() {
1819
<Route path="registry" element={<RegistryRoute />} />
1920
<Route path="observability" element={<ObservabilityTab />} />
2021
<Route path="policies" element={<PoliciesPage />} />
22+
<Route path="evals" element={<EvalsPage />} />
2123
<Route path="agents/:id" element={<AgentDashboard />} />
2224
<Route path="*" element={<Navigate to="/home" replace />} />
2325
</Route>
@@ -49,6 +51,7 @@ function Layout() {
4951
<RailLink to="/registry" icon={Boxes} label="Agent Registry" active={registryActive} />
5052
<RailLink to="/observability" icon={Activity} label="Observability" />
5153
<RailLink to="/policies" icon={Shield} label="Policies" />
54+
<RailLink to="/evals" icon={FlaskConical} label="Agent Simulation Engine" />
5255
</nav>
5356

5457
<div className="flex-1" />

agentos/src/api.ts

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,95 @@ async function reqJSON<T>(method: string, path: string, body?: unknown): Promise
248248
return r.json() as Promise<T>;
249249
}
250250

251+
// ── Evals ──────────────────────────────────────────────────────────────────
252+
/**
 * A "golden" expectation attached to an eval case (see `EvalCase.golden`).
 *
 * - `mode`  — one of "exact", "contains" or "regex".
 *             NOTE(review): presumably selects how `value` is compared with
 *             the agent output; the comparison itself is not in this file.
 * - `value` — the expected string (or pattern source when `mode` is "regex" — TODO confirm).
 */
export interface GoldenExpectation {
253+
mode: "exact" | "contains" | "regex";
254+
value: string;
255+
}
256+
/**
 * One case inside an `EvalSuite` (`EvalSuite.cases`).
 *
 * Only `id` and `prompt` are required; every other field is optional.
 * NOTE(review): the optional fields look like per-case inputs for the
 * scorers named in `ScorerConfig` (criteria → taskSuccess, golden → golden,
 * expected/forbidden tools → toolCompliance, max cost/latency → nfr) —
 * confirm against the server-side scorer implementation.
 */
export interface EvalCase {
257+
id: string;
258+
prompt: string;
259+
criteria?: string;
260+
golden?: GoldenExpectation;
261+
expectedTools?: string[];
262+
forbiddenTools?: string[];
263+
maxCostUsd?: number;
264+
maxLatencyMs?: number;
265+
}
266+
/**
 * Per-suite boolean flags, one per scorer.
 *
 * The four keys are the same identifiers used by `ScoreResult.scorer`.
 * NOTE(review): presumably `true` enables the scorer for the suite — confirm
 * with the server, which is where the flags are interpreted.
 */
export interface ScorerConfig {
267+
taskSuccess: boolean;
268+
toolCompliance: boolean;
269+
golden: boolean;
270+
nfr: boolean;
271+
}
272+
/**
 * An eval suite as returned by the `/evals/suites` endpoints.
 *
 * Required: `_id`, `name`, `agentName`, `cases`, `scorers`.
 * `judges` is an optional list of `JudgeDef`; the `judgeModel` /
 * `judgePrompt` / `judgePassThreshold` fields are the older single-judge
 * form kept for back-compat (see the inline comment below).
 * `createdAt` / `updatedAt` are strings — presumably ISO timestamps set by
 * the server (TODO confirm format).
 */
export interface EvalSuite {
273+
_id: string;
274+
name: string;
275+
description?: string;
276+
agentName: string;
277+
cases: EvalCase[];
278+
scorers: ScorerConfig;
279+
judges?: JudgeDef[];
280+
// legacy single-judge fields (read-only back-compat)
281+
judgeModel?: string;
282+
judgePrompt?: string;
283+
judgePassThreshold?: number;
284+
passThreshold?: number;
285+
createdAt?: string;
286+
updatedAt?: string;
287+
}
288+
/**
 * One judge definition in `EvalSuite.judges`.
 *
 * `id` and `name` are required; `rubric`, `model` and `passThreshold` are
 * optional. NOTE(review): the defaults applied when they are omitted, and
 * the scale of `passThreshold`, are not visible in this file — confirm
 * server-side.
 */
export interface JudgeDef {
289+
id: string;
290+
name: string;
291+
rubric?: string;
292+
model?: string;
293+
passThreshold?: number;
294+
}
295+
/**
 * Request-body shape for `api.evals.createSuite` / `api.evals.updateSuite`:
 * an `EvalSuite` without `_id`, `createdAt` and `updatedAt`.
 */
export type EvalSuiteInput = Omit<EvalSuite, "_id" | "createdAt" | "updatedAt">;
296+
/**
 * Outcome of a single scorer for one case (`CaseResult.scores`).
 *
 * - `scorer` — which scorer produced the result; same four identifiers as
 *              the keys of `ScorerConfig`.
 * - `passed` — required pass/fail flag.
 * - `label`, `score`, `detail` — optional extras.
 *   NOTE(review): the range of `score` is not established here — confirm.
 */
export interface ScoreResult {
297+
scorer: "taskSuccess" | "toolCompliance" | "golden" | "nfr";
298+
label?: string;
299+
passed: boolean;
300+
score?: number;
301+
detail?: string;
302+
}
303+
/**
 * A tool name paired with a reason string, recorded per case in
 * `CaseResult.policyDenials`.
 * NOTE(review): presumably one entry per tool call rejected by policy —
 * confirm with the producer of these records.
 */
export interface PolicyDenial {
304+
tool: string;
305+
reason: string;
306+
}
307+
/**
 * One entry of a case transcript (`CaseResult.transcript`).
 *
 * `type` is the only required field; it is one of "thinking", "text",
 * "tool_use" or "tool_result". All other fields are optional.
 * NOTE(review): which optional fields accompany which `type` is not encoded
 * in the type (this is not a discriminated union) — verify against the
 * server before relying on any of them being present.
 */
export interface EvalTraceEntry {
308+
type: "thinking" | "text" | "tool_use" | "tool_result";
309+
text?: string;
310+
tool?: string;
311+
input?: unknown;
312+
isError?: boolean;
313+
}
314+
/**
 * Result of running one eval case; elements of `EvalRun.results`.
 *
 * Carries the case id and prompt, the agent `output`, the list of tool
 * names called, any `PolicyDenial`s, the transcript, cost and latency
 * figures (field names indicate USD and milliseconds), the individual
 * `ScoreResult`s and an overall `passed` flag. `error` is optional.
 * NOTE(review): how `passed` is derived from `scores` is decided elsewhere —
 * confirm before recomputing it client-side.
 */
export interface CaseResult {
315+
caseId: string;
316+
prompt: string;
317+
output: string;
318+
toolCalls: string[];
319+
policyDenials: PolicyDenial[];
320+
transcript: EvalTraceEntry[];
321+
costUsd: number;
322+
latencyMs: number;
323+
scores: ScoreResult[];
324+
passed: boolean;
325+
error?: string;
326+
}
327+
/**
 * One execution of an eval suite, as returned by the `/evals/runs` endpoints.
 *
 * - `status`      — "running", "completed" or "failed".
 * - `startedAt`   — required string; `completedAt` is optional
 *                   (presumably absent while the run is in progress — TODO confirm).
 * - `results`     — per-case results.
 * - `summary`     — totals plus `passRate` and an optional `gatePassed`.
 *                   NOTE(review): whether `passRate` is a 0–1 fraction or a
 *                   percentage is not established here — confirm.
 * - `error`       — optional run-level error message.
 */
export interface EvalRun {
328+
_id: string;
329+
suiteId: string;
330+
suiteName: string;
331+
agentName: string;
332+
status: "running" | "completed" | "failed";
333+
startedAt: string;
334+
completedAt?: string;
335+
results: CaseResult[];
336+
summary: { total: number; passed: number; passRate: number; gatePassed?: boolean };
337+
error?: string;
338+
}
339+
251340
export const api = {
252341
agents: () => getJSON<{ agents: Agent[] }>("/agents").then((d) => d.agents),
253342
registerAgent: (input: RegisterAgentInput) =>
@@ -307,4 +396,20 @@ export const api = {
307396
reqJSON<OPAPolicyDoc | { success?: boolean }>("PUT", `/opa-policies/${encodeURIComponent(id)}`, body),
308397
deleteOpaPolicy: (id: string) =>
309398
reqJSON<{ success?: boolean }>("DELETE", `/opa-policies/${encodeURIComponent(id)}`),
399+
400+
// Evals — suite CRUD + run trigger + run readback.
401+
evals: {
402+
// Fetches "/evals/suites" and unwraps the `suites` array from the response.
listSuites: () => getJSON<{ suites: EvalSuite[] }>("/evals/suites").then((d) => d.suites),
403+
// Fetches a single suite; the id is URI-encoded into the path.
getSuite: (id: string) => getJSON<EvalSuite>(`/evals/suites/${encodeURIComponent(id)}`),
404+
// Posts a new suite definition and resolves with the stored `EvalSuite`.
createSuite: (body: EvalSuiteInput) => postJSON<EvalSuite>("/evals/suites", body),
405+
// PUTs the full suite definition to the suite's URL.
updateSuite: (id: string, body: EvalSuiteInput) =>
406+
reqJSON<EvalSuite>("PUT", `/evals/suites/${encodeURIComponent(id)}`, body),
407+
// Issues DELETE on the suite's URL; the response is typed as `{ ok }`.
deleteSuite: (id: string) => reqJSON<{ ok: boolean }>("DELETE", `/evals/suites/${encodeURIComponent(id)}`),
408+
// Posts an empty body to the suite's `/run` URL and resolves with `{ runId }`.
// NOTE(review): presumably the run continues asynchronously and is read back
// via `getRun` — confirm with the server.
runSuite: (id: string) => postJSON<{ runId: string }>(`/evals/suites/${encodeURIComponent(id)}/run`, {}),
409+
// Posts `{ agentName, count, focus }` to "/evals/generate" and unwraps `cases`.
generateCases: (agentName: string, count: number, focus?: string) =>
410+
postJSON<{ cases: EvalCase[] }>("/evals/generate", { agentName, count, focus }).then((d) => d.cases),
411+
// Lists runs; appends `?suite=<encoded id>` only when `suiteId` is truthy,
// so an empty string behaves like no filter.
listRuns: (suiteId?: string) =>
412+
getJSON<{ runs: EvalRun[] }>(`/evals/runs${suiteId ? `?suite=${encodeURIComponent(suiteId)}` : ""}`).then((d) => d.runs),
413+
// Fetches a single run; the id is URI-encoded into the path.
getRun: (id: string) => getJSON<EvalRun>(`/evals/runs/${encodeURIComponent(id)}`),
414+
},
310415
};

0 commit comments

Comments
 (0)