From 018c2e5e68c372aa6fd53ac3007857d400341750 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 16:08:45 +1100 Subject: [PATCH 01/13] fix(studio): add /api/config endpoint and use configurable pass threshold Replace hardcoded `score >= 1` checks in 5 server endpoints with a configurable pass_threshold loaded from config.yaml in the runs directory. Defaults to PASS_THRESHOLD (0.8) from @agentv/core when no config exists. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/results/serve.ts | 15 +++-- .../cli/src/commands/results/studio-config.ts | 52 ++++++++++++++++++ .../commands/results/studio-config.test.ts | 55 +++++++++++++++++++ 3 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 apps/cli/src/commands/results/studio-config.ts create mode 100644 apps/cli/test/commands/results/studio-config.test.ts diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index ec39f2cc..bbae6a2f 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -36,6 +36,7 @@ import { resolveResultSourcePath, } from './manifest.js'; import { patchTestIds } from './shared.js'; +import { loadStudioConfig } from './studio-config.js'; // ── Source resolution ──────────────────────────────────────────────────── @@ -142,8 +143,12 @@ export function createApp( options?: { studioDir?: string }, ): Hono { const searchDir = cwd ?? resultDir; + const config = loadStudioConfig(searchDir); const app = new Hono(); + // Studio configuration + app.get('/api/config', (c) => c.json(config)); + // Dashboard HTML — serve Studio SPA (React app). const studioDistPath = options?.studioDir ?? resolveStudioDistDir(); if (!studioDistPath || !existsSync(path.join(studioDistPath, 'index.html'))) { @@ -278,7 +283,7 @@ export function createApp( const ds = r.dataset ?? r.target ?? 'default'; const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; entry.total++; - if (r.score >= 1) entry.passed++; + if (r.score >= config.pass_threshold) entry.passed++; entry.scoreSum += r.score; datasetMap.set(ds, entry); } @@ -318,7 +323,7 @@ export function createApp( datasets: new Set(), }; entry.total++; - if (r.score >= 1) entry.passed++; + if (r.score >= config.pass_threshold) entry.passed++; entry.scoreSum += r.score; entry.datasets.add(r.dataset ?? r.target ?? 'default'); categoryMap.set(cat, entry); @@ -354,7 +359,7 @@ export function createApp( const ds = r.dataset ?? r.target ?? 'default'; const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; entry.total++; - if (r.score >= 1) entry.passed++; + if (r.score >= config.pass_threshold) entry.passed++; entry.scoreSum += r.score; datasetMap.set(ds, entry); } @@ -601,7 +606,7 @@ export function createApp( entry.runFilenames.add(m.filename); if (r.target) entry.targets.add(r.target); entry.evalCount++; - if (r.score >= 1) entry.passedCount++; + if (r.score >= config.pass_threshold) entry.passedCount++; if (r.timestamp && r.timestamp > entry.lastTimestamp) { entry.lastTimestamp = r.timestamp; } @@ -652,7 +657,7 @@ export function createApp( entry.runFilenames.add(m.filename); if (r.experiment) entry.experiments.add(r.experiment); entry.evalCount++; - if (r.score >= 1) entry.passedCount++; + if (r.score >= config.pass_threshold) entry.passedCount++; targetMap.set(target, entry); } } catch { diff --git a/apps/cli/src/commands/results/studio-config.ts b/apps/cli/src/commands/results/studio-config.ts new file mode 100644 index 00000000..f1062fb5 --- /dev/null +++ b/apps/cli/src/commands/results/studio-config.ts @@ -0,0 +1,52 @@ +/** + * Studio configuration loader. + * + * Reads an optional `config.yaml` from the runs directory to configure + * AgentV Studio behavior (e.g., pass/fail threshold). + * + * config.yaml format: + * pass_threshold: 0.8 # score >= this value is considered "pass" + * + * If no config.yaml exists, defaults are used. + */ + +import { existsSync, readFileSync } from 'node:fs'; +import path from 'node:path'; + +import { PASS_THRESHOLD } from '@agentv/core'; +import { parse as parseYaml } from 'yaml'; + +export interface StudioConfig { + pass_threshold: number; +} + +const DEFAULTS: StudioConfig = { + pass_threshold: PASS_THRESHOLD, +}; + +/** + * Load studio config from `config.yaml` in the given runs directory. + * Returns defaults when the file does not exist or is empty. + * Clamps `pass_threshold` to [0, 1]. + */ +export function loadStudioConfig(runsDir: string): StudioConfig { + const configPath = path.join(runsDir, 'config.yaml'); + + if (!existsSync(configPath)) { + return { ...DEFAULTS }; + } + + const raw = readFileSync(configPath, 'utf-8'); + const parsed = parseYaml(raw); + + if (!parsed || typeof parsed !== 'object') { + return { ...DEFAULTS }; + } + + const threshold = + typeof parsed.pass_threshold === 'number' ? parsed.pass_threshold : DEFAULTS.pass_threshold; + + return { + pass_threshold: Math.min(1, Math.max(0, threshold)), + }; +} diff --git a/apps/cli/test/commands/results/studio-config.test.ts b/apps/cli/test/commands/results/studio-config.test.ts new file mode 100644 index 00000000..df0b1f46 --- /dev/null +++ b/apps/cli/test/commands/results/studio-config.test.ts @@ -0,0 +1,55 @@ +import { mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; + +import { PASS_THRESHOLD } from '@agentv/core'; + +import { loadStudioConfig } from '../../../src/commands/results/studio-config.js'; + +describe('loadStudioConfig', () => { + let tempDir: string; + + beforeEach(() => { + tempDir = mkdtempSync(path.join(tmpdir(), 'studio-config-')); + }); + + afterEach(() => { + rmSync(tempDir, { recursive: true, force: true }); + }); + + it('returns defaults when no config.yaml exists', () => { + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(PASS_THRESHOLD); + }); + + it('reads pass_threshold from config.yaml', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: 0.6\n'); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(0.6); + }); + + it('clamps pass_threshold to 0 when negative', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: -0.5\n'); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(0); + }); + + it('clamps pass_threshold to 1 when above 1', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: 1.5\n'); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(1); + }); + + it('returns defaults for empty config.yaml', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), ''); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(PASS_THRESHOLD); + }); + + it('returns defaults when pass_threshold is not a number', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: "high"\n'); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(PASS_THRESHOLD); + }); +}); From c0e77992c5afee7a522e847f28cab1dc24de7caa Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 16:10:37 +1100 Subject: [PATCH 02/13] feat(studio): add StudioConfigResponse type, useStudioConfig hook, and isPassing helper Co-Authored-By: Claude Sonnet 4.6 --- apps/studio/src/lib/api.ts | 18 ++++++++++++++++++ apps/studio/src/lib/types.ts | 4 ++++ 2 files changed, 22 insertions(+) diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index a1d6c7f0..bcd55aab 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -18,6 +18,7 @@ import type { IndexResponse, RunDetailResponse, RunListResponse, + StudioConfigResponse, TargetsResponse, } from './types'; @@ -126,6 +127,12 @@ export function categoryDatasetsOptions(runId: string, category: string) { }); } +export const studioConfigOptions = queryOptions({ + queryKey: ['config'], + queryFn: () => fetchJson('/api/config'), + staleTime: 60_000, +}); + // ── Hooks ─────────────────────────────────────────────────────────────── export function useRunList() { @@ -175,3 +182,14 @@ export function useRunCategories(runId: string) { export function useCategoryDatasets(runId: string, category: string) { return useQuery(categoryDatasetsOptions(runId, category)); } + +export function useStudioConfig() { + return useQuery(studioConfigOptions); +} + +/** Default pass threshold matching @agentv/core PASS_THRESHOLD */ +export const DEFAULT_PASS_THRESHOLD = 0.8; + +export function isPassing(score: number, passThreshold: number = DEFAULT_PASS_THRESHOLD): boolean { + return score >= passThreshold; +} diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 4c1ef6f0..6555dbe7 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -164,3 +164,7 @@ export interface CategorySummary { export interface CategoriesResponse { categories: CategorySummary[]; } + +export interface StudioConfigResponse { + pass_threshold: number; +} From b7e4bf37ed703fb4a6aa69d9511174f0481683f2 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 16:11:34 +1100 Subject: [PATCH 03/13] fix(trace): use PASS_THRESHOLD constant instead of hardcoded 1.0 in utils Replace hardcoded score >= 1.0 with PASS_THRESHOLD (0.8) in listResultFiles pass count calculation so it aligns with the standard evaluation threshold. Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/trace/utils.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts index d6b51925..7baec2dc 100644 --- a/apps/cli/src/commands/trace/utils.ts +++ b/apps/cli/src/commands/trace/utils.ts @@ -1,7 +1,7 @@ import { readFileSync, readdirSync, statSync } from 'node:fs'; import path from 'node:path'; import type { EvaluationResult, TraceSummary } from '@agentv/core'; -import { toCamelCaseDeep } from '@agentv/core'; +import { PASS_THRESHOLD, toCamelCaseDeep } from '@agentv/core'; import { RESULT_INDEX_FILENAME, RESULT_RUNS_DIRNAME, @@ -596,7 +596,7 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { const results = loadResultFile(filePath); const testCount = results.length; - const passCount = results.filter((r) => r.score >= 1.0).length; + const passCount = results.filter((r) => r.score >= PASS_THRESHOLD).length; const passRate = testCount > 0 ? passCount / testCount : 0; const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0; From e9720c733d4f0f6389e0ba1023d080591dad61bf Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 16:12:30 +1100 Subject: [PATCH 04/13] fix(studio): use configurable pass threshold in EvalDetail failure logic Replace hardcoded score < 1 checks with isPassing(score, passThreshold) using the studio config's pass_threshold (default 0.8). Co-Authored-By: Claude Sonnet 4.6 --- apps/studio/src/components/EvalDetail.tsx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index 420ed976..3c638cfb 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -8,7 +8,7 @@ import { useState } from 'react'; -import { useEvalFileContent, useEvalFiles } from '~/lib/api'; +import { useEvalFileContent, useEvalFiles, useStudioConfig, isPassing } from '~/lib/api'; import type { EvalResult } from '~/lib/types'; import { FeedbackPanel } from './FeedbackPanel'; @@ -119,9 +119,11 @@ export function EvalDetail({ eval: result, runId }: EvalDetailProps) { } function StepsTab({ result }: { result: EvalResult }) { + const { data: config } = useStudioConfig(); + const passThreshold = config?.pass_threshold ?? 0.8; const assertions = result.assertions ?? []; const hasFailed = - result.score < 1 || result.executionStatus === 'error' || result.executionStatus === 'failed'; + !isPassing(result.score, passThreshold) || result.executionStatus === 'error' || result.executionStatus === 'failed'; // Collect failure reasons from multiple sources const failureReasons: string[] = []; @@ -138,7 +140,7 @@ function StepsTab({ result }: { result: EvalResult }) { // Also check per-evaluator scores for failure details if (result.scores) { for (const s of result.scores) { - if (s.score < 1 && s.details) { + if (!isPassing(s.score, passThreshold) && s.details) { const detailStr = typeof s.details === 'string' ? s.details : JSON.stringify(s.details, null, 2); failureReasons.push(`[${s.name ?? s.type ?? 'evaluator'}] ${detailStr}`); From 5661b38f2a06c77eef6a1f8e3327927a5bcc34ae Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 16:12:37 +1100 Subject: [PATCH 05/13] fix(studio): use configurable pass threshold in RunDetail instead of hardcoded score >= 1 Co-Authored-By: Claude Sonnet 4.6 --- apps/studio/src/components/RunDetail.tsx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 01961138..53ab4664 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -10,6 +10,7 @@ import { useState } from 'react'; import type { EvalResult } from '~/lib/types'; +import { useStudioConfig, isPassing } from '~/lib/api'; import { ScoreBar } from './ScoreBar'; import { StatsCards } from './StatsCards'; @@ -35,7 +36,7 @@ interface CategoryGroup { avgScore: number; } -function buildCategoryGroups(results: EvalResult[]): CategoryGroup[] { +function buildCategoryGroups(results: EvalResult[], passThreshold: number): CategoryGroup[] { const categoryMap = new Map< string, Map @@ -50,7 +51,7 @@ function buildCategoryGroups(results: EvalResult[]): CategoryGroup[] { const entry = dsMap.get(ds) ?? { passed: 0, failed: 0, total: 0, scoreSum: 0 }; entry.total += 1; entry.scoreSum += r.score; - if (r.score >= 1) entry.passed += 1; + if (isPassing(r.score, passThreshold)) entry.passed += 1; else entry.failed += 1; dsMap.set(ds, entry); } @@ -83,13 +84,16 @@ function buildCategoryGroups(results: EvalResult[]): CategoryGroup[] { } export function RunDetail({ results, runId }: RunDetailProps) { + const { data: config } = useStudioConfig(); + const passThreshold = config?.pass_threshold ?? 0.8; + const total = results.length; - const passed = results.filter((r) => r.score >= 1).length; + const passed = results.filter((r) => isPassing(r.score, passThreshold)).length; const failed = total - passed; const passRate = total > 0 ? passed / total : 0; const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); - const categories = buildCategoryGroups(results); + const categories = buildCategoryGroups(results, passThreshold); const hasMultipleCategories = categories.length > 1; if (total === 0) { From 1fc5d7d9a83a3bec74cc517129da7e009af41d1a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 16:13:13 +1100 Subject: [PATCH 06/13] fix(studio): use configurable pass threshold in Sidebar and dataset page Replace hardcoded `score >= 1` checks with `isPassing(score, passThreshold)` using the `useStudioConfig` hook in EvalSidebar, DatasetSidebar, and DatasetPage. Co-Authored-By: Claude Sonnet 4.6 --- apps/studio/src/components/Sidebar.tsx | 10 +++++++--- .../src/routes/runs/$runId_.dataset.$dataset.tsx | 6 ++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 3ca1d43e..0a41704d 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -10,7 +10,7 @@ import { Link, useMatchRoute } from '@tanstack/react-router'; -import { useCategoryDatasets, useExperiments, useRunDetail, useRunList } from '~/lib/api'; +import { isPassing, useCategoryDatasets, useExperiments, useRunDetail, useRunList, useStudioConfig } from '~/lib/api'; export function Sidebar() { const matchRoute = useMatchRoute(); @@ -109,6 +109,8 @@ function RunSidebar() { function EvalSidebar({ runId, currentEvalId }: { runId: string; currentEvalId: string }) { const { data } = useRunDetail(runId); + const { data: config } = useStudioConfig(); + const passThreshold = config?.pass_threshold ?? 0.8; return ( ); } diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index bcd55aab..5b340056 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -130,7 +130,7 @@ export function categoryDatasetsOptions(runId: string, category: string) { export const studioConfigOptions = queryOptions({ queryKey: ['config'], queryFn: () => fetchJson('/api/config'), - staleTime: 60_000, + staleTime: 5_000, }); // ── Hooks ─────────────────────────────────────────────────────────────── @@ -193,3 +193,17 @@ export const DEFAULT_PASS_THRESHOLD = 0.8; export function isPassing(score: number, passThreshold: number = DEFAULT_PASS_THRESHOLD): boolean { return score >= passThreshold; } + +export async function saveStudioConfig( + config: Partial, +): Promise { + const res = await fetch('/api/config', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(config), + }); + if (!res.ok) { + throw new Error(`Failed to save config: ${res.status}`); + } + return res.json() as Promise; +} diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts index 118360b3..eed4c15d 100644 --- a/apps/studio/src/routeTree.gen.ts +++ b/apps/studio/src/routeTree.gen.ts @@ -9,6 +9,7 @@ // Additionally, you should also exclude this file from your linter and/or formatter to prevent it from being checked or modified. import { Route as rootRouteImport } from './routes/__root' +import { Route as SettingsRouteImport } from './routes/settings' import { Route as IndexRouteImport } from './routes/index' import { Route as RunsRunIdRouteImport } from './routes/runs/$runId' import { Route as ExperimentsExperimentNameRouteImport } from './routes/experiments/$experimentName' @@ -16,6 +17,11 @@ import { Route as EvalsRunIdEvalIdRouteImport } from './routes/evals/$runId.$eva import { Route as RunsRunIdDatasetDatasetRouteImport } from './routes/runs/$runId_.dataset.$dataset' import { Route as RunsRunIdCategoryCategoryRouteImport } from './routes/runs/$runId_.category.$category' +const SettingsRoute = SettingsRouteImport.update({ + id: '/settings', + path: '/settings', + getParentRoute: () => rootRouteImport, +} as any) const IndexRoute = IndexRouteImport.update({ id: '/', path: '/', @@ -51,6 +57,7 @@ const RunsRunIdCategoryCategoryRoute = export interface FileRoutesByFullPath { '/': typeof IndexRoute + '/settings': typeof SettingsRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute @@ -59,6 +66,7 @@ export interface FileRoutesByFullPath { } export interface FileRoutesByTo { '/': typeof IndexRoute + '/settings': typeof SettingsRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute @@ -68,6 +76,7 @@ export interface FileRoutesByTo { export interface FileRoutesById { __root__: typeof rootRouteImport '/': typeof IndexRoute + '/settings': typeof SettingsRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute @@ -78,6 +87,7 @@ export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath fullPaths: | '/' + | '/settings' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' @@ -86,6 +96,7 @@ export interface FileRouteTypes { fileRoutesByTo: FileRoutesByTo to: | '/' + | '/settings' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' @@ -94,6 +105,7 @@ export interface FileRouteTypes { id: | '__root__' | '/' + | '/settings' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' @@ -103,6 +115,7 @@ export interface FileRouteTypes { } export interface RootRouteChildren { IndexRoute: typeof IndexRoute + SettingsRoute: typeof SettingsRoute ExperimentsExperimentNameRoute: typeof ExperimentsExperimentNameRoute RunsRunIdRoute: typeof RunsRunIdRoute EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute @@ -112,6 +125,13 @@ export interface RootRouteChildren { declare module '@tanstack/react-router' { interface FileRoutesByPath { + '/settings': { + id: '/settings' + path: '/settings' + fullPath: '/settings' + preLoaderRoute: typeof SettingsRouteImport + parentRoute: typeof rootRouteImport + } '/': { id: '/' path: '/' @@ -159,6 +179,7 @@ declare module '@tanstack/react-router' { const rootRouteChildren: RootRouteChildren = { IndexRoute: IndexRoute, + SettingsRoute: SettingsRoute, ExperimentsExperimentNameRoute: ExperimentsExperimentNameRoute, RunsRunIdRoute: RunsRunIdRoute, EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute, diff --git a/apps/studio/src/routes/settings.tsx b/apps/studio/src/routes/settings.tsx new file mode 100644 index 00000000..d404192a --- /dev/null +++ b/apps/studio/src/routes/settings.tsx @@ -0,0 +1,126 @@ +/** + * Settings page for configuring AgentV Studio behavior. + * + * Reads and writes to .agentv/config.yaml via the /api/config endpoint. + * Changes take effect immediately on page refresh. + */ + +import { useQueryClient } from '@tanstack/react-query'; +import { createFileRoute } from '@tanstack/react-router'; +import { useState } from 'react'; + +import { DEFAULT_PASS_THRESHOLD, saveStudioConfig, useStudioConfig } from '~/lib/api'; + +export const Route = createFileRoute('/settings')({ + component: SettingsPage, +}); + +function SettingsPage() { + const { data: config, isLoading } = useStudioConfig(); + const queryClient = useQueryClient(); + const [threshold, setThreshold] = useState(''); + const [saving, setSaving] = useState(false); + const [message, setMessage] = useState<{ type: 'success' | 'error'; text: string } | null>(null); + + const currentThreshold = config?.pass_threshold ?? DEFAULT_PASS_THRESHOLD; + const displayThreshold = threshold || String(currentThreshold); + + const handleSave = async () => { + const value = Number.parseFloat(threshold || String(currentThreshold)); + if (Number.isNaN(value) || value < 0 || value > 1) { + setMessage({ type: 'error', text: 'Threshold must be a number between 0 and 1' }); + return; + } + + setSaving(true); + setMessage(null); + try { + await saveStudioConfig({ pass_threshold: value }); + await queryClient.invalidateQueries({ queryKey: ['config'] }); + setThreshold(''); + setMessage({ type: 'success', text: 'Settings saved' }); + setTimeout(() => setMessage(null), 3000); + } catch { + setMessage({ type: 'error', text: 'Failed to save settings' }); + } finally { + setSaving(false); + } + }; + + if (isLoading) { + return ( +
+
+
+
+ ); + } + + return ( +
+
+

Settings

+

Configure your AgentV Studio dashboard

+
+ + {/* Pass Threshold Card */} +
+

Evaluation

+

+ Configure how evaluation results are classified +

+ +
+
+ +

+ Score at or above this value is considered passing. Default: {DEFAULT_PASS_THRESHOLD} +

+
+ setThreshold(e.target.value)} + className="w-32 rounded-md border border-gray-700 bg-gray-800 px-3 py-2 text-sm text-white placeholder-gray-500 focus:border-cyan-500 focus:outline-none focus:ring-1 focus:ring-cyan-500" + /> + + ({Math.round((Number.parseFloat(displayThreshold) || 0) * 100)}%) + +
+
+
+ +
+ + {message && ( + + {message.text} + + )} +
+
+ + {/* Config file info */} +
+

+ Settings are stored in .agentv/config.yaml +

+
+
+ ); +} From 74e4c0f39383273f2cfd7c26f1f9611bfe171f2c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 17:28:58 +1100 Subject: [PATCH 13/13] perf(studio): hoist loadStudioConfig outside loops in serve endpoints Read config once per request, not once per result row. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/results/serve.ts | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index d801c129..14a9b855 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -293,12 +293,13 @@ export function createApp( } try { const loaded = patchTestIds(loadManifestResults(meta.path)); + const { pass_threshold } = loadStudioConfig(agentvDir); const datasetMap = new Map(); for (const r of loaded) { const ds = r.dataset ?? r.target ?? 'default'; const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; entry.total++; - if (r.score >= loadStudioConfig(agentvDir).pass_threshold) entry.passed++; + if (r.score >= pass_threshold) entry.passed++; entry.scoreSum += r.score; datasetMap.set(ds, entry); } @@ -325,6 +326,7 @@ export function createApp( } try { const loaded = patchTestIds(loadManifestResults(meta.path)); + const { pass_threshold } = loadStudioConfig(agentvDir); const categoryMap = new Map< string, { total: number; passed: number; scoreSum: number; datasets: Set } @@ -338,7 +340,7 @@ export function createApp( datasets: new Set(), }; entry.total++; - if (r.score >= loadStudioConfig(agentvDir).pass_threshold) entry.passed++; + if (r.score >= pass_threshold) entry.passed++; entry.scoreSum += r.score; entry.datasets.add(r.dataset ?? r.target ?? 'default'); categoryMap.set(cat, entry); @@ -368,13 +370,14 @@ export function createApp( } try { const loaded = patchTestIds(loadManifestResults(meta.path)); + const { pass_threshold } = loadStudioConfig(agentvDir); const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category); const datasetMap = new Map(); for (const r of filtered) { const ds = r.dataset ?? r.target ?? 'default'; const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; entry.total++; - if (r.score >= loadStudioConfig(agentvDir).pass_threshold) entry.passed++; + if (r.score >= pass_threshold) entry.passed++; entry.scoreSum += r.score; datasetMap.set(ds, entry); } @@ -595,6 +598,7 @@ export function createApp( // Experiments aggregate (group all runs by experiment) app.get('/api/experiments', (c) => { const metas = listResultFiles(searchDir); + const { pass_threshold } = loadStudioConfig(agentvDir); const experimentMap = new Map< string, { @@ -621,7 +625,7 @@ export function createApp( entry.runFilenames.add(m.filename); if (r.target) entry.targets.add(r.target); entry.evalCount++; - if (r.score >= loadStudioConfig(agentvDir).pass_threshold) entry.passedCount++; + if (r.score >= pass_threshold) entry.passedCount++; if (r.timestamp && r.timestamp > entry.lastTimestamp) { entry.lastTimestamp = r.timestamp; } @@ -648,6 +652,7 @@ export function createApp( // Targets aggregate (group all runs by target) app.get('/api/targets', (c) => { const metas = listResultFiles(searchDir); + const { pass_threshold } = loadStudioConfig(agentvDir); const targetMap = new Map< string, { @@ -672,7 +677,7 @@ export function createApp( entry.runFilenames.add(m.filename); if (r.experiment) entry.experiments.add(r.experiment); entry.evalCount++; - if (r.score >= loadStudioConfig(agentvDir).pass_threshold) entry.passedCount++; + if (r.score >= pass_threshold) entry.passedCount++; targetMap.set(target, entry); } } catch {