diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index ec39f2cc..14a9b855 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -36,6 +36,7 @@ import { resolveResultSourcePath, } from './manifest.js'; import { patchTestIds } from './shared.js'; +import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js'; // ── Source resolution ──────────────────────────────────────────────────── @@ -142,8 +143,27 @@ export function createApp( options?: { studioDir?: string }, ): Hono { const searchDir = cwd ?? resultDir; + const agentvDir = path.join(searchDir, '.agentv'); const app = new Hono(); + // Studio configuration (re-read on each request so external edits are picked up) + app.get('/api/config', (c) => c.json(loadStudioConfig(agentvDir))); + + app.post('/api/config', async (c) => { + try { + const body = await c.req.json>(); + const current = loadStudioConfig(agentvDir); + const updated = { ...current, ...body }; + if (typeof updated.pass_threshold === 'number') { + updated.pass_threshold = Math.min(1, Math.max(0, updated.pass_threshold)); + } + saveStudioConfig(agentvDir, updated); + return c.json(updated); + } catch { + return c.json({ error: 'Failed to save config' }, 500); + } + }); + // Dashboard HTML — serve Studio SPA (React app). const studioDistPath = options?.studioDir ?? resolveStudioDistDir(); if (!studioDistPath || !existsSync(path.join(studioDistPath, 'index.html'))) { @@ -273,12 +293,13 @@ export function createApp( } try { const loaded = patchTestIds(loadManifestResults(meta.path)); + const { pass_threshold } = loadStudioConfig(agentvDir); const datasetMap = new Map(); for (const r of loaded) { const ds = r.dataset ?? r.target ?? 'default'; const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; entry.total++; - if (r.score >= 1) entry.passed++; + if (r.score >= pass_threshold) entry.passed++; entry.scoreSum += r.score; datasetMap.set(ds, entry); } @@ -305,6 +326,7 @@ export function createApp( } try { const loaded = patchTestIds(loadManifestResults(meta.path)); + const { pass_threshold } = loadStudioConfig(agentvDir); const categoryMap = new Map< string, { total: number; passed: number; scoreSum: number; datasets: Set } @@ -318,7 +340,7 @@ export function createApp( datasets: new Set(), }; entry.total++; - if (r.score >= 1) entry.passed++; + if (r.score >= pass_threshold) entry.passed++; entry.scoreSum += r.score; entry.datasets.add(r.dataset ?? r.target ?? 'default'); categoryMap.set(cat, entry); @@ -348,13 +370,14 @@ export function createApp( } try { const loaded = patchTestIds(loadManifestResults(meta.path)); + const { pass_threshold } = loadStudioConfig(agentvDir); const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category); const datasetMap = new Map(); for (const r of filtered) { const ds = r.dataset ?? r.target ?? 'default'; const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; entry.total++; - if (r.score >= 1) entry.passed++; + if (r.score >= pass_threshold) entry.passed++; entry.scoreSum += r.score; datasetMap.set(ds, entry); } @@ -575,6 +598,7 @@ export function createApp( // Experiments aggregate (group all runs by experiment) app.get('/api/experiments', (c) => { const metas = listResultFiles(searchDir); + const { pass_threshold } = loadStudioConfig(agentvDir); const experimentMap = new Map< string, { @@ -601,7 +625,7 @@ export function createApp( entry.runFilenames.add(m.filename); if (r.target) entry.targets.add(r.target); entry.evalCount++; - if (r.score >= 1) entry.passedCount++; + if (r.score >= pass_threshold) entry.passedCount++; if (r.timestamp && r.timestamp > entry.lastTimestamp) { entry.lastTimestamp = r.timestamp; } @@ -628,6 +652,7 @@ export function createApp( // Targets aggregate (group all runs by target) app.get('/api/targets', (c) => { const metas = listResultFiles(searchDir); + const { pass_threshold } = loadStudioConfig(agentvDir); const targetMap = new Map< string, { @@ -652,7 +677,7 @@ export function createApp( entry.runFilenames.add(m.filename); if (r.experiment) entry.experiments.add(r.experiment); entry.evalCount++; - if (r.score >= 1) entry.passedCount++; + if (r.score >= pass_threshold) entry.passedCount++; targetMap.set(target, entry); } } catch { diff --git a/apps/cli/src/commands/results/studio-config.ts b/apps/cli/src/commands/results/studio-config.ts new file mode 100644 index 00000000..a89f4adc --- /dev/null +++ b/apps/cli/src/commands/results/studio-config.ts @@ -0,0 +1,67 @@ +/** + * Studio configuration loader. + * + * Reads an optional `config.yaml` from the `.agentv/` directory to configure + * AgentV Studio behavior (e.g., pass/fail threshold). + * + * Location: `.agentv/config.yaml` + * + * config.yaml format: + * pass_threshold: 0.8 # score >= this value is considered "pass" + * + * If no config.yaml exists, defaults are used. + */ + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'; +import path from 'node:path'; + +import { PASS_THRESHOLD } from '@agentv/core'; +import { parse as parseYaml, stringify as stringifyYaml } from 'yaml'; + +export interface StudioConfig { + pass_threshold: number; +} + +const DEFAULTS: StudioConfig = { + pass_threshold: PASS_THRESHOLD, +}; + +/** + * Load studio config from `config.yaml` in the given `.agentv/` directory. + * Returns defaults when the file does not exist or is empty. + * Clamps `pass_threshold` to [0, 1]. + */ +export function loadStudioConfig(agentvDir: string): StudioConfig { + const configPath = path.join(agentvDir, 'config.yaml'); + + if (!existsSync(configPath)) { + return { ...DEFAULTS }; + } + + const raw = readFileSync(configPath, 'utf-8'); + const parsed = parseYaml(raw); + + if (!parsed || typeof parsed !== 'object') { + return { ...DEFAULTS }; + } + + const threshold = + typeof parsed.pass_threshold === 'number' ? parsed.pass_threshold : DEFAULTS.pass_threshold; + + return { + pass_threshold: Math.min(1, Math.max(0, threshold)), + }; +} + +/** + * Save studio config to `config.yaml` in the given `.agentv/` directory. + * Creates the directory if it does not exist. + */ +export function saveStudioConfig(agentvDir: string, config: StudioConfig): void { + if (!existsSync(agentvDir)) { + mkdirSync(agentvDir, { recursive: true }); + } + const configPath = path.join(agentvDir, 'config.yaml'); + const yamlStr = stringifyYaml(config); + writeFileSync(configPath, yamlStr, 'utf-8'); +} diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts index d6b51925..7baec2dc 100644 --- a/apps/cli/src/commands/trace/utils.ts +++ b/apps/cli/src/commands/trace/utils.ts @@ -1,7 +1,7 @@ import { readFileSync, readdirSync, statSync } from 'node:fs'; import path from 'node:path'; import type { EvaluationResult, TraceSummary } from '@agentv/core'; -import { toCamelCaseDeep } from '@agentv/core'; +import { PASS_THRESHOLD, toCamelCaseDeep } from '@agentv/core'; import { RESULT_INDEX_FILENAME, RESULT_RUNS_DIRNAME, @@ -596,7 +596,7 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { const results = loadResultFile(filePath); const testCount = results.length; - const passCount = results.filter((r) => r.score >= 1.0).length; + const passCount = results.filter((r) => r.score >= PASS_THRESHOLD).length; const passRate = testCount > 0 ? passCount / testCount : 0; const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0; diff --git a/apps/cli/test/commands/results/studio-config.test.ts b/apps/cli/test/commands/results/studio-config.test.ts new file mode 100644 index 00000000..54b7cf96 --- /dev/null +++ b/apps/cli/test/commands/results/studio-config.test.ts @@ -0,0 +1,55 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import { PASS_THRESHOLD } from '@agentv/core'; + +import { loadStudioConfig } from '../../../src/commands/results/studio-config.js'; + +describe('loadStudioConfig', () => { + let tempDir: string; + + beforeEach(() => { + tempDir = mkdtempSync(path.join(tmpdir(), 'studio-config-')); + }); + + afterEach(() => { + rmSync(tempDir, { recursive: true, force: true }); + }); + + it('returns defaults when no config.yaml exists', () => { + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(PASS_THRESHOLD); + }); + + it('reads pass_threshold from config.yaml', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: 0.6\n'); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(0.6); + }); + + it('clamps pass_threshold to 0 when negative', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: -0.5\n'); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(0); + }); + + it('clamps pass_threshold to 1 when above 1', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: 1.5\n'); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(1); + }); + + it('returns defaults for empty config.yaml', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), ''); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(PASS_THRESHOLD); + }); + + it('returns defaults when pass_threshold is not a number', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: "high"\n'); + const config = loadStudioConfig(tempDir); + expect(config.pass_threshold).toBe(PASS_THRESHOLD); + }); +}); diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index 420ed976..6efd3905 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -1,14 +1,13 @@ /** - * Three-tab eval detail view: Steps (assertions), Output, and Task (input). + * Two-tab eval detail view: Steps (assertions) and Files (artifact browser). * * Shows the full evaluation result with score breakdown, assertions list, - * and Monaco viewers for output/input content. Output and Task tabs include - * a file tree sidebar when artifact files are available. + * and a file tree browser for artifact files (input, output, grading, timing). */ import { useState } from 'react'; -import { useEvalFileContent, useEvalFiles } from '~/lib/api'; +import { isPassing, useEvalFileContent, useEvalFiles, useStudioConfig } from '~/lib/api'; import type { EvalResult } from '~/lib/types'; import { FeedbackPanel } from './FeedbackPanel'; @@ -22,7 +21,7 @@ interface EvalDetailProps { runId: string; } -type Tab = 'steps' | 'output' | 'task'; +type Tab = 'steps' | 'files'; /** Recursively find the first file node in the tree. */ function findFirstFile(nodes: FileNode[]): string | null { @@ -41,8 +40,7 @@ export function EvalDetail({ eval: result, runId }: EvalDetailProps) { const tabs: { id: Tab; label: string }[] = [ { id: 'steps', label: 'Steps' }, - { id: 'output', label: 'Output' }, - { id: 'task', label: 'Task' }, + { id: 'files', label: 'Files' }, ]; return ( @@ -108,8 +106,7 @@ export function EvalDetail({ eval: result, runId }: EvalDetailProps) { {/* Tab content */}
{activeTab === 'steps' && } - {activeTab === 'output' && } - {activeTab === 'task' && } + {activeTab === 'files' && }
{/* Feedback */} @@ -119,9 +116,13 @@ export function EvalDetail({ eval: result, runId }: EvalDetailProps) { } function StepsTab({ result }: { result: EvalResult }) { + const { data: config } = useStudioConfig(); + const passThreshold = config?.pass_threshold ?? 0.8; const assertions = result.assertions ?? []; const hasFailed = - result.score < 1 || result.executionStatus === 'error' || result.executionStatus === 'failed'; + !isPassing(result.score, passThreshold) || + result.executionStatus === 'error' || + result.executionStatus === 'failed'; // Collect failure reasons from multiple sources const failureReasons: string[] = []; @@ -138,7 +139,7 @@ function StepsTab({ result }: { result: EvalResult }) { // Also check per-evaluator scores for failure details if (result.scores) { for (const s of result.scores) { - if (s.score < 1 && s.details) { + if (!isPassing(s.score, passThreshold) && s.details) { const detailStr = typeof s.details === 'string' ? s.details : JSON.stringify(s.details, null, 2); failureReasons.push(`[${s.name ?? s.type ?? 'evaluator'}] ${detailStr}`); @@ -207,16 +208,14 @@ function StepsTab({ result }: { result: EvalResult }) { ); } -function OutputTab({ result, runId }: { result: EvalResult; runId: string }) { +function FilesTab({ result, runId }: { result: EvalResult; runId: string }) { const evalId = result.testId; const { data: filesData } = useEvalFiles(runId, evalId); const files = filesData?.files ?? []; - const hasFiles = files.length > 0; const [selectedPath, setSelectedPath] = useState(null); - // Resolve effective path: selected, or first file, or null - const effectivePath = selectedPath ?? (hasFiles ? findFirstFile(files) : null); + const effectivePath = selectedPath ?? (files.length > 0 ? findFirstFile(files) : null); const { data: fileContentData, isLoading: isLoadingContent } = useEvalFileContent( runId, @@ -224,69 +223,17 @@ function OutputTab({ result, runId }: { result: EvalResult; runId: string }) { effectivePath ?? '', ); - const output = result.output; - const fallbackText = - output && output.length > 0 ? output.map((m) => `[${m.role}]\n${m.content}`).join('\n\n') : ''; - - if (!hasFiles) { - if (!output || output.length === 0) { - return

No output available.

; - } - return ; - } - - const displayValue = effectivePath - ? isLoadingContent - ? 'Loading...' - : (fileContentData?.content ?? fallbackText) - : fallbackText; - - const displayLanguage = effectivePath ? (fileContentData?.language ?? 'plaintext') : 'markdown'; - - return ( -
- -
- -
-
- ); -} - -function TaskTab({ result, runId }: { result: EvalResult; runId: string }) { - const evalId = result.testId; - const { data: filesData } = useEvalFiles(runId, evalId); - const files = filesData?.files ?? []; - const hasFiles = files.length > 0; - - const [selectedPath, setSelectedPath] = useState(null); - - const effectivePath = selectedPath ?? (hasFiles ? findFirstFile(files) : null); - - const { data: fileContentData, isLoading: isLoadingContent } = useEvalFileContent( - runId, - evalId, - effectivePath ?? '', - ); - - const input = result.input; - const fallbackText = - input && input.length > 0 ? input.map((m) => `[${m.role}]\n${m.content}`).join('\n\n') : ''; - - if (!hasFiles) { - if (!input || input.length === 0) { - return

No task input available.

; - } - return ; + if (files.length === 0) { + return

No artifact files available.

; } const displayValue = effectivePath ? isLoadingContent ? 'Loading...' - : (fileContentData?.content ?? fallbackText) - : fallbackText; + : (fileContentData?.content ?? '') + : ''; - const displayLanguage = effectivePath ? (fileContentData?.language ?? 'plaintext') : 'markdown'; + const displayLanguage = effectivePath ? (fileContentData?.language ?? 'plaintext') : 'plaintext'; return (
diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 01961138..3162dd47 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -10,6 +10,7 @@ import { useState } from 'react'; import type { EvalResult } from '~/lib/types'; +import { isPassing, useStudioConfig } from '~/lib/api'; import { ScoreBar } from './ScoreBar'; import { StatsCards } from './StatsCards'; @@ -35,7 +36,7 @@ interface CategoryGroup { avgScore: number; } -function buildCategoryGroups(results: EvalResult[]): CategoryGroup[] { +function buildCategoryGroups(results: EvalResult[], passThreshold: number): CategoryGroup[] { const categoryMap = new Map< string, Map @@ -50,7 +51,7 @@ function buildCategoryGroups(results: EvalResult[]): CategoryGroup[] { const entry = dsMap.get(ds) ?? { passed: 0, failed: 0, total: 0, scoreSum: 0 }; entry.total += 1; entry.scoreSum += r.score; - if (r.score >= 1) entry.passed += 1; + if (isPassing(r.score, passThreshold)) entry.passed += 1; else entry.failed += 1; dsMap.set(ds, entry); } @@ -83,13 +84,16 @@ function buildCategoryGroups(results: EvalResult[]): CategoryGroup[] { } export function RunDetail({ results, runId }: RunDetailProps) { + const { data: config } = useStudioConfig(); + const passThreshold = config?.pass_threshold ?? 0.8; + const total = results.length; - const passed = results.filter((r) => r.score >= 1).length; + const passed = results.filter((r) => isPassing(r.score, passThreshold)).length; const failed = total - passed; const passRate = total > 0 ? passed / total : 0; const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); - const categories = buildCategoryGroups(results); + const categories = buildCategoryGroups(results, passThreshold); const hasMultipleCategories = categories.length > 1; if (total === 0) { diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 3ca1d43e..0210c6b1 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -10,7 +10,14 @@ import { Link, useMatchRoute } from '@tanstack/react-router'; -import { useCategoryDatasets, useExperiments, useRunDetail, useRunList } from '~/lib/api'; +import { + isPassing, + useCategoryDatasets, + useExperiments, + useRunDetail, + useRunList, + useStudioConfig, +} from '~/lib/api'; export function Sidebar() { const matchRoute = useMatchRoute(); @@ -103,12 +110,24 @@ function RunSidebar() { ); })} + + {/* Settings link at bottom */} +
+ + Settings + +
); } function EvalSidebar({ runId, currentEvalId }: { runId: string; currentEvalId: string }) { const { data } = useRunDetail(runId); + const { data: config } = useStudioConfig(); + const passThreshold = config?.pass_threshold ?? 0.8; return (
{datasetResults.map((result) => { - const passed = result.score >= 1; + const passed = isPassing(result.score, passThreshold); return ( fetchJson('/api/config'), + staleTime: 5_000, +}); + // ── Hooks ─────────────────────────────────────────────────────────────── export function useRunList() { @@ -175,3 +182,28 @@ export function useRunCategories(runId: string) { export function useCategoryDatasets(runId: string, category: string) { return useQuery(categoryDatasetsOptions(runId, category)); } + +export function useStudioConfig() { + return useQuery(studioConfigOptions); +} + +/** Default pass threshold matching @agentv/core PASS_THRESHOLD */ +export const DEFAULT_PASS_THRESHOLD = 0.8; + +export function isPassing(score: number, passThreshold: number = DEFAULT_PASS_THRESHOLD): boolean { + return score >= passThreshold; +} + +export async function saveStudioConfig( + config: Partial, +): Promise { + const res = await fetch('/api/config', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(config), + }); + if (!res.ok) { + throw new Error(`Failed to save config: ${res.status}`); + } + return res.json() as Promise; +} diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 4c1ef6f0..6555dbe7 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -164,3 +164,7 @@ export interface CategorySummary { export interface CategoriesResponse { categories: CategorySummary[]; } + +export interface StudioConfigResponse { + pass_threshold: number; +} diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts index 118360b3..eed4c15d 100644 --- a/apps/studio/src/routeTree.gen.ts +++ b/apps/studio/src/routeTree.gen.ts @@ -9,6 +9,7 @@ // Additionally, you should also exclude this file from your linter and/or formatter to prevent it from being checked or modified. import { Route as rootRouteImport } from './routes/__root' +import { Route as SettingsRouteImport } from './routes/settings' import { Route as IndexRouteImport } from './routes/index' import { Route as RunsRunIdRouteImport } from './routes/runs/$runId' import { Route as ExperimentsExperimentNameRouteImport } from './routes/experiments/$experimentName' @@ -16,6 +17,11 @@ import { Route as EvalsRunIdEvalIdRouteImport } from './routes/evals/$runId.$eva import { Route as RunsRunIdDatasetDatasetRouteImport } from './routes/runs/$runId_.dataset.$dataset' import { Route as RunsRunIdCategoryCategoryRouteImport } from './routes/runs/$runId_.category.$category' +const SettingsRoute = SettingsRouteImport.update({ + id: '/settings', + path: '/settings', + getParentRoute: () => rootRouteImport, +} as any) const IndexRoute = IndexRouteImport.update({ id: '/', path: '/', @@ -51,6 +57,7 @@ const RunsRunIdCategoryCategoryRoute = export interface FileRoutesByFullPath { '/': typeof IndexRoute + '/settings': typeof SettingsRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute @@ -59,6 +66,7 @@ export interface FileRoutesByFullPath { } export interface FileRoutesByTo { '/': typeof IndexRoute + '/settings': typeof SettingsRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute @@ -68,6 +76,7 @@ export interface FileRoutesByTo { export interface FileRoutesById { __root__: typeof rootRouteImport '/': typeof IndexRoute + '/settings': typeof SettingsRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute @@ -78,6 +87,7 @@ export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath fullPaths: | '/' + | '/settings' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' @@ -86,6 +96,7 @@ export interface FileRouteTypes { fileRoutesByTo: FileRoutesByTo to: | '/' + | '/settings' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' @@ -94,6 +105,7 @@ export interface FileRouteTypes { id: | '__root__' | '/' + | '/settings' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' @@ -103,6 +115,7 @@ export interface FileRouteTypes { } export interface RootRouteChildren { IndexRoute: typeof IndexRoute + SettingsRoute: typeof SettingsRoute ExperimentsExperimentNameRoute: typeof ExperimentsExperimentNameRoute RunsRunIdRoute: typeof RunsRunIdRoute EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute @@ -112,6 +125,13 @@ export interface RootRouteChildren { declare module '@tanstack/react-router' { interface FileRoutesByPath { + '/settings': { + id: '/settings' + path: '/settings' + fullPath: '/settings' + preLoaderRoute: typeof SettingsRouteImport + parentRoute: typeof rootRouteImport + } '/': { id: '/' path: '/' @@ -159,6 +179,7 @@ declare module '@tanstack/react-router' { const rootRouteChildren: RootRouteChildren = { IndexRoute: IndexRoute, + SettingsRoute: SettingsRoute, ExperimentsExperimentNameRoute: ExperimentsExperimentNameRoute, RunsRunIdRoute: RunsRunIdRoute, EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute, diff --git a/apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx b/apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx index 6a50243d..0b9722e0 100644 --- a/apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx +++ b/apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx @@ -10,7 +10,7 @@ import { Link, createFileRoute } from '@tanstack/react-router'; import { ScoreBar } from '~/components/ScoreBar'; import { StatsCards } from '~/components/StatsCards'; -import { useRunDetail } from '~/lib/api'; +import { isPassing, useRunDetail, useStudioConfig } from '~/lib/api'; export const Route = createFileRoute('/runs/$runId_/dataset/$dataset')({ component: DatasetPage, @@ -19,6 +19,8 @@ export const Route = createFileRoute('/runs/$runId_/dataset/$dataset')({ function DatasetPage() { const { runId, dataset } = Route.useParams(); const { data, isLoading, error } = useRunDetail(runId); + const { data: config } = useStudioConfig(); + const passThreshold = config?.pass_threshold ?? 0.8; if (isLoading) { return ( @@ -43,7 +45,7 @@ function DatasetPage() { const results = (data?.results ?? []).filter((r) => (r.dataset ?? 'Uncategorized') === dataset); const total = results.length; - const passed = results.filter((r) => r.score >= 1).length; + const passed = results.filter((r) => isPassing(r.score, passThreshold)).length; const failed = total - passed; const passRate = total > 0 ? passed / total : 0; const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); diff --git a/apps/studio/src/routes/settings.tsx b/apps/studio/src/routes/settings.tsx new file mode 100644 index 00000000..d404192a --- /dev/null +++ b/apps/studio/src/routes/settings.tsx @@ -0,0 +1,126 @@ +/** + * Settings page for configuring AgentV Studio behavior. + * + * Reads and writes to .agentv/config.yaml via the /api/config endpoint. + * Changes take effect immediately on page refresh. + */ + +import { useQueryClient } from '@tanstack/react-query'; +import { createFileRoute } from '@tanstack/react-router'; +import { useState } from 'react'; + +import { DEFAULT_PASS_THRESHOLD, saveStudioConfig, useStudioConfig } from '~/lib/api'; + +export const Route = createFileRoute('/settings')({ + component: SettingsPage, +}); + +function SettingsPage() { + const { data: config, isLoading } = useStudioConfig(); + const queryClient = useQueryClient(); + const [threshold, setThreshold] = useState(''); + const [saving, setSaving] = useState(false); + const [message, setMessage] = useState<{ type: 'success' | 'error'; text: string } | null>(null); + + const currentThreshold = config?.pass_threshold ?? DEFAULT_PASS_THRESHOLD; + const displayThreshold = threshold || String(currentThreshold); + + const handleSave = async () => { + const value = Number.parseFloat(threshold || String(currentThreshold)); + if (Number.isNaN(value) || value < 0 || value > 1) { + setMessage({ type: 'error', text: 'Threshold must be a number between 0 and 1' }); + return; + } + + setSaving(true); + setMessage(null); + try { + await saveStudioConfig({ pass_threshold: value }); + await queryClient.invalidateQueries({ queryKey: ['config'] }); + setThreshold(''); + setMessage({ type: 'success', text: 'Settings saved' }); + setTimeout(() => setMessage(null), 3000); + } catch { + setMessage({ type: 'error', text: 'Failed to save settings' }); + } finally { + setSaving(false); + } + }; + + if (isLoading) { + return ( +
+
+
+
+ ); + } + + return ( +
+
+

Settings

+

Configure your AgentV Studio dashboard

+
+ + {/* Pass Threshold Card */} +
+

Evaluation

+

+ Configure how evaluation results are classified +

+ +
+
+ +

+ Score at or above this value is considered passing. Default: {DEFAULT_PASS_THRESHOLD} +

+
+ setThreshold(e.target.value)} + className="w-32 rounded-md border border-gray-700 bg-gray-800 px-3 py-2 text-sm text-white placeholder-gray-500 focus:border-cyan-500 focus:outline-none focus:ring-1 focus:ring-cyan-500" + /> + + ({Math.round((Number.parseFloat(displayThreshold) || 0) * 100)}%) + +
+
+
+ +
+ + {message && ( + + {message.text} + + )} +
+
+ + {/* Config file info */} +
+

+ Settings are stored in .agentv/config.yaml +

+
+
+ ); +}