From 5e423a349cd155dfb9cbe09940bdfba02e108bfa Mon Sep 17 00:00:00 2001 From: Vasyl Vdovychenko Date: Mon, 15 Jun 2026 23:49:50 -0400 Subject: [PATCH] feat(ai): admin view-transcript UI for crew/agent runs (AI-045) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes the multi-agent reasoning chain inspectable. Read-only over the existing Phase 6 agent_run — no schema change, no new table. - Backend: GET /admin/ai-quality/agent-runs (list, filter by agent, newest-first, omits heavy StepsJson/Output, Goal truncated) + /agent-runs/{id} (detail, raw StepsJson). Mirrors GetTraces/GetTrace; StepsJson passed through raw, parsed client-side. - Frontend: Transcripts tab on AiQualityPage (list+modal, modeled on TracesTab/TraceModal). Parses the step tree; critic step special-cased (score chips + severity-colored issues), default-expanded. Defensive — malformed/non-JSON steps fall back to raw, never throw. - Casing contract is load-bearing (mismatch = blank transcript): DbAgentRunWriter uses default STJ → mixed Pascal/camel. Frontend reads each key exactly; CrewTranscriptJsonContractTests locks it with negative asserts that fail if anyone adds web/camelCase options. 407 unit + admin tsc/build green. Per-description deep-link = fast-follow. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 9 + apps/admin/src/api/client.ts | 49 +++ apps/admin/src/pages/AiQualityPage.tsx | 382 +++++++++++++++++- .../Api/Endpoints/AdminAiQualityEndpoints.cs | 64 +++ backend/src/Contracts/Admin/AiQualityDtos.cs | 39 ++ .../AdminAgentRunsEndpointTests.cs | 120 ++++++ .../CrewTranscriptJsonContractTests.cs | 133 ++++++ 7 files changed, 788 insertions(+), 8 deletions(-) create mode 100644 tests/TextStack.IntegrationTests/AdminAgentRunsEndpointTests.cs create mode 100644 tests/TextStack.UnitTests/CrewTranscriptJsonContractTests.cs diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bbd71ba..92c9a909 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ ## [Unreleased] +### Phase 7 — admin "view transcript" UI for crew/agent runs (AI-045) (2026-06-15) + +Makes the multi-agent reasoning chain inspectable. Every crew run (`crew.autopublish`/`crew.seo`) and single-agent run (`studybuddy`) already persists an `agent_run` row with a nested-step transcript (researcher→drafter→critic→editor); AI-045 surfaces it in the admin app. **No schema change, no new table** — read-only over the existing Phase 6 `agent_run`. + +- **Backend** (`Api/Endpoints/AdminAiQualityEndpoints.cs`, mirrors the existing `GetTraces`/`GetTrace`): `GET /admin/ai-quality/agent-runs?agent=&limit=&offset=` (list, newest-first, `agent` filter = exact-or-prefix so `crew.` narrows to all crew runs; clamp 1–100; **list projection omits the heavy `StepsJson`+`Output`** and truncates `Goal` to 120 chars — guarded `Length > 120 ? Goal[..120] : Goal`) and `GET /admin/ai-quality/agent-runs/{id}` (detail, full **raw** `StepsJson` + `Output`, 404 on unknown). DTOs in `Contracts/Admin/AiQualityDtos.cs` (`AgentRunListItemDto`/`AgentRunsPageDto`/`AgentRunDetailDto`). `StepsJson` is passed through RAW and parsed client-side — same pattern as `TraceDetailDto`'s `MessagesJson`/`ToolCallsJson`; no brittle second server-side schema. 
+- **Frontend** (`apps/admin/src/pages/AiQualityPage.tsx`, new `Transcripts` tab modeled on `TracesTab`/`TraceModal`): filterable list (All / `crew.autopublish` / `crew.seo` / `studybuddy`) + pager → click row → modal. The modal parses `stepsJson` into the step tree: each `sub_agent` step is a collapsible `{stage} · {agentName}` panel with its per-step usage; the **critic** panel is special-cased + default-expanded — its inner `llm_response` JSON is rendered as score chips (factual_accuracy/tone/length/banned_phrases) + a severity-colored issue list (blocker red / major amber / minor gray). Single-agent steps (`llm_response`/`tool_result`) render via `pretty()`. Every layer is defensive — a malformed/empty/non-JSON step falls back to raw and never throws. +- **The JSON casing contract is the load-bearing risk** (a mismatch = blank transcript). The writer `DbAgentRunWriter` serializes `run.Steps` with **default** STJ options (unaffected by the HTTP pipeline's web-defaults), so the shape is mixed: top-level `AgentStep` records → PascalCase (`Index`/`Kind`/`Payload`/`At`), the `sub_agent` payload anon-object → camelCase (`stage`/`agentName`/`status`/`usage`/`steps`), the nested `AgentUsage` record → PascalCase again, inner `llm_response` payload `new { text }` → camelCase. The frontend reads each exactly. **`CrewTranscriptJsonContractTests`** (pure, no DB) serializes a real 4-stage `crew.autopublish` run through the same factory+options and asserts every key the UI depends on, **with negative asserts** so the test fails loudly if anyone ever puts camelCase/web options on the writer. +- Entry points from the AutoPublish/SEO pages (deep-link by the runIds those `crew-generate` endpoints already return) are a deliberate fast-follow; this PR is the self-contained AI-quality tab. Admin-only (`/admin/*` auth) — the `Goal` can carry user passages / book source, acceptable for the owner's own audit surface. 
+ ### Phase 7 — synthetic-defect critic harness (AI-044) (2026-06-15) A calibration gate for the AI-041 `CriticAgent`: instead of trusting that the critic *would* catch a bad draft, we inject KNOWN defects into clean drafts and measure whether it actually does. ~23 fixtures over a single edition-description `ContentBrief` (`AutoPublishBriefs.Description("en")` — real 800–1600 char bounds + the shared `CrewBannedPhrases` blocklist): factual_hallucination ×6, banned_phrase ×4, length over/under ×2+2, tone_break ×4, plus 5 clean controls. The harness mirrors `ToolCallEvalRunner` exactly — real nano per case, **pure deterministic scoring, no judge**, `JudgeModelId="n/a"`, persists a reused `EvalRun` (**no schema change**), Score = catch-rate. diff --git a/apps/admin/src/api/client.ts b/apps/admin/src/api/client.ts index 2c6f8248..d43db026 100644 --- a/apps/admin/src/api/client.ts +++ b/apps/admin/src/api/client.ts @@ -485,6 +485,42 @@ export interface TraceDetail { userId: string | null createdAt: string } +export interface AgentRunListItem { + id: string + agent: string + userId?: string | null + editionId?: string | null + status: string + goal: string + iterations: number + tokensIn: number + tokensOut: number + costUsd: number + latencyMs: number + hasError: boolean + createdAt: string +} +export interface AgentRunsPage { + total: number + items: AgentRunListItem[] +} +export interface AgentRunDetail { + id: string + agent: string + userId?: string | null + editionId?: string | null + status: string + goal: string + output?: string | null + stepsJson: string + iterations: number + tokensIn: number + tokensOut: number + costUsd: number + latencyMs: number + error?: string | null + createdAt: string +} export interface EvalRun { id: string feature: string @@ -1107,6 +1143,19 @@ export const adminApi = { return fetchJson(`/admin/ai-quality/traces/${id}`) }, + getAgentRuns: async (params?: { agent?: string; limit?: number; offset?: number }): Promise => { + const query 
= new URLSearchParams() + if (params?.agent) query.set('agent', params.agent) + if (params?.limit) query.set('limit', String(params.limit)) + if (params?.offset) query.set('offset', String(params.offset)) + const qs = query.toString() + return fetchJson(`/admin/ai-quality/agent-runs${qs ? `?${qs}` : ''}`) + }, + + getAgentRun: async (id: string): Promise => { + return fetchJson(`/admin/ai-quality/agent-runs/${id}`) + }, + getAiEvals: async (params?: { feature?: string; limit?: number }): Promise => { const query = new URLSearchParams() if (params?.feature) query.set('feature', params.feature) diff --git a/apps/admin/src/pages/AiQualityPage.tsx b/apps/admin/src/pages/AiQualityPage.tsx index 1d0ab0aa..de5a3ee5 100644 --- a/apps/admin/src/pages/AiQualityPage.tsx +++ b/apps/admin/src/pages/AiQualityPage.tsx @@ -6,11 +6,13 @@ import { DailyCostPoint, TraceListItem, TraceDetail, + AgentRunListItem, + AgentRunDetail, EvalRun, CriticDefectEvalResult, } from '../api/client' -type Tab = 'summary' | 'traces' | 'evals' +type Tab = 'summary' | 'traces' | 'transcripts' | 'evals' const KNOWN_FEATURES = ['explain', 'translate', 'distractor', 'bookmeta', 'tagsuggestion', 'eval.judge'] @@ -20,7 +22,7 @@ export function AiQualityPage() {

AI Quality

- {(['summary', 'traces', 'evals'] as Tab[]).map((t) => ( + {(['summary', 'traces', 'transcripts', 'evals'] as Tab[]).map((t) => (
{tab === 'summary' && } {tab === 'traces' && } + {tab === 'transcripts' && } {tab === 'evals' && }
) @@ -318,6 +321,357 @@ function Section({ title, body }: { title: string; body: string | null }) { ) } +// ─────────────────────────── Transcripts ─────────────────────────── + +const RUN_PAGE = 25 +const AGENT_FILTERS = ['crew.autopublish', 'crew.seo', 'studybuddy'] + +function isErrorStatus(status: string, hasError?: boolean): boolean { + return hasError === true || status === 'error' || status === 'budget_exhausted' +} + +function TranscriptsTab() { + const [items, setItems] = useState([]) + const [total, setTotal] = useState(0) + const [agent, setAgent] = useState('') + const [offset, setOffset] = useState(0) + const [loading, setLoading] = useState(true) + const [error, setError] = useState(null) + const [selected, setSelected] = useState(null) + + useEffect(() => { + setLoading(true) + adminApi + .getAgentRuns({ agent: agent || undefined, limit: RUN_PAGE, offset }) + .then((d) => { + setItems(d.items) + setTotal(d.total) + setError(null) + }) + .catch((e) => setError(e instanceof Error ? e.message : 'Failed to load')) + .finally(() => setLoading(false)) + }, [agent, offset]) + + const openRun = async (id: string) => { + try { + setSelected(await adminApi.getAgentRun(id)) + } catch (e) { + setError(e instanceof Error ? e.message : 'Failed to load transcript') + } + } + + return ( + <> +
+ +
+ + {error && } + + {loading ? ( +

Loading…

+ ) : items.length === 0 ? ( +

No agent runs match. (Crew/agent transcripts appear after the app runs an agent.)

+ ) : ( + <> + + + + + + + + + + + + + {items.map((r) => { + const err = isErrorStatus(r.status, r.hasError) + return ( + openRun(r.id)} + style={{ cursor: 'pointer', borderBottom: '1px solid #f3f4f6', background: err ? '#fef2f2' : undefined }} + > + + + + + + + + ) + })} + +
AgentStatusGoalCostTokensWhen
{r.agent}{err && }{r.status}{r.goal}${r.costUsd.toFixed(4)}{r.tokensIn}/{r.tokensOut}{timeAgo(r.createdAt)}
+ + + )} + + {selected && setSelected(null)} />} + + ) +} + +function statusColor(status: string, hasError?: boolean): string { + if (isErrorStatus(status, hasError)) return '#dc2626' + if (status === 'completed') return '#059669' + return '#6b7280' +} + +function TranscriptModal({ run, onClose }: { run: AgentRunDetail; onClose: () => void }) { + return ( +
+
e.stopPropagation()} style={modal}> +
+

+ {run.agent} + + {run.status} + +

+ +
+
+ + + + + +
+ {run.error && } +
+
+ +
+
+ ) +} + +// PascalCase shapes from the backend transcript JSON. +interface AgentStep { + Index?: number + Kind?: string + Payload?: unknown + At?: string +} +interface SubAgentUsage { + Iterations?: number + InputTokensTotal?: number + OutputTokensTotal?: number + CostUsdTotal?: number + LatencyMs?: number +} +interface SubAgentPayload { + stage?: string + agentName?: string + status?: string + usage?: SubAgentUsage + steps?: AgentStep[] +} + +function StepTree({ stepsJson }: { stepsJson: string }) { + let steps: AgentStep[] + try { + const parsed = JSON.parse(stepsJson) + if (!Array.isArray(parsed)) throw new Error('not an array') + steps = parsed as AgentStep[] + } catch { + return
+ } + return ( +
+
Steps
+
+ {steps.map((step, i) => ( + + ))} +
+
+ ) +} + +function StepNode({ step }: { step: AgentStep }) { + try { + if (step.Kind === 'sub_agent') { + return + } + return ( +
+
{step.Kind ?? 'step'}
+
{pretty(JSON.stringify(step.Payload))}
+
+ ) + } catch { + return ( +
+
step
+
{pretty(JSON.stringify(step))}
+
+ ) + } +} + +function SubAgentPanel({ payload }: { payload: SubAgentPayload }) { + const p = payload ?? {} + const isCritic = p.agentName === 'critic' + const [open, setOpen] = useState(isCritic) // critic default-expanded, others collapsed + const usage = p.usage ?? {} + const title = `${p.stage ?? '?'} · ${p.agentName ?? '?'}` + return ( +
+ + {open && ( + <> +
+ + + + +
+ + + )} +
+ ) +} + +function SubAgentBody({ payload, isCritic }: { payload: SubAgentPayload; isCritic: boolean }) { + const steps = Array.isArray(payload.steps) ? payload.steps : [] + if (isCritic) { + const text = firstLlmResponseText(steps) + if (text != null) { + return + } + } + if (steps.length === 0) { + return
{pretty(JSON.stringify(payload))}
+ } + return ( +
+ {steps.map((s, i) => { + try { + const inner = s.Payload as { text?: string } | undefined + const body = inner && typeof inner.text === 'string' ? inner.text : pretty(JSON.stringify(s.Payload)) + return ( +
+
{s.Kind ?? 'step'}
+
{body}
+
+ ) + } catch { + return
{pretty(JSON.stringify(s))}
+ } + })} +
+ ) +} + +function firstLlmResponseText(steps: AgentStep[]): string | null { + for (const s of steps) { + if (s.Kind === 'llm_response') { + const inner = s.Payload as { text?: string } | undefined + if (inner && typeof inner.text === 'string') return inner.text + } + } + // fall back to any step that carries a text payload + for (const s of steps) { + const inner = s.Payload as { text?: string } | undefined + if (inner && typeof inner.text === 'string') return inner.text + } + return null +} + +interface CriticScores { + factual_accuracy?: number + tone?: number + length?: number + banned_phrases?: number +} +interface CriticIssue { + severity?: string + axis?: string + message?: string + detail?: string +} + +function CriticReview({ text }: { text: string }) { + let parsed: { scores?: CriticScores; issues?: CriticIssue[] } + try { + parsed = JSON.parse(text) + } catch { + return
{pretty(text)}
+ } + const scores = parsed.scores ?? {} + const issues = Array.isArray(parsed.issues) ? parsed.issues : [] + return ( +
+
+ + + + +
+ {issues.length > 0 && ( +
+ {issues.map((iss, i) => ( +
+ + {iss.severity ?? 'issue'} + + {iss.axis && {iss.axis}} +
{iss.message ?? iss.detail ?? ''}
+
+ ))} +
+ )} +
+ ) +} + +function fmtScore(v: number | undefined): string { + return v == null ? '—' : String(v) +} + +function severityColor(severity?: string): string { + if (severity === 'blocker') return '#dc2626' + if (severity === 'major') return '#d97706' + return '#6b7280' // minor / unknown +} + // ─────────────────────────── Evals ─────────────────────────── function EvalsTab() { @@ -484,18 +838,30 @@ function EvalsTab() { // ─────────────────────────── shared ─────────────────────────── -function Pager({ offset, total, onChange }: { offset: number; total: number; onChange: (o: number) => void }) { - const page = Math.floor(offset / PAGE) + 1 - const pages = Math.max(1, Math.ceil(total / PAGE)) +function Pager({ + offset, + total, + onChange, + pageSize = PAGE, + label = 'traces', +}: { + offset: number + total: number + onChange: (o: number) => void + pageSize?: number + label?: string +}) { + const page = Math.floor(offset / pageSize) + 1 + const pages = Math.max(1, Math.ceil(total / pageSize)) return (
- - Page {page} / {pages} · {total} traces + Page {page} / {pages} · {total} {label} -
diff --git a/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs b/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs
index 09fe117f..6122a1f3 100644
--- a/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs
+++ b/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs
@@ -25,6 +25,8 @@ public static void MapAdminAiQualityEndpoints(this WebApplication app)
         group.MapGet("/summary", GetSummary);
         group.MapGet("/traces", GetTraces);
         group.MapGet("/traces/{id:guid}", GetTrace);
+        group.MapGet("/agent-runs", GetAgentRuns);
+        group.MapGet("/agent-runs/{id:guid}", GetAgentRun);
         group.MapGet("/evals", GetEvals);
         group.MapPost("/evals/run", RunEvals);
         group.MapGet("/evals/status", GetEvalStatus);
@@ -260,6 +262,68 @@ private static async Task<IResult> GetTraces(
         return Results.Ok(new TracesPageDto(total, items));
     }
 
+    // AI-045: list persisted agent_run rows for the admin transcript UI. Mirrors GetTraces
+    // (newest-first, clamp, paged); Id breaks CreatedAt ties so offset paging stays stable. The
+    // list projection omits StepsJson + Output (both big) and truncates Goal to 120 chars. The
+    // `agent` filter matches exact OR prefix ("crew." = all crew runs, "studybuddy" = that one).
+    private static async Task<IResult> GetAgentRuns(
+        AppDbContext db,
+        [FromQuery] string? agent,
+        [FromQuery] int limit = 25,
+        [FromQuery] int offset = 0,
+        CancellationToken ct = default)
+    {
+        limit = Math.Clamp(limit, 1, 100);
+        offset = Math.Max(offset, 0);
+
+        var query = db.AgentRuns.AsQueryable();
+        if (!string.IsNullOrWhiteSpace(agent))
+        {
+            var a = agent.Trim();
+            query = query.Where(r => r.Agent == a || r.Agent.StartsWith(a));
+        }
+
+        var total = await query.LongCountAsync(ct);
+        var rows = await query
+            .OrderByDescending(r => r.CreatedAt).ThenByDescending(r => r.Id)
+            .Skip(offset).Take(limit)
+            .Select(r => new
+            {
+                r.Id,
+                r.Agent,
+                r.UserId,
+                r.EditionId,
+                r.Status,
+                r.Goal,
+                r.Iterations,
+                r.TokensIn,
+                r.TokensOut,
+                r.CostUsd,
+                r.LatencyMs,
+                HasError = r.Error != null && r.Error != "",
+                r.CreatedAt,
+            })
+            .ToListAsync(ct);
+
+        var items = rows.Select(r => new AgentRunListItemDto(
+            r.Id, r.Agent, r.UserId, r.EditionId, r.Status,
+            r.Goal.Length > 120 ? r.Goal[..120] : r.Goal,
+            r.Iterations, r.TokensIn, r.TokensOut, r.CostUsd, r.LatencyMs,
+            r.HasError, r.CreatedAt)).ToList();
+
+        return Results.Ok(new AgentRunsPageDto(total, items));
+    }
+
+    // AI-045: full transcript for one agent run (StepsJson shipped RAW; frontend parses).
+    private static async Task<IResult> GetAgentRun(Guid id, AppDbContext db, CancellationToken ct)
+    {
+        var r = await db.AgentRuns.FirstOrDefaultAsync(x => x.Id == id, ct);
+        if (r is null) return Results.NotFound();
+        return Results.Ok(new AgentRunDetailDto(
+            r.Id, r.Agent, r.UserId, r.EditionId, r.Status, r.Goal, r.Output, r.StepsJson,
+            r.Iterations, r.TokensIn, r.TokensOut, r.CostUsd, r.LatencyMs, r.Error, r.CreatedAt));
+    }
+
     private static async Task<IResult> GetTrace(Guid id, AppDbContext db, CancellationToken ct)
     {
         var t = await db.LlmTraces.FirstOrDefaultAsync(x => x.Id == id, ct);
diff --git a/backend/src/Contracts/Admin/AiQualityDtos.cs b/backend/src/Contracts/Admin/AiQualityDtos.cs
index efd9f1a1..dd334151 100644
--- a/backend/src/Contracts/Admin/AiQualityDtos.cs
+++ b/backend/src/Contracts/Admin/AiQualityDtos.cs
@@ -57,6 +57,45 @@ public record TraceDetailDto(
     Guid? UserId,
     DateTimeOffset CreatedAt);
 
+/// <summary>One row in the Transcripts tab table (AI-045). Omits the heavy StepsJson + Output;
+/// Goal is truncated for the list. HasError = the run recorded a non-empty Error.</summary>
+public record AgentRunListItemDto(
+    Guid Id,
+    string Agent,
+    Guid? UserId,
+    Guid? EditionId,
+    string Status,
+    string Goal,
+    int Iterations,
+    int TokensIn,
+    int TokensOut,
+    decimal CostUsd,
+    int LatencyMs,
+    bool HasError,
+    DateTimeOffset CreatedAt);
+
+/// <summary>Paged agent-run list for the Transcripts tab; Total counts all rows matching the filter.</summary>
+public record AgentRunsPageDto(long Total, IReadOnlyList<AgentRunListItemDto> Items);
+
+/// <summary>Full agent run for the transcript drill-in (AI-045). StepsJson is the RAW jsonb
+/// string (crew runs carry nested sub_agent steps); the frontend parses it.</summary>
+public record AgentRunDetailDto(
+    Guid Id,
+    string Agent,
+    Guid? UserId,
+    Guid? EditionId,
+    string Status,
+    string Goal,
+    string? Output,
+    string StepsJson,
+    int Iterations,
+    int TokensIn,
+    int TokensOut,
+    decimal CostUsd,
+    int LatencyMs,
+    string? Error,
+    DateTimeOffset CreatedAt);
+
 /// <summary>Request body for triggering an eval run from the admin panel.</summary>
public record RunEvalsRequest(string[]? Features, string? Judge); diff --git a/tests/TextStack.IntegrationTests/AdminAgentRunsEndpointTests.cs b/tests/TextStack.IntegrationTests/AdminAgentRunsEndpointTests.cs new file mode 100644 index 00000000..90ab0248 --- /dev/null +++ b/tests/TextStack.IntegrationTests/AdminAgentRunsEndpointTests.cs @@ -0,0 +1,120 @@ +using System.Net; +using System.Net.Http.Json; +using System.Text.Json; + +namespace TextStack.IntegrationTests; + +/// +/// Integration tests for the admin agent-run transcript endpoints (AI-045), against the live API on +/// the admin host (textstack.dev). Mirrors the Study Buddy live-server pattern: auth + validation + +/// not-found paths are asserted without needing seeded rows (an empty agent_run table still returns a +/// well-formed page), and the list/detail shapes are sanity-checked. A run requires admin auth via the +/// fixture cookie (test-login); when unavailable, the relevant tests SKIP rather than false-pass. +/// +/// To run: requires `docker compose up` (API on :8080) with `ENABLE_TEST_AUTH=true`. The fixture's +/// test user must be an admin for the authed assertions; otherwise AdminAuth returns 401/403 and those +/// tests are skipped via the IsAuthenticated / status guards. 
+/// +public class AdminAgentRunsEndpointTests : IClassFixture +{ + private readonly AuthenticatedApiFixture _fixture; + + public AdminAgentRunsEndpointTests(AuthenticatedApiFixture fixture) => _fixture = fixture; + + [Fact] + public async Task GetAgentRuns_NoAuth_Unauthorized() + { + var request = new HttpRequestMessage(HttpMethod.Get, "/admin/ai-quality/agent-runs"); + request.Headers.Host = AuthenticatedApiFixture.AdminHost; + + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(response.StatusCode is HttpStatusCode.NotFound, "endpoint not deployed"); + Assert.Equal(HttpStatusCode.Unauthorized, response.StatusCode); + } + + [Fact] + public async Task GetAgentRuns_Authed_ReturnsPagedShape_ListOmitsStepsJson() + { + Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable"); + + var request = _fixture.CreateAdminRequest(HttpMethod.Get, "/admin/ai-quality/agent-runs?limit=5"); + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(IntegrationSkip.Unavailable(response), "endpoint not deployed"); + // Test user may not be admin in this environment → AdminAuth rejects; skip rather than fail. + Assert.SkipWhen( + response.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden, + "test user is not admin"); + Assert.Equal(HttpStatusCode.OK, response.StatusCode); + + using var doc = JsonDocument.Parse( + await response.Content.ReadAsStringAsync(TestContext.Current.CancellationToken)); + var root = doc.RootElement; + + Assert.True(root.TryGetProperty("total", out _), "page has total"); + Assert.True(root.TryGetProperty("items", out var items), "page has items"); + Assert.Equal(JsonValueKind.Array, items.ValueKind); + + // List items must NOT carry the heavy transcript / final output. 
+ foreach (var item in items.EnumerateArray()) + { + Assert.False(item.TryGetProperty("stepsJson", out _), "list item must omit stepsJson"); + Assert.False(item.TryGetProperty("output", out _), "list item must omit output"); + Assert.True(item.TryGetProperty("agent", out _)); + Assert.True(item.TryGetProperty("hasError", out _)); + } + } + + [Fact] + public async Task GetAgentRuns_AgentFilter_ReturnsOnlyMatching() + { + Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable"); + + var request = _fixture.CreateAdminRequest( + HttpMethod.Get, "/admin/ai-quality/agent-runs?agent=crew.autopublish&limit=10"); + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(IntegrationSkip.Unavailable(response), "endpoint not deployed"); + Assert.SkipWhen( + response.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden, + "test user is not admin"); + Assert.Equal(HttpStatusCode.OK, response.StatusCode); + + using var doc = JsonDocument.Parse( + await response.Content.ReadAsStringAsync(TestContext.Current.CancellationToken)); + foreach (var item in doc.RootElement.GetProperty("items").EnumerateArray()) + { + var agent = item.GetProperty("agent").GetString(); + Assert.StartsWith("crew.autopublish", agent); + } + } + + [Fact] + public async Task GetAgentRun_NoAuth_Unauthorized() + { + var request = new HttpRequestMessage( + HttpMethod.Get, $"/admin/ai-quality/agent-runs/{Guid.NewGuid()}"); + request.Headers.Host = AuthenticatedApiFixture.AdminHost; + + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(response.StatusCode is HttpStatusCode.NotFound, "endpoint not deployed"); + Assert.Equal(HttpStatusCode.Unauthorized, response.StatusCode); + } + + [Fact] + public async Task GetAgentRun_AuthedUnknownId_NotFound() + { + Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable"); + + var request = _fixture.CreateAdminRequest( + 
HttpMethod.Get, $"/admin/ai-quality/agent-runs/{Guid.NewGuid()}"); + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen( + response.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden, + "test user is not admin"); + Assert.Equal(HttpStatusCode.NotFound, response.StatusCode); + } +} diff --git a/tests/TextStack.UnitTests/CrewTranscriptJsonContractTests.cs b/tests/TextStack.UnitTests/CrewTranscriptJsonContractTests.cs new file mode 100644 index 00000000..ed6be5cc --- /dev/null +++ b/tests/TextStack.UnitTests/CrewTranscriptJsonContractTests.cs @@ -0,0 +1,133 @@ +using System.Text.Json; +using TextStack.Ai.Agents; +using TextStack.Ai.Core; + +namespace TextStack.UnitTests; + +/// +/// AI-045 — end-to-end JSON casing contract between the persisted agent_run.steps_json and the +/// admin transcript parser (AiQualityPage.tsx StepTree / SubAgentPanel / CriticReview). +/// +/// The writer (DbAgentRunWriter) serializes the step transcript with a bare +/// JsonSerializer.Serialize(run.Steps) — DEFAULT options, no naming policy. Under default STJ: +/// • record properties keep their PascalCase names (AgentStep → Index/Kind/Payload/At, +/// AgentUsage → Iterations/InputTokensTotal/OutputTokensTotal/CostUsdTotal/LatencyMs); +/// • anonymous objects keep their literal member names (camelCase: stage/agentName/status/usage/ +/// error/steps and the inner llm_response payload's `text`). +/// +/// The frontend reads exactly that MIXED casing. These tests serialize a realistic crew run through the +/// SAME factory + SAME default options and assert every key the frontend depends on, so the build breaks +/// if either side drifts (a mismatch = blank/raw transcript for every run in prod). +/// +public class CrewTranscriptJsonContractTests +{ + // Mirror DbAgentRunWriter exactly: JsonSerializer.Serialize(run.Steps) with no options. 
+ private static JsonElement SerializeSteps(IReadOnlyList steps) => + JsonDocument.Parse(JsonSerializer.Serialize(steps)).RootElement; + + private static AgentStep LlmResponse(string text) => + new(0, "llm_response", JsonSerializer.SerializeToElement(new { text }), DateTimeOffset.UtcNow); + + private static CrewStepEntry SubAgent(int index, string stage, string name, string status, IReadOnlyList steps) => + new(index, stage, name, status, steps, new AgentUsage(1, 100, 40, 0.0012m, 700), null); + + // A realistic 4-stage crew.autopublish run: researcher → drafter → critic → editor, critic carries a + // JSON verdict in its llm_response. + private static AgentRunRecord BuildCrewRun() + { + const string criticJson = """ + {"scores":{"factual_accuracy":4,"tone":5,"length":3,"banned_phrases":5}, + "issues":[{"severity":"major","axis":"length","message":"Too long."}]} + """; + var entries = new List + { + SubAgent(0, "research", "researcher", "completed", [LlmResponse("notes")]), + SubAgent(1, "draft", "drafter", "completed", [LlmResponse("draft body")]), + SubAgent(2, "critique", "critic", "completed", [LlmResponse(criticJson)]), + SubAgent(3, "edit", "editor", "completed", [LlmResponse("final body")]), + }; + var result = new CrewResult( + "final body", CrewRunRecordFactory.StatusCompleted, entries, + new AgentUsage(4, 400, 160, 0.0048m, 2800), null); + + return CrewRunRecordFactory.From( + Guid.NewGuid(), "autopublish", Guid.NewGuid(), Guid.NewGuid(), + "goal text", "final body", result); + } + + [Fact] + public void TopLevelStep_UsesPascalCaseRecordKeys_KindAndPayload() + { + var steps = SerializeSteps(BuildCrewRun().Steps); + + var step0 = steps[0]; + // Frontend reads step.Kind and step.Payload (PascalCase — AgentStep record, default options). 
+ Assert.Equal("sub_agent", step0.GetProperty("Kind").GetString()); + Assert.True(step0.TryGetProperty("Payload", out _), "top step exposes PascalCase Payload"); + Assert.True(step0.TryGetProperty("Index", out _)); + // And NOT camelCase — guards against a future Web/camelCase option leaking onto the writer. + Assert.False(step0.TryGetProperty("kind", out _), "must not be camelCase 'kind'"); + Assert.False(step0.TryGetProperty("payload", out _), "must not be camelCase 'payload'"); + } + + [Fact] + public void SubAgentPayload_UsesCamelCaseAnonymousKeys() + { + var steps = SerializeSteps(BuildCrewRun().Steps); + var payload = steps[2].GetProperty("Payload"); // critic + + // Frontend reads Payload.stage / .agentName / .status / .usage / .steps (camelCase anon members). + Assert.Equal("critique", payload.GetProperty("stage").GetString()); + Assert.Equal("critic", payload.GetProperty("agentName").GetString()); + Assert.Equal("completed", payload.GetProperty("status").GetString()); + Assert.True(payload.TryGetProperty("usage", out _)); + Assert.Equal(JsonValueKind.Array, payload.GetProperty("steps").ValueKind); + } + + [Fact] + public void NestedUsage_UsesPascalCaseRecordKeys() + { + var steps = SerializeSteps(BuildCrewRun().Steps); + var usage = steps[0].GetProperty("Payload").GetProperty("usage"); + + // Frontend reads usage.Iterations / InputTokensTotal / OutputTokensTotal / CostUsdTotal / LatencyMs. 
+ Assert.Equal(1, usage.GetProperty("Iterations").GetInt32()); + Assert.Equal(100, usage.GetProperty("InputTokensTotal").GetInt32()); + Assert.Equal(40, usage.GetProperty("OutputTokensTotal").GetInt32()); + Assert.Equal(0.0012m, usage.GetProperty("CostUsdTotal").GetDecimal()); + Assert.Equal(700, usage.GetProperty("LatencyMs").GetInt32()); + } + + [Fact] + public void InnerLlmResponseStep_PascalCaseKind_CamelCasePayloadText() + { + var steps = SerializeSteps(BuildCrewRun().Steps); + var innerStep = steps[1].GetProperty("Payload").GetProperty("steps")[0]; // drafter's inner step + + // Frontend reads s.Kind (PascalCase) and s.Payload.text (camelCase). + Assert.Equal("llm_response", innerStep.GetProperty("Kind").GetString()); + Assert.Equal("draft body", innerStep.GetProperty("Payload").GetProperty("text").GetString()); + } + + [Fact] + public void CriticInnerText_IsValidJson_WithScoresAndIssues_FrontendKeys() + { + var steps = SerializeSteps(BuildCrewRun().Steps); + var criticText = steps[2].GetProperty("Payload").GetProperty("steps")[0] + .GetProperty("Payload").GetProperty("text").GetString(); + + Assert.NotNull(criticText); + using var verdict = JsonDocument.Parse(criticText!); + var scores = verdict.RootElement.GetProperty("scores"); + // CriticReview reads snake_case score axes verbatim from the LLM text. + Assert.Equal(4, scores.GetProperty("factual_accuracy").GetInt32()); + Assert.Equal(5, scores.GetProperty("tone").GetInt32()); + Assert.Equal(3, scores.GetProperty("length").GetInt32()); + Assert.Equal(5, scores.GetProperty("banned_phrases").GetInt32()); + + var issue = verdict.RootElement.GetProperty("issues")[0]; + Assert.Equal("major", issue.GetProperty("severity").GetString()); + Assert.Equal("length", issue.GetProperty("axis").GetString()); + Assert.Equal("Too long.", issue.GetProperty("message").GetString()); + } +}