From 543e88860eaf143effeadba9b8eda9e96b759378 Mon Sep 17 00:00:00 2001 From: Nisarg Patel Date: Thu, 2 Apr 2026 19:16:41 -0700 Subject: [PATCH 1/2] feat: introduce scope tier options for testing depth and enhance execution reporting - Added `scope` option to CLI for specifying test depth: quick, standard, or thorough. - Updated `runHeadless` and related functions to handle new scope tier logic. - Enhanced execution reporting to include detailed step results and durations. - Implemented auth redirect warning in the browser MCP server. - Introduced new constants for run management in the supervisor module. --- apps/cli/src/data/execution-atom.ts | 34 +++ apps/cli/src/index.tsx | 19 +- apps/cli/src/layers.ts | 2 + apps/cli/src/utils/run-test.ts | 73 +++--- apps/cli/src/utils/write-run-result.ts | 60 +++++ packages/browser/src/mcp/server.ts | 55 +++- packages/shared/src/models.ts | 3 + packages/shared/src/prompts.ts | 333 ++++++++++++++++++------- packages/shared/tests/prompts.test.ts | 47 ++-- packages/supervisor/src/constants.ts | 2 + packages/supervisor/src/executor.ts | 7 +- 11 files changed, 496 insertions(+), 139 deletions(-) create mode 100644 apps/cli/src/utils/write-run-result.ts diff --git a/apps/cli/src/data/execution-atom.ts b/apps/cli/src/data/execution-atom.ts index 13bd79384..266a6c1b9 100644 --- a/apps/cli/src/data/execution-atom.ts +++ b/apps/cli/src/data/execution-atom.ts @@ -11,6 +11,10 @@ import { startReplayProxy } from "../utils/replay-proxy-server"; import { toViewerRunState, pushStepState } from "../utils/push-step-state"; import { extractCloseArtifacts } from "../utils/extract-close-artifacts"; import { loadReplayEvents } from "../utils/load-replay-events"; +import { writeRunResult } from "../utils/write-run-result"; +import { CiResultOutput, CiStepResult } from "@expect/shared/models"; +import { VERSION } from "../constants"; +import { getStepElapsedMs, getTotalElapsedMs } from "../utils/step-elapsed"; const LIVE_VIEW_PORT_MIN = 50000; const 
LIVE_VIEW_PORT_RANGE = 10000; @@ -175,6 +179,36 @@ const executeCore = (input: ExecuteInput) => yield* git.saveTestedFingerprint(); } + const statuses = report.stepStatuses; + const stepResults = report.steps.map((step) => { + const entry = statuses.get(step.id); + const stepStatus = entry?.status ?? ("not-run" as const); + const elapsed = getStepElapsedMs(step); + return new CiStepResult({ + title: step.title, + status: stepStatus, + ...(elapsed !== undefined ? { duration_ms: elapsed } : {}), + ...(stepStatus === "failed" && entry?.summary ? { error: entry.summary } : {}), + }); + }); + + const totalDurationMs = getTotalElapsedMs(report.steps) || durationMs; + const summaryParts = [`${passedCount} passed`, `${failedCount} failed`]; + const resultOutput = new CiResultOutput({ + version: VERSION, + status: report.status, + title: report.title, + duration_ms: totalDurationMs, + steps: stepResults, + artifacts: { + ...(artifacts.videoUrl ? { video: artifacts.videoUrl } : {}), + ...(artifacts.localReplayUrl ? { replay: artifacts.localReplayUrl } : {}), + }, + summary: `${summaryParts.join(", ")} out of ${report.steps.length} step${report.steps.length === 1 ? "" : "s"}`, + }); + + yield* writeRunResult(finalExecuted.id ?? 
crypto.randomUUID(), resultOutput); + return { executedPlan: finalExecuted, report, diff --git a/apps/cli/src/index.tsx b/apps/cli/src/index.tsx index 30b71e225..5f8236c88 100644 --- a/apps/cli/src/index.tsx +++ b/apps/cli/src/index.tsx @@ -3,6 +3,7 @@ import { join } from "node:path"; import { Option } from "effect"; import { Command } from "commander"; import { ChangesFor } from "@expect/supervisor"; +import type { ScopeTier } from "@expect/shared/models"; import { runHeadless } from "./utils/run-test"; import { runInit } from "./commands/init"; import { runAddGithubAction } from "./commands/add-github-action"; @@ -34,12 +35,15 @@ const TARGETS: readonly Target[] = ["unstaged", "branch", "changes"]; type OutputFormat = "text" | "json"; +const SCOPE_TIERS: readonly ScopeTier[] = ["quick", "standard", "thorough"]; + interface CommanderOpts { message?: string; flow?: string; yes?: boolean; agent?: AgentBackend; target?: Target; + scope?: ScopeTier; verbose?: boolean; headed?: boolean; noCookies?: boolean; @@ -63,6 +67,11 @@ const program = new Command() "agent provider to use (claude, codex, copilot, gemini, cursor, opencode, or droid)", ) .option("-t, --target ", "what to test: unstaged, branch, or changes", "changes") + .option( + "-s, --scope ", + "test depth: quick (one check, ~30s), standard (primary + follow-ups), thorough (full audit)", + "standard", + ) .option("--verbose", "enable verbose logging") .option("--headed", "show a visible browser window during tests") .option("--no-cookies", "skip system browser cookie extraction") @@ -80,7 +89,8 @@ Examples: $ expect --headed -m "smoke test" -y run with a visible browser $ expect --target branch test all branch changes $ expect --target unstaged test unstaged changes - $ expect --no-cookies -m "test" -y skip system browser cookie extraction + $ expect --scope quick -m "check the button" -y fast focused test (~30s) + $ expect --scope thorough --target branch full audit before merge $ expect -u 
http://localhost:3000 -m "test" -y specify dev server URL directly $ expect watch -m "test the login flow" watch mode`, ); @@ -113,6 +123,12 @@ const runHeadlessForTarget = async (target: Target, opts: CommanderOpts) => { ? Option.some(CI_EXECUTION_TIMEOUT_MS) : Option.none(); + const scopeTier = opts.scope ?? "standard"; + if (!SCOPE_TIERS.includes(scopeTier)) { + console.error(`Unknown scope tier: ${scopeTier}. Use ${SCOPE_TIERS.join(", ")}.`); + process.exit(1); + } + const { changesFor } = await resolveChangesFor(target); return runHeadless({ changesFor, @@ -123,6 +139,7 @@ const runHeadlessForTarget = async (target: Target, opts: CommanderOpts) => { ci: ciMode, timeoutMs, output: opts.output ?? "text", + scopeTier, baseUrl: opts.url?.join(", "), }); }; diff --git a/apps/cli/src/layers.ts b/apps/cli/src/layers.ts index 495144db3..43e77c91c 100644 --- a/apps/cli/src/layers.ts +++ b/apps/cli/src/layers.ts @@ -4,6 +4,7 @@ import { Executor, FlowStorage, Git, Reporter, Updates, Watch } from "@expect/su import { Agent, AgentBackend } from "@expect/agent"; import { RrVideo } from "@expect/browser"; import { Analytics, DebugFileLoggerLayer, Tracing } from "@expect/shared/observability"; +import * as NodeServices from "@effect/platform-node/NodeServices"; export const layerCli = ({ verbose, agent }: { verbose: boolean; agent: AgentBackend }) => { const gitLayer = Git.withRepoRoot(process.cwd()); @@ -25,6 +26,7 @@ export const layerCli = ({ verbose, agent }: { verbose: boolean; agent: AgentBac Layer.provide(Agent.layerFor(agent ?? "claude")), Layer.provide(DebugFileLoggerLayer), Layer.provide(Tracing.layerAxiom("expect-cli")), + Layer.provideMerge(NodeServices.layer), Layer.provideMerge(Layer.succeed(References.MinimumLogLevel, verbose ? 
"All" : "Error")), ); }; diff --git a/apps/cli/src/utils/run-test.ts b/apps/cli/src/utils/run-test.ts index ec6887097..128c163d4 100644 --- a/apps/cli/src/utils/run-test.ts +++ b/apps/cli/src/utils/run-test.ts @@ -1,5 +1,5 @@ import { Config, Effect, Option, Stream, Schema } from "effect"; -import { type ChangesFor, CiResultOutput, CiStepResult } from "@expect/shared/models"; +import { type ChangesFor, CiResultOutput, CiStepResult, type ScopeTier } from "@expect/shared/models"; import { Executor, ExecutedTestPlan, Reporter, Github } from "@expect/supervisor"; import { Analytics } from "@expect/shared/observability"; import type { AgentBackend } from "@expect/agent"; @@ -13,6 +13,7 @@ import { createCiReporter } from "./ci-reporter"; import { writeGhaOutputs, writeGhaStepSummary } from "./gha-output"; import { getStepElapsedMs, getTotalElapsedMs } from "./step-elapsed"; import { formatElapsedTime } from "./format-elapsed-time"; +import { writeRunResult } from "./write-run-result"; class ExecutionTimeoutError extends Schema.ErrorClass( "ExecutionTimeoutError", @@ -34,6 +35,7 @@ interface HeadlessRunOptions { ci: boolean; timeoutMs: Option.Option; output: "text" | "json"; + scopeTier: ScopeTier; baseUrl?: string; } @@ -132,6 +134,7 @@ export const runHeadless = (options: HeadlessRunOptions) => instruction: options.instruction, isHeadless: !options.headed, cookieBrowserKeys: [], + scopeTier: options.scopeTier, baseUrl: options.baseUrl, }) .pipe( @@ -366,39 +369,47 @@ export const runHeadless = (options: HeadlessRunOptions) => ); } - if (isJsonOutput) { - const stepResults = report.steps.map((step) => { - const entry = statuses.get(step.id); - const stepStatus = entry?.status ?? ("not-run" as const); - const elapsed = getStepElapsedMs(step); - return new CiStepResult({ - title: step.title, - status: stepStatus, - ...(elapsed !== undefined ? { duration_ms: elapsed } : {}), - ...(stepStatus === "failed" && entry?.summary ? 
{ error: entry.summary } : {}), - }); + const stepResults = report.steps.map((step) => { + const entry = statuses.get(step.id); + const stepStatus = entry?.status ?? ("not-run" as const); + const elapsed = getStepElapsedMs(step); + return new CiStepResult({ + title: step.title, + status: stepStatus, + ...(elapsed !== undefined ? { duration_ms: elapsed } : {}), + ...(stepStatus === "failed" && entry?.summary ? { error: entry.summary } : {}), }); + }); - const summaryParts = [`${passedCount} passed`, `${failedCount} failed`]; - if (skippedCount > 0) summaryParts.push(`${skippedCount} skipped`); - const summaryText = `${summaryParts.join(", ")} out of ${report.steps.length} step${report.steps.length === 1 ? "" : "s"}`; - - const resultOutput = new CiResultOutput({ - version: VERSION, - status: report.status, - title: report.title, - duration_ms: totalDurationMs, - steps: stepResults, - artifacts: { - ...(effectiveVideoPath ? { video: effectiveVideoPath } : {}), - ...(artifacts.replayPath ? { replay: artifacts.replayPath } : {}), - ...(artifacts.screenshotPaths.length > 0 - ? { screenshots: [...artifacts.screenshotPaths] } - : {}), - }, - summary: summaryText, - }); + const summaryParts = [`${passedCount} passed`, `${failedCount} failed`]; + if (skippedCount > 0) summaryParts.push(`${skippedCount} skipped`); + const summaryText = `${summaryParts.join(", ")} out of ${report.steps.length} step${report.steps.length === 1 ? "" : "s"}`; + + const resultOutput = new CiResultOutput({ + version: VERSION, + status: report.status, + title: report.title, + duration_ms: totalDurationMs, + steps: stepResults, + artifacts: { + ...(effectiveVideoPath ? { video: effectiveVideoPath } : {}), + ...(artifacts.replayPath ? { replay: artifacts.replayPath } : {}), + ...(artifacts.screenshotPaths.length > 0 + ? { screenshots: [...artifacts.screenshotPaths] } + : {}), + }, + summary: summaryText, + }); + + const runResultPath = yield* writeRunResult( + finalExecuted.id ?? 
crypto.randomUUID(), + resultOutput, + ); + if (!isJsonOutput) { + process.stderr.write(`Run result: ${runResultPath}\n`); + } + if (isJsonOutput) { const jsonString = JSON.stringify( Schema.encodeSync(CiResultOutput)(resultOutput), undefined, diff --git a/apps/cli/src/utils/write-run-result.ts b/apps/cli/src/utils/write-run-result.ts new file mode 100644 index 000000000..192228984 --- /dev/null +++ b/apps/cli/src/utils/write-run-result.ts @@ -0,0 +1,60 @@ +import * as path from "node:path"; +import { Effect, Option, Schema } from "effect"; +import { FileSystem } from "effect/FileSystem"; +import { CiResultOutput } from "@expect/shared/models"; + +// Persists structured run results to .expect/runs/{planId}.json so outer +// agents (Cursor, Claude Code, Codex) can read a single file instead of +// polling terminal output. Each run gets a unique planId (UUID), enabling +// parallel agent sessions without file conflicts. + +const EXPECT_STATE_DIR = ".expect"; +const EXPECT_RUNS_DIR = "runs"; +const EXPECT_RUNS_MAX_KEPT = 20; + +export const writeRunResult = Effect.fn("writeRunResult")(function* ( + planId: string, + resultOutput: CiResultOutput, +) { + const fileSystem = yield* FileSystem; + const runsDir = path.join(process.cwd(), EXPECT_STATE_DIR, EXPECT_RUNS_DIR); + + yield* fileSystem.makeDirectory(runsDir, { recursive: true }); + + const filePath = path.join(runsDir, `${planId}.json`); + const jsonString = JSON.stringify(Schema.encodeSync(CiResultOutput)(resultOutput), undefined, 2); + yield* fileSystem.writeFileString(filePath, jsonString + "\n"); + + yield* pruneOldRuns(runsDir); + + return filePath; +}); + +const pruneOldRuns = Effect.fn("pruneOldRuns")(function* (runsDir: string) { + const fileSystem = yield* FileSystem; + + const entries = yield* fileSystem.readDirectory(runsDir); + const jsonFiles = entries.filter((file) => file.endsWith(".json")); + + if (jsonFiles.length <= EXPECT_RUNS_MAX_KEPT) return; + + const withStats = yield* Effect.forEach( + 
jsonFiles, + (file) => + Effect.gen(function* () { + const filePath = path.join(runsDir, file); + const stat = yield* fileSystem.stat(filePath); + const mtime = Option.getOrElse(stat.mtime, () => new Date(0)); + return { filePath, mtime: mtime.getTime() }; + }), + { concurrency: "unbounded" }, + ); + + withStats.sort((left, right) => right.mtime - left.mtime); + + yield* Effect.forEach( + withStats.slice(EXPECT_RUNS_MAX_KEPT), + (entry) => fileSystem.remove(entry.filePath), + { concurrency: "unbounded" }, + ); +}); diff --git a/packages/browser/src/mcp/server.ts b/packages/browser/src/mcp/server.ts index fb8bc6be9..2c2131247 100644 --- a/packages/browser/src/mcp/server.ts +++ b/packages/browser/src/mcp/server.ts @@ -36,6 +36,42 @@ const imageResult = (base64: string) => ({ content: [{ type: "image" as const, data: base64, mimeType: "image/png" }], }); +const AUTH_PAGE_INDICATORS = [ + "login", + "signin", + "sign-in", + "sign_in", + "auth", + "authenticate", + "sso", + "oauth", +]; + +const buildAuthRedirectWarning = (requestedUrl: string, currentUrl: string): string => { + try { + const requestedOrigin = new URL(requestedUrl).origin; + const currentOrigin = new URL(currentUrl).origin; + const currentPath = new URL(currentUrl).pathname.toLowerCase(); + + const redirectedToAuthPage = + currentOrigin !== requestedOrigin || + AUTH_PAGE_INDICATORS.some( + (indicator) => currentPath.includes(indicator) && !requestedUrl.includes(indicator), + ); + + if (!redirectedToAuthPage) return ""; + + return ( + `\n\n⚠️ AUTH REDIRECT DETECTED: Page redirected to ${currentUrl} instead of staying at ${requestedUrl}. ` + + `This likely means authentication is required. ` + + `If tests need authenticated access, re-run with cookie injection (--cookies) or ensure the dev server allows unauthenticated access. 
` + + `You should emit STEP_SKIPPED with category=auth-blocked for any steps that require authentication.` + ); + } catch { + return ""; + } +}; + const AsyncFunction = Object.getPrototypeOf(async () => {}).constructor; // Tool annotations (readOnlyHint, destructiveHint) enable parallel execution in the Claude Agent SDK. @@ -106,13 +142,24 @@ export const createBrowserMcpServer = ( cdpUrl, browserType, }); + + const page = yield* session.requirePage(); + const authWarning = buildAuthRedirectWarning(url, page.url()); + if (authWarning) { + yield* Effect.logWarning("Auth redirect detected", { + requestedUrl: url, + currentUrl: page.url(), + }); + } + const engineSuffix = browserType && browserType !== "chromium" ? ` [${browserType}]` : ""; const cdpSuffix = cdpUrl ? ` (connected via CDP: ${cdpUrl})` : ""; + const cookieSuffix = + result.injectedCookieCount > 0 + ? ` (${result.injectedCookieCount} cookies synced from local browser)` + : ""; return textResult( - `Opened ${url}${engineSuffix}${cdpSuffix}` + - (result.injectedCookieCount > 0 - ? 
` (${result.injectedCookieCount} cookies synced from local browser)` - : ""), + `Opened ${url}${engineSuffix}${cdpSuffix}${cookieSuffix}${authWarning}`, ); }).pipe(Effect.withSpan(`mcp.tool.open`)), ), diff --git a/packages/shared/src/models.ts b/packages/shared/src/models.ts index eff1105a8..9313f0dbb 100644 --- a/packages/shared/src/models.ts +++ b/packages/shared/src/models.ts @@ -326,6 +326,9 @@ export const ChangesFor = Schema.TaggedUnion({ }); export type ChangesFor = typeof ChangesFor.Type; +export const ScopeTier = Schema.Literals(["quick", "standard", "thorough"] as const); +export type ScopeTier = typeof ScopeTier.Type; + export const changesForDisplayName = (changesFor: ChangesFor): string => Match.value(changesFor).pipe( Match.tagsExhaustive({ diff --git a/packages/shared/src/prompts.ts b/packages/shared/src/prompts.ts index 5538b9dbb..20b3a3a8e 100644 --- a/packages/shared/src/prompts.ts +++ b/packages/shared/src/prompts.ts @@ -3,6 +3,7 @@ import type { ChangesFor, CommitSummary, SavedFlow, + ScopeTier, TestCoverageReport, } from "./models"; @@ -20,6 +21,7 @@ export interface DevServerHint { export interface ExecutionPromptOptions { readonly userInstruction: string; readonly scope: ChangesFor["_tag"]; + readonly scopeTier: ScopeTier; readonly currentBranch: string; readonly mainBranch: string | undefined; readonly changedFiles: readonly ChangedFile[]; @@ -53,27 +55,52 @@ const formatSavedFlowGuidance = (savedFlow: SavedFlow | undefined): string[] => ]; }; -const getScopeStrategy = (scope: ChangesFor["_tag"]): string[] => { +const getScopeStrategy = (scope: ChangesFor["_tag"], scopeTier: ScopeTier): string[] => { + if (scopeTier === "quick") { + return [ + "- This is a quick, focused test. Verify ONLY the exact change described in the developer request.", + "- One flow, one verification. 
Do not test adjacent flows, edge cases, or unrelated features.", + "- Navigate to the specific URL, verify the specific behavior, emit RUN_COMPLETED.", + "- Speed is the priority. The developer wants a fast confirmation, not a comprehensive audit.", + ]; + } + switch (scope) { case "Commit": return [ "- Start narrow and prove the selected commit's intended change works first.", "- Treat the selected commit and its touched files as the primary testing hypothesis.", - "- After the primary flow, test 2-4 adjacent flows that could regress from the same change. Think about what else touches the same components, routes, or data.", - "- For UI changes, verify related views that render the same data or share the same components.", + ...(scopeTier === "thorough" + ? [ + "- After the primary flow, test 2-4 adjacent flows that could regress from the same change. Think about what else touches the same components, routes, or data.", + "- For UI changes, verify related views that render the same data or share the same components.", + ] + : [ + "- After the primary flow, test 1-2 adjacent flows that could regress from the same change.", + ]), ]; case "WorkingTree": return [ "- Start with the exact user-requested flow against the local in-progress changes.", - "- After the primary flow, test related flows that exercise the same code paths — aim for 2-3 follow-ups.", - "- Pay extra attention to partially-implemented features: check that incomplete states don't break existing behavior.", + ...(scopeTier === "thorough" + ? 
[ + "- After the primary flow, test related flows that exercise the same code paths — aim for 2-3 follow-ups.", + "- Pay extra attention to partially-implemented features: check that incomplete states don't break existing behavior.", + ] + : ["- After the primary flow, test 1 related flow that exercises the same code paths."]), ]; case "Changes": return [ "- Treat committed and uncommitted work as one body of change.", "- Cover the requested flow first, then the highest-risk adjacent flows.", - "- Test 2-4 follow-up flows, prioritizing paths that share components or data with the changed files.", - "- If the changes touch shared utilities or layouts, verify multiple pages that use them.", + ...(scopeTier === "thorough" + ? [ + "- Test 2-4 follow-up flows, prioritizing paths that share components or data with the changed files.", + "- If the changes touch shared utilities or layouts, verify multiple pages that use them.", + ] + : [ + "- Test 1-2 follow-up flows, prioritizing paths that share components or data with the changed files.", + ]), ]; default: return [ @@ -112,22 +139,31 @@ const formatTestCoverageSection = (testCoverage: TestCoverageReport | undefined) return lines; }; -export const buildExecutionSystemPrompt = (browserMcpServerName?: string): string => { - const mcpName = browserMcpServerName ?? DEFAULT_BROWSER_MCP_SERVER_NAME; +export interface SystemPromptOptions { + readonly browserMcpServerName?: string; + readonly scopeTier?: ScopeTier; +} + +const buildChangeAnalysis = (): string[] => [ + "", + "The diff preview, changed files list, and recent commits are already provided in the prompt. Do NOT call tools to re-read or re-diff those files — all the context you need to plan is already here.", + "- Scan the provided changed files list and diff preview to identify what behavior changed and which user flows to test.", + "- Group related files into concrete flows. 
A flow is an end-to-end path with a clear entry point, user action, and observable outcome.", + "- Treat the diff as the source of truth. The developer request is a starting point, not the full scope.", + "- Files without existing automated tests are higher risk. Give them deeper browser coverage when they touch runtime behavior.", + "", +]; + +const buildCoverageRules = (scopeTier: ScopeTier): string[] => { + if (scopeTier === "quick") { + return [ + "", + "Verify only the specific behavior described in the developer request. Do not expand scope beyond the requested change.", + "", + ]; + } return [ - "You are a QA engineer testing code changes in a real browser. Your job is to find bugs the developer missed, not confirm the happy path works.", - "", - "You have two documented failure patterns. First, happy-path seduction: the page loads, the primary flow works, and you emit RUN_COMPLETED without testing edge cases, viewports, or adjacent flows — the easy 80% passes and the bugs hide in the untested 20%. Second, soft failures: a check fails but the page 'mostly works,' so you emit STEP_DONE instead of ASSERTION_FAILED, hiding the bug from the developer.", - "", - "", - "The diff preview, changed files list, and recent commits are already provided in the prompt. Do NOT call tools to re-read or re-diff those files — all the context you need to plan is already here.", - "- Scan the provided changed files list and diff preview to identify what behavior changed and which user flows to test.", - "- Group related files into concrete flows. A flow is an end-to-end path with a clear entry point, user action, and observable outcome.", - "- Treat the diff as the source of truth. The developer request is a starting point, not the full scope.", - "- Files without existing automated tests are higher risk. 
Give them deeper browser coverage when they touch runtime behavior.", - "", - "", "", "Minimum bar: every changed route, page, form, mutation, API interaction, auth gate, shared component, shared hook, or shared utility that affects runtime behavior must be covered by at least one tested flow or one code-level check.", "- When shared code changes, test multiple consumers instead of one happy path.", @@ -135,11 +171,31 @@ export const buildExecutionSystemPrompt = (browserMcpServerName?: string): strin "- If a diff changes persistence or mutations, verify the before/after state and one durability check (refresh, revisit, or back-navigation).", "- If multiple files implement one feature, test the full user journey end-to-end instead of isolated clicks.", "", - "", + ]; +}; + +const buildExecutionStrategy = (scopeTier: ScopeTier): string[] => { + if (scopeTier === "quick") { + return [ + "", + "- Navigate directly to the page affected by the change. Verify the specific behavior the developer asked about.", + "- Execution style is assertion-first: navigate, act, then validate.", + "- Create your own step structure while executing. Use stable sequential IDs like step-01, step-02, step-03.", + "- Use playwright to return structured evidence: current URL, page title, and visibility of the target element.", + "- Do not test unrelated features, adjacent flows, or edge cases. One focused verification is the goal.", + "", + ]; + } + + return [ "", "- First master the primary flow the developer asked for. Verify it thoroughly before moving on.", "- Once the primary flow passes, test additional related flows suggested by the changed files, diff semantics, and route context. The scope strategy below specifies how many.", - "- For each flow, test both the happy path AND at least one edge case or negative path (e.g. empty input, missing data, back-navigation, double-click, refresh mid-flow).", + ...(scopeTier === "thorough" + ? 
[ + "- For each flow, test both the happy path AND at least one edge case or negative path (e.g. empty input, missing data, back-navigation, double-click, refresh mid-flow).", + ] + : []), "- Use the same browser session throughout unless the app forces you into a different path.", "- Execution style is assertion-first: navigate, act, then validate before moving on.", "- Create your own step structure while executing. Use stable sequential IDs like step-01, step-02, step-03.", @@ -148,35 +204,52 @@ export const buildExecutionSystemPrompt = (browserMcpServerName?: string): strin "- Use playwright to return structured evidence: current URL, page title, and visibility of the target element.", "- If the changed files suggest specific behavior (e.g. a validation rule, a redirect, a computed value), test that specific behavior rather than just the surrounding UI.", "", - "", - "", - "Every page you test MUST have real data. If a page shows an empty state, zero records, or placeholder content, seed it before testing. An empty-state screenshot is not a test — it is a skip.", - "", - "1. Navigate to the target page. Snapshot. If data exists and is sufficient, proceed to testing.", - "2. If empty or insufficient: find the creation flow ('Add', 'New', 'Create', 'Import') and use it. If the app exposes an API you can call via playwright's page.evaluate(fetch(...)), prefer that for speed.", - "3. Create the full dependency chain top-down. A paystub requires company → employee → payroll run → paystub. Do not skip intermediate objects.", - "4. Create MINIMUM 3 records. One record hides pagination, sorting, bulk-action, and empty-vs-populated bugs.", - "5. After seeding, return to the target page and snapshot. If the data does not appear, emit ASSERTION_FAILED — the creation flow is broken.", - "6. Prefix every seed step with [Setup]: STEP_START|step-01|[Setup] Create employee with adversarial name", - "", - "Adversarial seed values — each record MUST use a different category. 
Rotate across your 3+ records:", - "- Unicode stress: German umlauts + hyphen ('Günther Müller-Lüdenscheid'), Arabic RTL ('مريم الفارسي'), CJK ('田中太郎'), Zalgo combining chars ('T̸̢̧ë̵̡s̶̨̛t̷̢̛')", - "- Boundary values: 0, -1, 999999999.99, 0.001 for numbers. Empty string and 5000+ chars for text. '' for XSS.", - "- Edge dates: '1970-01-01' (epoch), a date in the current month, and an obviously invalid date if the field allows free input.", - "- Truncation: 100+ character email, 200+ character name, max-length strings. These catch overflow and ellipsis bugs.", - "- Dropdowns: always select the LAST option at least once — it is the least tested.", - "", - "Bad: navigate to /employees, see 'No employees yet', screenshot, emit STEP_DONE|step-01|employee list page renders correctly.", - "Good: navigate to /employees, see 'No employees yet', find 'Add Employee' button, create 3 employees with adversarial names, return to /employees, verify all 3 appear in the table, THEN test the actual feature.", - "", - "Rationalizations you will reach for — recognize them and do the opposite:", - "- 'The empty state renders correctly' — you were not asked to test the empty state. Seed data.", - "- 'One record is enough to verify the feature' — one record hides half the bugs. Three is the minimum.", - "- 'Creating data will take too long' — testing against empty data wastes the entire run. Seed first.", - "- 'I don't have the right permissions to create data' — try the creation flow first. Only emit STEP_SKIPPED with category=missing-test-data if it actually fails.", - "- 'The developer probably has data in their environment' — you do not know that. Check and seed.", - "", - "", + ]; +}; + +const buildDataSeeding = (): string[] => [ + "", + "Every page you test MUST have real data. If a page shows an empty state, zero records, or placeholder content, seed it before testing. An empty-state screenshot is not a test — it is a skip.", + "", + "1. Navigate to the target page. Snapshot. 
If data exists and is sufficient, proceed to testing.", + "2. If empty or insufficient: find the creation flow ('Add', 'New', 'Create', 'Import') and use it. If the app exposes an API you can call via playwright's page.evaluate(fetch(...)), prefer that for speed.", + "3. Create the full dependency chain top-down. A paystub requires company → employee → payroll run → paystub. Do not skip intermediate objects.", + "4. Create MINIMUM 3 records. One record hides pagination, sorting, bulk-action, and empty-vs-populated bugs.", + "5. After seeding, return to the target page and snapshot. If the data does not appear, emit ASSERTION_FAILED — the creation flow is broken.", + '6. Prefix every seed step with [Setup]: STEP_START|step-01|[Setup] Create employee with adversarial name', + "", + "Adversarial seed values — each record MUST use a different category. Rotate across your 3+ records:", + "- Unicode stress: German umlauts + hyphen ('Günther Müller-Lüdenscheid'), Arabic RTL ('مريم الفارسي'), CJK ('田中太郎'), Zalgo combining chars ('T̸̢̧ë̵̡s̶̨̛t̷̢̛')", + "- Boundary values: 0, -1, 999999999.99, 0.001 for numbers. Empty string and 5000+ chars for text. '' for XSS.", + "- Edge dates: '1970-01-01' (epoch), a date in the current month, and an obviously invalid date if the field allows free input.", + "- Truncation: 100+ character email, 200+ character name, max-length strings. 
These catch overflow and ellipsis bugs.", + "- Dropdowns: always select the LAST option at least once — it is the least tested.", + "", + "Bad: navigate to /employees, see 'No employees yet', screenshot, emit STEP_DONE|step-01|employee list page renders correctly.", + "Good: navigate to /employees, see 'No employees yet', find 'Add Employee' button, create 3 employees with adversarial names, return to /employees, verify all 3 appear in the table, THEN test the actual feature.", + "", + "Rationalizations you will reach for — recognize them and do the opposite:", + "- 'The empty state renders correctly' — you were not asked to test the empty state. Seed data.", + "- 'One record is enough to verify the feature' — one record hides half the bugs. Three is the minimum.", + "- 'Creating data will take too long' — testing against empty data wastes the entire run. Seed first.", + "- 'I don't have the right permissions to create data' — try the creation flow first. Only emit STEP_SKIPPED with category=missing-test-data if it actually fails.", + "- 'The developer probably has data in their environment' — you do not know that. Check and seed.", + "", +]; + +const buildUiQualityRules = (scopeTier: ScopeTier): string[] => { + if (scopeTier === "standard") { + return [ + "", + "After completing the primary functional tests, run a quick UI quality check when the diff touches files that affect visual output (components, styles, layouts, templates, routes). Skip this section when the diff only changes backend logic, build config, or tests.", + "", + "1. Design system conformance: inspect for tailwind.config, CSS custom properties, component libraries, token files. Verify changed elements use the system's tokens. Flag hardcoded hex/rgb colors, pixel spacing, or font-family declarations that bypass the design system.", + "2. Responsive design: test at two viewports using page.setViewportSize: 375×812 (mobile) and 1280×800 (desktop). 
Verify no horizontal overflow, no overlapping elements, text readable, interactive elements accessible.", + "", + ]; + } + + return [ "", "After completing the primary functional tests, run a dedicated UI quality pass when the diff touches files that affect visual output (components, styles, layouts, templates, routes). Skip this section when the diff only changes backend logic, build config, or tests. When applicable, these checks are mandatory. Emit each as its own step.", "", @@ -188,20 +261,122 @@ export const buildExecutionSystemPrompt = (browserMcpServerName?: string): strin "6. Layout stability (CLS): after networkidle, measure cumulative layout shift via PerformanceObserver. CLS above 0.1 is a failure, 0.05-0.1 is a warning. If high, screenshot immediately and 3 seconds later.", "7. Font loading: after networkidle, check document.fonts API. Every font must have status 'loaded'. Verify @font-face or preload tags exist. Flag system-font-only text unless the design system specifies a system stack.", "", + ]; +}; + +const buildTools = (mcpName: string, scopeTier: ScopeTier): string[] => [ + ``, + "1. open: launch a browser and navigate to a URL. Pass browser='webkit' or browser='firefox' to launch a non-Chromium engine (e.g. for cross-browser testing). Close the current session first before switching engines.", + "2. playwright: execute Playwright code. Globals: page, context, browser, ref(id). Set snapshotAfter=true to auto-snapshot after execution.", + "3. screenshot: capture page state. Modes: 'snapshot' (ARIA tree, preferred), 'screenshot' (PNG), 'annotated' (PNG with labels).", + "4. console_logs: get browser console messages. Filter by type ('error', 'warning', 'log').", + "5. network_requests: get captured requests with automatic issue detection (4xx/5xx, duplicates, mixed content).", + "6. performance_metrics: collect Web Vitals, TTFB, Long Animation Frames (LoAF), resource breakdown.", + "7. 
accessibility_audit: run WCAG audit (axe-core + IBM Equal Access). Returns violations with selectors and fix guidance.", + "8. close: close the browser and end the session.", + "", + "Prefer screenshot mode 'snapshot' for observing page state. Use 'screenshot' or 'annotated' only for purely visual checks (layout, colors, images).", + ...(scopeTier === "thorough" + ? ["After each step, call console_logs with type 'error' to catch unexpected errors."] + : []), + ``, +]; + +const buildRunCompletion = (scopeTier: ScopeTier): string[] => { + if (scopeTier === "quick") { + return [ + "", + "Before emitting RUN_COMPLETED:", + "1. If a browser session was opened, call close exactly once to flush the session video to disk.", + "Do not emit RUN_COMPLETED until the close call is done.", + "", + ]; + } + + if (scopeTier === "standard") { + return [ + "", + "Before emitting RUN_COMPLETED, complete all of these steps:", + "1. Call accessibility_audit to check for WCAG violations. Report critical or serious violations as ASSERTION_FAILED steps.", + "2. If a browser session was opened, call close exactly once to flush the session video to disk.", + "3. Review the changed files list and confirm every file is accounted for by a tested flow, a code-level check, or an explicit blocker with evidence.", + "Do not emit RUN_COMPLETED until all steps above are done.", + "", + ]; + } + + return [ + "", + "Before emitting RUN_COMPLETED, complete all of these steps:", + "1. Call accessibility_audit to check for WCAG violations. Report critical or serious violations as ASSERTION_FAILED steps.", + "2. Call performance_metrics to collect the performance trace. If any Web Vital is rated 'poor' or any LoAF has blockingDuration > 150ms, report it as an ASSERTION_FAILED step.", + "3. Run the project healthcheck: read package.json to find test/check scripts, identify the package manager from lock files, and run it. Report pass/fail as a step.", + "4. 
If a browser session was opened, call close exactly once to flush the session video to disk.", + "5. Review the changed files list and confirm every file is accounted for by a tested flow, a code-level check, or an explicit blocker with evidence.", + "Do not emit RUN_COMPLETED until all steps above are done.", + "", + ]; +}; + +const buildRecognizeRationalizations = (scopeTier: ScopeTier): string[] => { + if (scopeTier === "quick") { + return [ + "", + '- "The page loaded successfully" — loading is not verification. Check the specific behavior the diff changed.', + '- "I already checked this visually" — visual checks without structured evidence are not verification. Use playwright to return concrete data.', + "If you catch yourself narrating what you would test instead of running a tool call, stop. Run the tool call.", + "", + ]; + } + + return [ + "", + "You will feel the urge to skip checks or soften results. These are the exact excuses you reach for — recognize them and do the opposite:", + '- "The page loaded successfully" — loading is not verification. Check the specific behavior the diff changed.', + ...(scopeTier === "thorough" + ? [ + '- "This viewport looks fine" — did you check all required viewports? Skipping one is not testing it.', + '- "This styling change is too small to need all 7 checks" — if the diff touches visual files, every applicable check runs regardless of change size.', + ] + : []), + '- "The test coverage section shows this file is already tested" — existing tests are written by the developer. Your job is to catch what they missed.', + '- "The primary flow passed, so the feature works" — the primary flow is the easy 80%. Test the adjacent flows.', + '- "I already checked this visually" — visual checks without structured evidence are not verification. Use playwright to return concrete data.', + "If you catch yourself narrating what you would test instead of running a tool call, stop. 
Run the tool call.", + "", + ]; +}; + +export const buildExecutionSystemPrompt = (options?: SystemPromptOptions): string => { + const mcpName = options?.browserMcpServerName ?? DEFAULT_BROWSER_MCP_SERVER_NAME; + const scopeTier = options?.scopeTier ?? "standard"; + + const intro = + scopeTier === "quick" + ? "You are a QA engineer running a quick, focused browser test on a specific code change. Verify exactly what the developer asked — nothing more." + : "You are a QA engineer testing code changes in a real browser. Your job is to find bugs the developer missed, not confirm the happy path works."; + + const failurePatterns = + scopeTier === "quick" + ? [] + : [ + "", + "You have two documented failure patterns. First, happy-path seduction: the page loads, the primary flow works, and you emit RUN_COMPLETED without testing edge cases, viewports, or adjacent flows — the easy 80% passes and the bugs hide in the untested 20%. Second, soft failures: a check fails but the page 'mostly works,' so you emit STEP_DONE instead of ASSERTION_FAILED, hiding the bug from the developer.", + ]; + + return [ + intro, + ...failurePatterns, + "", + ...buildChangeAnalysis(), "", - ``, - "1. open: launch a browser and navigate to a URL. Pass browser='webkit' or browser='firefox' to launch a non-Chromium engine (e.g. for cross-browser testing). Close the current session first before switching engines.", - "2. playwright: execute Playwright code. Globals: page, context, browser, ref(id). Set snapshotAfter=true to auto-snapshot after execution.", - "3. screenshot: capture page state. Modes: 'snapshot' (ARIA tree, preferred), 'screenshot' (PNG), 'annotated' (PNG with labels).", - "4. console_logs: get browser console messages. Filter by type ('error', 'warning', 'log').", - "5. network_requests: get captured requests with automatic issue detection (4xx/5xx, duplicates, mixed content).", - "6. 
performance_metrics: collect Web Vitals, TTFB, Long Animation Frames (LoAF), resource breakdown.", - "7. accessibility_audit: run WCAG audit (axe-core + IBM Equal Access). Returns violations with selectors and fix guidance.", - "8. close: close the browser and end the session.", + ...buildCoverageRules(scopeTier), "", - "Prefer screenshot mode 'snapshot' for observing page state. Use 'screenshot' or 'annotated' only for purely visual checks (layout, colors, images).", - "After each step, call console_logs with type 'error' to catch unexpected errors.", - "", + ...buildExecutionStrategy(scopeTier), + "", + ...(scopeTier !== "quick" ? [...buildDataSeeding(), ""] : []), + ...(scopeTier !== "quick" ? [...buildUiQualityRules(scopeTier), ""] : []), + ...buildTools(mcpName, scopeTier), "", "", "1. Call screenshot mode='snapshot' to get the ARIA tree with refs like [ref=e4].", @@ -222,16 +397,7 @@ export const buildExecutionSystemPrompt = (browserMcpServerName?: string): strin "If changes are mixed, browser-test the UI parts and code-test the rest.", "", "", - "", - "You will feel the urge to skip checks or soften results. These are the exact excuses you reach for — recognize them and do the opposite:", - '- "The page loaded successfully" — loading is not verification. Check the specific behavior the diff changed.', - '- "This viewport looks fine" — did you check all required viewports? Skipping one is not testing it.', - '- "The test coverage section shows this file is already tested" — existing tests are written by the developer. Your job is to catch what they missed.', - '- "This styling change is too small to need all 7 checks" — if the diff touches visual files, every applicable check runs regardless of change size.', - '- "The primary flow passed, so the feature works" — the primary flow is the easy 80%. Test the adjacent flows.', - '- "I already checked this visually" — visual checks without structured evidence are not verification. 
Use playwright to return concrete data.', - "If you catch yourself narrating what you would test instead of running a tool call, stop. Run the tool call.", - "", + ...buildRecognizeRationalizations(scopeTier), "", "", "- After navigation or major UI changes, wait for the page to settle (await page.waitForLoadState('networkidle')).", @@ -241,7 +407,11 @@ export const buildExecutionSystemPrompt = (browserMcpServerName?: string): strin "- If still blocked after one retry, classify the blocker with one allowed failure category and emit ASSERTION_FAILED.", "- Do not repeat the same failing action without new evidence (fresh snapshot, different ref, changed page state).", "- If four attempts fail or progress stalls, stop and report what you observed, what blocked progress, and the most likely next step.", - "- If you encounter missing test data (empty lists, no records, 'no results' states), treat it as a resolvable blocker — follow the procedure before giving up.", + ...(scopeTier !== "quick" + ? [ + "- If you encounter missing test data (empty lists, no records, 'no results' states), treat it as a resolvable blocker — follow the procedure before giving up.", + ] + : []), "- If you encounter a hard blocker (login, passkey, captcha, permissions), stop and report it instead of improvising.", "", "", @@ -278,15 +448,7 @@ export const buildExecutionSystemPrompt = (browserMcpServerName?: string): strin "Good: ASSERTION_FAILED|step-03|category=app-bug; domain=responsive; expected=Submit button visible at 375px; actual=button clipped by overflow:hidden on .form-container; url=http://localhost:3000/login; evidence=snapshot ref=e4 width=0; repro=resize to 375×812, open /login; likely-scope=src/components/LoginForm.tsx; next-agent-prompt=Fix overflow clipping on .form-container at mobile viewports", "", "", - "", - "Before emitting RUN_COMPLETED, complete all of these steps:", - "1. Call accessibility_audit to check for WCAG violations. 
Report critical or serious violations as ASSERTION_FAILED steps.", - "2. Call performance_metrics to collect the performance trace. If any Web Vital is rated 'poor' or any LoAF has blockingDuration > 150ms, report it as an ASSERTION_FAILED step.", - "3. Run the project healthcheck: read package.json to find test/check scripts, identify the package manager from lock files, and run it. Report pass/fail as a step.", - "4. If a browser session was opened, call close exactly once to flush the session video to disk.", - "5. Review the changed files list and confirm every file is accounted for by a tested flow, a code-level check, or an explicit blocker with evidence.", - "Do not emit RUN_COMPLETED until all steps above are done.", - "", + ...buildRunCompletion(scopeTier), ].join("\n"); }; @@ -316,6 +478,7 @@ export const buildExecutionPrompt = (options: ExecutionPromptOptions): string => `Browser is headless: ${options.isHeadless ? "yes" : "no"}`, `Uses existing browser cookies: ${options.cookieBrowserKeys.length > 0 ? `yes (${options.cookieBrowserKeys.length})` : "no"}`, `Scope: ${options.scope}`, + `Scope tier: ${options.scopeTier}`, `Current branch: ${options.currentBranch}`, ...(options.mainBranch ? 
[`Main branch: ${options.mainBranch}`] : []), "", @@ -347,7 +510,7 @@ export const buildExecutionPrompt = (options: ExecutionPromptOptions): string => "", "", "", - ...getScopeStrategy(options.scope), + ...getScopeStrategy(options.scope, options.scopeTier), "", ].join("\n"); }; diff --git a/packages/shared/tests/prompts.test.ts b/packages/shared/tests/prompts.test.ts index bf381f77f..caff6b408 100644 --- a/packages/shared/tests/prompts.test.ts +++ b/packages/shared/tests/prompts.test.ts @@ -12,6 +12,7 @@ const makeDefaultOptions = ( ): ExecutionPromptOptions => ({ userInstruction: "Test the login flow", scope: "Changes", + scopeTier: "standard", currentBranch: "feat/login", mainBranch: "main", changedFiles: [ @@ -107,7 +108,7 @@ describe("buildExecutionPrompt", () => { }); it("includes scope strategy for branch scope", () => { - const prompt = buildExecutionPrompt(makeDefaultOptions({ scope: "Branch" })); + const prompt = buildExecutionPrompt(makeDefaultOptions({ scope: "Branch", scopeTier: "thorough" })); expect(prompt).toContain("branch-level review"); expect(prompt).toContain("5-8 total tested flows"); }); @@ -252,8 +253,8 @@ describe("buildExecutionPrompt", () => { expect(prompt).toContain("no user-visible surface"); }); - it("includes project healthcheck guidance in system prompt", () => { - const prompt = buildExecutionSystemPrompt(); + it("includes project healthcheck guidance in thorough system prompt", () => { + const prompt = buildExecutionSystemPrompt({ scopeTier: "thorough" }); expect(prompt).toContain("healthcheck"); expect(prompt).toContain("package.json"); expect(prompt).toContain("lock files"); @@ -297,8 +298,8 @@ describe("buildExecutionPrompt", () => { expect(prompt).toContain("Good: ASSERTION_FAILED|step-03|category=app-bug"); }); - it("includes UI quality rules section in system prompt", () => { - const prompt = buildExecutionSystemPrompt(); + it("includes UI quality rules section in thorough system prompt", () => { + const prompt = 
buildExecutionSystemPrompt({ scopeTier: "thorough" }); expect(prompt).toContain(""); expect(prompt).toContain("these checks are mandatory"); }); @@ -310,8 +311,8 @@ describe("buildExecutionPrompt", () => { expect(prompt).toContain("hardcoded hex/rgb colors"); }); - it("includes responsive viewport sizes with tablet breakpoints", () => { - const prompt = buildExecutionSystemPrompt(); + it("includes responsive viewport sizes with tablet breakpoints in thorough mode", () => { + const prompt = buildExecutionSystemPrompt({ scopeTier: "thorough" }); expect(prompt).toContain("Responsive design:"); expect(prompt).toContain("375\u00d7812 (iPhone SE)"); expect(prompt).toContain("390\u00d7844 (iPhone 14)"); @@ -321,35 +322,47 @@ describe("buildExecutionPrompt", () => { expect(prompt).toContain("setViewportSize"); }); - it("includes touch interaction testing rules", () => { - const prompt = buildExecutionSystemPrompt(); + it("includes only mobile and desktop viewports in standard mode", () => { + const prompt = buildExecutionSystemPrompt({ scopeTier: "standard" }); + expect(prompt).toContain("375\u00d7812 (mobile)"); + expect(prompt).toContain("1280\u00d7800 (desktop)"); + expect(prompt).not.toContain("iPad Mini"); + }); + + it("excludes UI quality rules in quick mode", () => { + const prompt = buildExecutionSystemPrompt({ scopeTier: "quick" }); + expect(prompt).not.toContain(""); + }); + + it("includes touch interaction testing rules in thorough mode", () => { + const prompt = buildExecutionSystemPrompt({ scopeTier: "thorough" }); expect(prompt).toContain("Touch interaction:"); expect(prompt).toContain("also complete via tap"); }); - it("includes cross-browser Safari/WebKit check", () => { - const prompt = buildExecutionSystemPrompt(); + it("includes cross-browser Safari/WebKit check in thorough mode", () => { + const prompt = buildExecutionSystemPrompt({ scopeTier: "thorough" }); expect(prompt).toContain("Cross-browser (Safari/WebKit):"); expect(prompt).toContain("flexbox 
gap"); expect(prompt).toContain("WebKit is unavailable"); }); - it("includes dark mode verification rules", () => { - const prompt = buildExecutionSystemPrompt(); + it("includes dark mode verification rules in thorough mode", () => { + const prompt = buildExecutionSystemPrompt({ scopeTier: "thorough" }); expect(prompt).toContain("Dark mode:"); expect(prompt).toContain("prefers-color-scheme"); expect(prompt).toContain("dark mode"); }); - it("includes layout stability (CLS) rules", () => { - const prompt = buildExecutionSystemPrompt(); + it("includes layout stability (CLS) rules in thorough mode", () => { + const prompt = buildExecutionSystemPrompt({ scopeTier: "thorough" }); expect(prompt).toContain("Layout stability (CLS):"); expect(prompt).toContain("layout shift"); expect(prompt).toContain("0.1"); }); - it("includes font loading verification rules", () => { - const prompt = buildExecutionSystemPrompt(); + it("includes font loading verification rules in thorough mode", () => { + const prompt = buildExecutionSystemPrompt({ scopeTier: "thorough" }); expect(prompt).toContain("Font loading:"); expect(prompt).toContain("document.fonts"); expect(prompt).toContain("@font-face"); diff --git a/packages/supervisor/src/constants.ts b/packages/supervisor/src/constants.ts index 595082821..09786c47a 100644 --- a/packages/supervisor/src/constants.ts +++ b/packages/supervisor/src/constants.ts @@ -8,6 +8,8 @@ export const PR_LIMIT = 100; export const EXECUTION_CONTEXT_FILE_LIMIT = 12; export const EXECUTION_RECENT_COMMIT_LIMIT = 5; export const EXPECT_STATE_DIR = ".expect"; +export const EXPECT_RUNS_DIR = "runs"; +export const EXPECT_RUNS_MAX_KEPT = 20; export const EXPECT_REPLAY_OUTPUT_ENV_NAME = "EXPECT_REPLAY_OUTPUT_PATH"; export const TESTED_FINGERPRINT_FILE = "last-tested"; diff --git a/packages/supervisor/src/executor.ts b/packages/supervisor/src/executor.ts index 1ef0437a9..2d6733a80 100644 --- a/packages/supervisor/src/executor.ts +++ b/packages/supervisor/src/executor.ts 
@@ -25,6 +25,7 @@ import { buildExecutionSystemPrompt, type DevServerHint, } from "@expect/shared/prompts"; +import type { ScopeTier } from "@expect/shared/models"; import * as NodeServices from "@effect/platform-node/NodeServices"; import { Git } from "./git/git"; import { @@ -59,6 +60,7 @@ export interface ExecuteOptions { readonly instruction: string; readonly isHeadless: boolean; readonly cookieBrowserKeys: readonly string[]; + readonly scopeTier?: ScopeTier; readonly baseUrl?: string; readonly savedFlow?: SavedFlow; readonly learnings?: string; @@ -132,11 +134,14 @@ export class Executor extends ServiceMap.Service()("@supervisor/Execut const context = yield* gatherContext(options.changesFor); - const systemPrompt = buildExecutionSystemPrompt(); + const scopeTier = options.scopeTier ?? "standard"; + + const systemPrompt = buildExecutionSystemPrompt({ scopeTier }); const prompt = buildExecutionPrompt({ userInstruction: options.instruction, scope: options.changesFor._tag, + scopeTier, currentBranch: context.currentBranch, mainBranch: context.mainBranch, changedFiles: context.changedFiles, From 95ce694e2ce4f97ea5981f6ddadd8dc83fbcf872 Mon Sep 17 00:00:00 2001 From: Nisarg Patel Date: Fri, 3 Apr 2026 00:12:09 -0700 Subject: [PATCH 2/2] chore: update version to 2.3.0 and enhance documentation for scope tiers and execution results - Bumped version from 2.2.0 to 2.3.0. - Added detailed documentation on scope tiers for testing depth: quick, standard, and thorough. - Included instructions for reading structured run result files from `.expect/runs/{planId}.json`. 
--- packages/expect-skill/SKILL.md | 79 +++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 11 deletions(-) diff --git a/packages/expect-skill/SKILL.md b/packages/expect-skill/SKILL.md index f3757f958..dc6a109ff 100644 --- a/packages/expect-skill/SKILL.md +++ b/packages/expect-skill/SKILL.md @@ -4,7 +4,7 @@ description: "Use when editing .tsx/.jsx/.css/.html, React components, pages, ro license: MIT metadata: author: millionco - version: "2.2.0" + version: "2.3.0" --- # Expect @@ -41,6 +41,39 @@ expect-cli -m "[INSTRUCTION] on [URL]" -y --cookies - Include the URL of the app in the instruction - Accessibility and performance are checked automatically. Do not mention them in your instruction. +### Scope Tiers + +Control test depth with `--scope`: + +| Tier | Flag | What it does | When to use | +| ------------ | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------- | +| **quick** | `--scope quick` | One URL, one verification, ~30 seconds. No follow-up flows, no viewport matrix, no a11y/perf audit. | Verifying a specific bug fix, checking a single CSS change, fast iteration loops | +| **standard** | (default) | Primary flow + 1-2 follow-ups, mobile+desktop responsive check, a11y audit. | Normal development — the right balance of speed and coverage | +| **thorough** | `--scope thorough` | Full audit: 7 viewports, WebKit cross-browser, dark mode, CLS, font loading, perf metrics, project healthcheck, 2-8 follow-up flows. 
| Pre-merge branch reviews, design system migrations, release candidates | + +**Use `--scope quick` for fast iteration.** When you just fixed one thing and want to confirm it works, don't run the full test suite: + +```bash +expect-cli -m "Verify the submit button is visible at mobile viewport on http://localhost:3000/login" -y --cookies --scope quick +``` + +**Use `--scope thorough` for branch reviews:** + +```bash +expect-cli --target branch --scope thorough -y --cookies +``` + +### Run Result Files + +Every run writes structured results to `.expect/runs/{planId}.json`. Each run gets a unique UUID filename, so parallel agents don't conflict. Read these files instead of polling terminal output: + +```bash +# Read the latest run result +cat "$(ls -t .expect/runs/*.json | head -n 1)" +``` + +The JSON contains: `status`, `title`, `duration_ms`, `steps[]` (with per-step status/duration/errors), `artifacts` (video, replay, screenshots), and `summary`. + ## Parallel Execution `expect-cli` takes 1-30 minutes. Never block your main thread. @@ -48,21 +81,43 @@ expect-cli -m "[INSTRUCTION] on [URL]" -y --cookies 1. **Set shell timeout to at least 1800 seconds** — the default will kill it 2. **Launch each `expect-cli` call in a subagent** (Task/Agent tool or background shell with timeout 0 / `&`) and continue working 3. **When testing multiple features, run separate invocations concurrently** - one subagent per feature +4. **Read results from `.expect/runs/*.json`** instead of polling terminal output — each run writes a unique file on completion Do not skip parallel execution because "it's simpler to wait." Do not skip subagent usage because "it's just one test." ## Writing Instructions -Think like a user trying to break the feature, not a QA checklist confirming it renders. +**Be specific about what changed, not broad about everything on the page.** The highest-value tests are: one URL, one specific behavior, verified fast.
Broad instructions that test unrelated features waste 3-5 minutes per run. -**Bad:** `expect-cli -m "Check that the login form renders on http://localhost:5173" -y --cookies` -**Good:** `expect-cli -m "Submit the login form empty, with invalid email, with wrong password, and with valid credentials. Verify error messages, redirect on success, and console errors on http://localhost:5173" -y --cookies` +**Bad — too vague, tests unrelated features:** -**Bad:** `expect-cli -m "Verify the settings page works on http://localhost:5173/settings" -y --cookies` -**Good:** `expect-cli -m "Change display name to empty string, to a 500-char string, and to a valid name. Toggle every switch off then on. Hit save without changes. Verify toasts, validation errors, and that refreshing persists the update on http://localhost:5173/settings" -y --cookies` +``` +expect-cli -m "Check that the login form renders on http://localhost:5173" -y --cookies +``` + +**Good — focused on the actual change:** + +``` +expect-cli -m "Submit the login form with invalid email and verify the error message says 'Invalid email format' on http://localhost:5173/login" -y --cookies --scope quick +``` + +**Bad — tests everything on the page instead of the change:** + +``` +expect-cli -m "Verify the settings page works on http://localhost:5173/settings" -y --cookies +``` -**Bad:** `expect-cli -m "Test the search feature on http://localhost:5173" -y --cookies` -**Good:** `expect-cli -m "Search with no query, a single character, a query with no results, and a valid query. Click a result, go back, verify the previous query is preserved. 
Rapid-fire 5 searches and confirm no stale results appear on http://localhost:5173" -y --cookies` +**Good — targets the specific behavior that changed:** + +``` +expect-cli -m "Change display name to a 500-char string, hit save, refresh, and verify it persisted with truncation on http://localhost:5173/settings" -y --cookies +``` + +**For thorough coverage** (branch reviews, pre-merge), use `--scope thorough` and let the tool decide what to test from the diff: + +``` +expect-cli --target branch --scope thorough -m "Test all changed components" -y --cookies +``` ## Reference Skills @@ -116,9 +171,11 @@ Do not guess at workarounds for tool bugs. Diagnose with `--verbose`, report wit You MUST complete every step. A skipped step is a failed task. -1. Run `expect-cli -m "[instruction] on [URL]" -y --cookies` with a fresh, adversarial instruction -2. Read the full output — check exit code, count failures, read accessibility and performance results -3. If ANY failure: fix it, then re-run from step 1. Accessibility violations and performance regressions are failures. +1. Run `expect-cli -m "[instruction] on [URL]" -y --cookies` with a focused instruction targeting the change +2. Read the full output or `.expect/runs/*.json` — check exit code, count failures, read accessibility results +3. If ANY failure: fix it, then re-run with `--scope quick` targeting the specific fix 4. Only after exit 0 with 0 failures: state the claim WITH the evidence +Use `--scope quick` for fix-and-verify loops to keep iteration fast. Use default scope for initial verification. + Skip any step = lying, not verifying. No exceptions for "just this once", "it's simple enough", or "I already checked manually".