diff --git a/bench/bench-configure-readiness.ts b/bench/bench-configure-readiness.ts new file mode 100644 index 00000000..f626cdf1 --- /dev/null +++ b/bench/bench-configure-readiness.ts @@ -0,0 +1,365 @@ +#!/usr/bin/env bun + +/* +Benchmark the readiness contract of POST /configure on the kernel-images +container by: + 1. Starting one of the kernel-images-headless docker images. + 2. Waiting for the initial chromium to be fully CDP-responsive. + 3. Issuing a single POST /configure with both viewport (1280x800@60) and + --kiosk chromium_flags so the VM has to stop chromium and start it + back up (the "stop/start path" inside ChromiumConfigure). + 4. Concurrently probing: + (a) when /json/version starts reporting a NEW browser UUID + (i.e. the new chromium has emitted "DevTools listening on ...") + (b) when a real Browser.getVersion CDP round-trip succeeds against + that new UUID (i.e. chromium is actually serving CDP). + and timing the gap between the api returning 200 and (a)/(b) being + true. + +This isolates the bug fixed in fix/chromium-readiness-false-positive: +the BEFORE image returns 200 several seconds before chromium can actually +serve requests; the AFTER image holds 200 until chromium is real. + +Usage: + bun bench/bench-configure-readiness.ts \ + --image kernel-images-headless:before --label before + bun bench/bench-configure-readiness.ts \ + --image kernel-images-headless:after --label after +*/ + +interface Args { + image: string; + label: string; + apiPort: number; + devToolsPort: number; + containerName: string; + iterations: number; +} + +function parseArgs(): Args { + const argv = process.argv.slice(2); + const get = (flag: string, def?: string) => { + const i = argv.indexOf(flag); + if (i === -1) return def; + return argv[i + 1]; + }; + const image = get('--image'); + if (!image) { + console.error('--image is required'); + process.exit(1); + } + return { + image, + label: get('--label', image) ?? image, + apiPort: Number(get('--api-port', '19001')), + devToolsPort: Number(get('--devtools-port', '19222')), + containerName: get('--name', `kibench-${Date.now()}`)!, + iterations: Number(get('--iterations', '3')), + }; +} + +const args = parseArgs(); + +async function sh(cmd: string[], opts?: { quiet?: boolean }): Promise { + const proc = Bun.spawn(cmd, { + stdout: 'pipe', + stderr: opts?.quiet ? 'ignore' : 'pipe', + }); + const out = await new Response(proc.stdout).text(); + await proc.exited; + if (proc.exitCode !== 0) { + throw new Error(`${cmd.join(' ')} exited ${proc.exitCode}: ${out}`); + } + return out; +} + +async function dockerRm(name: string): Promise { + try { + await sh(['docker', 'rm', '-f', name], { quiet: true }); + } catch { + // ignore + } +} + +async function startContainer(): Promise { + await dockerRm(args.containerName); + console.log(`[start] docker run ${args.image}`); + // Detached run; we'll exec on it later if we need anything from inside. + await sh([ + 'docker', + 'run', + '-d', + '--rm', + '--name', + args.containerName, + '--platform', + 'linux/amd64', + '--privileged', + '--tmpfs', + '/dev/shm:size=2g', + '-p', + `${args.devToolsPort}:9222`, + '-p', + `${args.apiPort}:10001`, + args.image, + ]); +} + +async function getBrowserUUID(): Promise { + try { + const res = await fetch(`http://localhost:${args.devToolsPort}/json/version`, { + signal: AbortSignal.timeout(1500), + }); + if (!res.ok) return null; + const data = (await res.json()) as { webSocketDebuggerUrl?: string }; + const m = data.webSocketDebuggerUrl?.match(/\/devtools\/browser\/([^/?]+)/); + return m?.[1] ?? null; + } catch { + return null; + } +} + +async function browserGetVersionCDP(uuid: string, timeoutMs = 2000): Promise { + // Send a single Browser.getVersion message on a fresh WS. + return new Promise((resolve) => { + let settled = false; + const settle = (ok: boolean) => { + if (!settled) { + settled = true; + try { + ws.close(); + } catch { + // ignore + } + resolve(ok); + } + }; + let ws: WebSocket; + try { + ws = new WebSocket(`ws://localhost:${args.devToolsPort}/devtools/browser/${uuid}`); + } catch { + resolve(false); + return; + } + const t = setTimeout(() => settle(false), timeoutMs); + ws.addEventListener('open', () => { + try { + ws.send(JSON.stringify({ id: 1, method: 'Browser.getVersion' })); + } catch { + clearTimeout(t); + settle(false); + } + }); + ws.addEventListener('message', (ev) => { + try { + const msg = JSON.parse(String(ev.data)); + if (msg.id === 1 && msg.result?.product) { + clearTimeout(t); + settle(true); + } + } catch { + // ignore + } + }); + ws.addEventListener('error', () => { + clearTimeout(t); + settle(false); + }); + ws.addEventListener('close', () => { + clearTimeout(t); + settle(false); + }); + }); +} + +async function waitUntil( + fn: () => Promise, + timeoutMs: number, + intervalMs = 100, +): Promise<{ ok: boolean; ms: number; value: boolean | string | null }> { + const t0 = Date.now(); + while (Date.now() - t0 < timeoutMs) { + const v = await fn(); + if (v) { + return { ok: true, ms: Date.now() - t0, value: v }; + } + await new Promise((r) => setTimeout(r, intervalMs)); + } + return { ok: false, ms: Date.now() - t0, value: null }; +} + +function makeConfigureBody(width: number, height: number, refresh: number): { body: Blob; contentType: string } { + const fd = new FormData(); + fd.append('display', JSON.stringify({ width, height, refresh_rate: refresh, restart_chromium: true, require_idle: false })); + fd.append('chromium_flags', JSON.stringify({ flags: ['--kiosk'] })); + // Bun's FormData via Request handles boundary automatically; we capture both + // body and content-type via a dummy Request. + const req = new Request('http://x/', { method: 'POST', body: fd }); + return { body: req.body as unknown as Blob, contentType: req.headers.get('content-type')! }; +} + +async function postConfigure(): Promise<{ status: number; ms: number; body: string }> { + const { body, contentType } = makeConfigureBody(1280, 800, 60); + const t0 = Date.now(); + let res: Response; + try { + res = await fetch(`http://localhost:${args.apiPort}/configure`, { + method: 'POST', + headers: { 'content-type': contentType }, + body, + signal: AbortSignal.timeout(60_000), + }); + } catch (e) { + return { status: -1, ms: Date.now() - t0, body: `fetch error: ${(e as Error).message}` }; + } + const ms = Date.now() - t0; + const txt = await res.text(); + return { status: res.status, ms, body: txt }; +} + +interface Sample { + iter: number; + initialUUID: string; + configureMs: number; + configureStatus: number; + newUUIDMs: number | null; + newUUID: string | null; + cdpReadyMs: number | null; + gapMs: number | null; // cdpReadyMs - configureMs +} + +async function runOneIteration(iter: number): Promise { + // Wait for the initial chromium to be fully ready (UUID present + CDP responsive). + console.log(`[iter ${iter}] waiting for initial chromium ready...`); + const initialResult = await waitUntil( + async () => { + const u = await getBrowserUUID(); + if (!u) return null; + const ok = await browserGetVersionCDP(u, 1500); + return ok ? u : null; + }, + 90_000, + 250, + ); + if (!initialResult.ok || typeof initialResult.value !== 'string') { + throw new Error('initial chromium never became ready'); + } + const initialUUID = initialResult.value; + console.log(`[iter ${iter}] initial chromium ready in ${initialResult.ms}ms uuid=${initialUUID.slice(0, 8)}…`); + + // Fire POST /configure and concurrently start probing for the new UUID. + console.log(`[iter ${iter}] POST /configure (viewport 1280x800@60 + --kiosk)...`); + const configureStart = Date.now(); + const configurePromise = postConfigure(); + + // Poll fast for the new UUID to appear on /json/version. + let newUUID: string | null = null; + let newUUIDMs: number | null = null; + const newUUIDProbe = (async () => { + while (Date.now() - configureStart < 60_000) { + const u = await getBrowserUUID(); + if (u && u !== initialUUID) { + newUUID = u; + newUUIDMs = Date.now() - configureStart; + return; + } + await new Promise((r) => setTimeout(r, 25)); + } + })(); + + // Wait for /configure to return. + const configResult = await configurePromise; + const configureMs = configResult.ms; + console.log( + `[iter ${iter}] /configure returned ${configResult.status} in ${configureMs}ms${configResult.status !== 200 ? ` body=${configResult.body.slice(0, 200)}` : ''}`, + ); + + // Wait for the new UUID probe to land (or time out). + await newUUIDProbe; + if (newUUID === null) { + return { + iter, + initialUUID, + configureMs, + configureStatus: configResult.status, + newUUIDMs: null, + newUUID: null, + cdpReadyMs: null, + gapMs: null, + }; + } + console.log(`[iter ${iter}] new UUID observed at +${newUUIDMs}ms uuid=${newUUID!.slice(0, 8)}…`); + + // Now wait for a real CDP round-trip to succeed against the new UUID. + let cdpReadyMs: number | null = null; + const cdpDeadline = configureStart + 60_000; + while (Date.now() < cdpDeadline) { + const ok = await browserGetVersionCDP(newUUID!, 1500); + if (ok) { + cdpReadyMs = Date.now() - configureStart; + break; + } + await new Promise((r) => setTimeout(r, 25)); + } + console.log(`[iter ${iter}] Browser.getVersion succeeded at +${cdpReadyMs}ms`); + + const gapMs = cdpReadyMs !== null ? cdpReadyMs - configureMs : null; + return { + iter, + initialUUID, + configureMs, + configureStatus: configResult.status, + newUUIDMs, + newUUID, + cdpReadyMs, + gapMs, + }; +} + +async function main(): Promise { + console.log(`=== benchmark ${args.label} (image=${args.image}) ===`); + console.log(`apiPort=${args.apiPort} devToolsPort=${args.devToolsPort} iters=${args.iterations}`); + + await startContainer(); + + const samples: Sample[] = []; + try { + for (let i = 1; i <= args.iterations; i++) { + const s = await runOneIteration(i); + samples.push(s); + } + } finally { + console.log('\n[cleanup] tearing down container'); + await dockerRm(args.containerName); + } + + console.log(`\n=== ${args.label} summary ===`); + console.log( + `${'iter'.padEnd(5)}${'configMs'.padEnd(11)}${'newUUID@'.padEnd(11)}${'cdpReady@'.padEnd(11)}${'GAP'.padEnd(8)}`, + ); + console.log('-'.repeat(46)); + for (const s of samples) { + console.log( + String(s.iter).padEnd(5) + + String(s.configureMs).padEnd(11) + + String(s.newUUIDMs ?? '?').padEnd(11) + + String(s.cdpReadyMs ?? '?').padEnd(11) + + String(s.gapMs ?? '?').padEnd(8), + ); + } + console.log(); + const gaps = samples.map((s) => s.gapMs).filter((v): v is number => v !== null); + if (gaps.length) { + const sum = gaps.reduce((a, b) => a + b, 0); + const min = Math.min(...gaps); + const max = Math.max(...gaps); + const sorted = [...gaps].sort((a, b) => a - b); + const median = sorted[Math.floor(sorted.length / 2)]!; + console.log(`GAP stats (ms): n=${gaps.length} min=${min} median=${median} mean=${Math.round(sum / gaps.length)} max=${max}`); + console.log('(GAP = cdpReadyMs − configureMs. Positive = api returned 200 before chromium was really ready.)'); + } else { + console.log('no gap measurements collected'); + } +} + +void main(); diff --git a/server/cmd/api/api/chromium.go b/server/cmd/api/api/chromium.go index 772ba1e9..bd4c4c7a 100644 --- a/server/cmd/api/api/chromium.go +++ b/server/cmd/api/api/chromium.go @@ -401,8 +401,16 @@ func waitChromiumSupervisorStatus(ctx context.Context, want string, timeout time } // stopChromium runs supervisorctl stop chromium and waits for the command to complete. +// +// On success it emits a "chromium stopped" info log with both string (`elapsed`) +// and numeric (`elapsed_ms`) duration attributes, plus an `outcome` field that +// distinguishes the success-after-error recovery paths from the canonical +// success path. The numeric attribute is what aggregations like +// p99(elapsed_ms) in SigNoz key off; the string is kept for parity with the +// existing `startChromiumAndWait` "devtools ready" log shape. func (s *ApiService) stopChromium(ctx context.Context) error { log := logger.FromContext(ctx) + start := time.Now() cmdCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 2*time.Minute) defer cancel() log.Info("stopping chromium via supervisorctl") @@ -413,24 +421,69 @@ func (s *ApiService) stopChromium(ctx context.Context) error { if statusErr == nil { switch status { case "STOPPED": - log.Info("chromium already stopped after supervisorctl stop error", "status", statusOut) + elapsed := time.Since(start) + log.Info("chromium stopped", + "outcome", "already_stopped", + "elapsed", elapsed.String(), + "elapsed_ms", elapsed.Milliseconds(), + "status", statusOut, + ) return nil case "STOPPING": if stoppedOut, waitErr := waitChromiumSupervisorStatus(ctx, "STOPPED", 30*time.Second); waitErr == nil { - log.Info("chromium reached stopped after supervisorctl stop error", "status", stoppedOut) + elapsed := time.Since(start) + log.Info("chromium stopped", + "outcome", "stopping_then_stopped", + "elapsed", elapsed.String(), + "elapsed_ms", elapsed.Milliseconds(), + "status", stoppedOut, + ) return nil } } } return fmt.Errorf("supervisorctl stop chromium failed: %w", err) } + confirmed := true if stoppedOut, waitErr := waitChromiumSupervisorStatus(ctx, "STOPPED", 30*time.Second); waitErr != nil { + confirmed = false log.Warn("chromium stop command completed but stopped status was not confirmed", "error", waitErr, "status", stoppedOut) } + elapsed := time.Since(start) + outcome := "success" + if !confirmed { + outcome = "success_unconfirmed" + } + log.Info("chromium stopped", + "outcome", outcome, + "elapsed", elapsed.String(), + "elapsed_ms", elapsed.Milliseconds(), + ) return nil } -// startChromiumAndWait launches chromium via supervisorctl start and waits for DevTools readiness. +// startChromiumAndWait launches chromium via supervisorctl start and waits +// for DevTools to actually serve the new Chromium browser. +// +// Readiness is gated on two independent signals being satisfied together: +// +// 1. The UpstreamManager has observed a "DevTools listening on ws://..." log +// line whose URL differs from the URL we saw on entry (prevUpstream). +// UpstreamManager only updates its current URL after the new Chromium +// prints that line, so a new URL is the earliest reliable evidence that +// the new Chromium has bound its DevTools listener. +// 2. After dialing that new URL, a Browser.getVersion CDP round-trip +// succeeds. This rules out two failure modes that a bare websocket.Dial +// does not: a dial completing against a half-closed socket from the +// just-killed previous Chromium, or against a freshly bound TCP +// listener that has not yet wired up CDP routes. Either case can +// otherwise produce a false "ready" return, after which the new +// Chromium can take several more seconds to actually serve requests +// (and live view will appear blank during that window). +// +// We intentionally do NOT short-circuit on the supervisorctl start command +// returning ("doneCh") -- that command returns as soon as supervisord ack's +// the fork, long before the new Chromium has bound any ports. func (s *ApiService) startChromiumAndWait(ctx context.Context, operation string) error { log := logger.FromContext(ctx) start := time.Now() @@ -458,12 +511,8 @@ func (s *ApiService) startChromiumAndWait(ctx context.Context, operation string) ticker := time.NewTicker(250 * time.Millisecond) defer ticker.Stop() - commandDone := false - tryReady := func(upstream string, allowCurrent bool) bool { - if upstream == "" { - return false - } - if !allowCurrent && upstream == prevUpstream { + tryReady := func(upstream string) bool { + if upstream == "" || upstream == prevUpstream { return false } dialCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 2*time.Second) @@ -472,28 +521,32 @@ func (s *ApiService) startChromiumAndWait(ctx context.Context, operation string) if err != nil { return false } - _ = c.Close() + defer c.Close() + if _, err := c.GetBrowserVersion(dialCtx); err != nil { + log.Debug("dial succeeded but Browser.getVersion failed; ignoring", "operation", operation, "url", upstream, "err", err) + return false + } return true } for { select { case upstream, ok := <-updates: - if ok && tryReady(upstream, false) { + if ok && tryReady(upstream) { log.Info("devtools ready", "operation", operation, "elapsed", time.Since(start).String()) return nil } case err := <-errCh: return err case <-doneCh: - commandDone = true + // supervisorctl start returned; the new chromium has been forked + // but its DevTools listener may not be bound yet. Do not try + // ready against the current upstream here -- it may still be the + // previous chromium's URL. Continue waiting for either the + // updates channel or the ticker to pick up the new URL. doneCh = nil - if tryReady(s.upstreamMgr.Current(), true) { - log.Info("devtools ready", "operation", operation, "elapsed", time.Since(start).String()) - return nil - } case <-ticker.C: - if commandDone && tryReady(s.upstreamMgr.Current(), true) { + if tryReady(s.upstreamMgr.Current()) { log.Info("devtools ready", "operation", operation, "elapsed", time.Since(start).String()) return nil } diff --git a/server/lib/cdpclient/cdpclient.go b/server/lib/cdpclient/cdpclient.go index a33b6319..8d2c3c2e 100644 --- a/server/lib/cdpclient/cdpclient.go +++ b/server/lib/cdpclient/cdpclient.go @@ -99,6 +99,40 @@ func (c *Client) send(ctx context.Context, method string, params any, sessionID } } +// BrowserVersion is the result of a Browser.getVersion CDP call. +// +// We use this struct only to confirm a successful round-trip; callers that +// just need a liveness probe can ignore the fields. The protocol-version +// fields are populated for convenience. +type BrowserVersion struct { + ProtocolVersion string `json:"protocolVersion"` + Product string `json:"product"` + Revision string `json:"revision"` + UserAgent string `json:"userAgent"` + JsVersion string `json:"jsVersion"` +} + +// GetBrowserVersion sends Browser.getVersion on the browser-level DevTools +// endpoint. It is a cheap CDP round-trip that proves the WebSocket is +// connected to a live, CDP-responsive Chromium browser process. +// +// Callers should use this after Dial as a readiness gate: a successful +// websocket.Dial alone is not enough because a dial can complete against +// a half-open socket of a killed Chromium, or against a freshly bound TCP +// listener of a Chromium that has not yet wired up its CDP routes. A +// Browser.getVersion round-trip rules out both cases. +func (c *Client) GetBrowserVersion(ctx context.Context) (*BrowserVersion, error) { + raw, err := c.send(ctx, "Browser.getVersion", nil, "") + if err != nil { + return nil, fmt.Errorf("Browser.getVersion: %w", err) + } + var v BrowserVersion + if err := json.Unmarshal(raw, &v); err != nil { + return nil, fmt.Errorf("unmarshal Browser.getVersion: %w", err) + } + return &v, nil +} + // DispatchStartURL closes extra page targets and dispatches a navigation on the // first page target. It does not wait for lifecycle events; Chrome owns the // eventual navigation result. diff --git a/server/lib/cdpclient/cdpclient_test.go b/server/lib/cdpclient/cdpclient_test.go index d3a4d7d9..f44d675f 100644 --- a/server/lib/cdpclient/cdpclient_test.go +++ b/server/lib/cdpclient/cdpclient_test.go @@ -13,9 +13,8 @@ import ( "github.com/stretchr/testify/require" ) -// fakeCDP is a minimal CDP server that responds to the three commands used by -// SetDeviceMetricsOverride: Target.getTargets, Target.attachToTarget, -// Emulation.setDeviceMetricsOverride, and Target.detachFromTarget. +// fakeCDP is a minimal CDP server that responds to the commands used by +// SetDeviceMetricsOverride and GetBrowserVersion. type fakeCDP struct { getTargetsCalled bool attachCalled bool @@ -28,6 +27,9 @@ type fakeCDP struct { failGetTargets bool failSetMetrics bool returnNoPageTargets bool + getVersionCalled bool + failGetVersion bool + productResponse string } func (f *fakeCDP) handler(w http.ResponseWriter, r *http.Request) { @@ -84,6 +86,23 @@ func (f *fakeCDP) handler(w http.ResponseWriter, r *http.Request) { case "Target.detachFromTarget": f.detachCalled = true result = map[string]any{} + case "Browser.getVersion": + f.getVersionCalled = true + if f.failGetVersion { + cdpErr = &cdpError{Code: -3, Message: "version error"} + } else { + product := f.productResponse + if product == "" { + product = "HeadlessChrome/test" + } + result = map[string]any{ + "protocolVersion": "1.3", + "product": product, + "revision": "@deadbeef", + "userAgent": "Mozilla/5.0 fake", + "jsVersion": "1.2.3", + } + } } resp := map[string]any{"id": req.ID} @@ -201,3 +220,36 @@ func TestDial(t *testing.T) { require.Error(t, err) }) } + +func TestGetBrowserVersion(t *testing.T) { + t.Run("happy path", func(t *testing.T) { + f := &fakeCDP{productResponse: "HeadlessChrome/145.0.0.0"} + url := startFakeCDP(t, f) + + ctx := context.Background() + client, err := Dial(ctx, url) + require.NoError(t, err) + defer client.Close() + + v, err := client.GetBrowserVersion(ctx) + require.NoError(t, err) + require.NotNil(t, v) + assert.True(t, f.getVersionCalled) + assert.Equal(t, "HeadlessChrome/145.0.0.0", v.Product) + assert.Equal(t, "1.3", v.ProtocolVersion) + }) + + t.Run("CDP error from chromium", func(t *testing.T) { + f := &fakeCDP{failGetVersion: true} + url := startFakeCDP(t, f) + + ctx := context.Background() + client, err := Dial(ctx, url) + require.NoError(t, err) + defer client.Close() + + _, err = client.GetBrowserVersion(ctx) + require.Error(t, err) + assert.Contains(t, err.Error(), "Browser.getVersion") + }) +}