From 620c9549368aa1d13c49eebd26323c038dc3b959 Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Mon, 13 Apr 2026 18:46:19 -0700 Subject: [PATCH 01/13] updates --- .github/skills/chat-perf/SKILL.md | 140 ++++ .gitignore | 1 + package.json | 2 + scripts/chat-perf/common/mock-llm-server.js | 652 ++++++++++++++++++ scripts/chat-perf/common/utils.js | 511 ++++++++++++++ scripts/chat-perf/test-chat-mem-leaks.js | 229 ++++++ .../chat-perf/test-chat-perf-regression.js | 598 ++++++++++++++++ 7 files changed, 2133 insertions(+) create mode 100644 .github/skills/chat-perf/SKILL.md create mode 100644 scripts/chat-perf/common/mock-llm-server.js create mode 100644 scripts/chat-perf/common/utils.js create mode 100644 scripts/chat-perf/test-chat-mem-leaks.js create mode 100644 scripts/chat-perf/test-chat-perf-regression.js diff --git a/.github/skills/chat-perf/SKILL.md b/.github/skills/chat-perf/SKILL.md new file mode 100644 index 0000000000000..fd45b8b6b2a7c --- /dev/null +++ b/.github/skills/chat-perf/SKILL.md @@ -0,0 +1,140 @@ +# Chat Performance Testing + +Run chat perf benchmarks and memory leak checks against the local dev build or any published VS Code version. Use when investigating chat rendering regressions, validating perf-sensitive changes to chat UI, or checking for memory leaks in the chat response pipeline. 
+ +## When to use + +- Before/after modifying chat rendering code (`chatListRenderer.ts`, `chatInputPart.ts`, markdown rendering) +- When changing the streaming response pipeline or SSE processing +- When modifying disposable/lifecycle patterns in chat components +- To compare performance between two VS Code releases +- In CI to gate PRs that touch chat UI code + +## Quick start + +```bash +# Run perf regression test (compares local dev build vs VS Code 1.115.0): +npm run perf:chat -- --scenario text-only --runs 3 + +# Run all scenarios with no baseline (just measure): +npm run perf:chat -- --no-baseline --runs 3 + +# Run memory leak check (10 messages in one session): +npm run perf:chat-leak + +# Run leak check with more messages for accuracy: +npm run perf:chat-leak -- --messages 20 --verbose +``` + +## Perf regression test + +**Script:** `scripts/chat-perf/test-chat-perf-regression.js` +**npm:** `npm run perf:chat` + +Launches VS Code via Playwright Electron, opens the chat panel, sends a message with a mock LLM response, and measures timing, layout, and rendering metrics. By default, downloads VS Code 1.115.0 as a baseline, benchmarks it, then benchmarks the local dev build and compares. + +### Key flags + +| Flag | Default | Description | +|---|---|---| +| `--runs ` | `5` | Runs per scenario. More = more stable. Use 5+ for CI. | +| `--scenario ` | all | Scenario to test (repeatable). See scenarios below. | +| `--build ` | local dev | Build to test. Accepts path or version (`1.110.0`, `insiders`). | +| `--baseline-build ` | `1.115.0` | Version to download and compare against. | +| `--no-baseline` | — | Skip baseline comparison entirely. | +| `--threshold ` | `0.2` | Regression threshold (0.2 = flag if 20% slower). | +| `--verbose` | — | Print per-run details including response content. 
| + +### Comparing two remote builds + +```bash +# Compare 1.110.0 against 1.115.0 (no local build needed): +npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0 --runs 5 +``` + +### Exit codes + +- `0` — all metrics within threshold +- `1` — regression detected or runs failed + +### Scenarios + +| ID | What it stresses | +|---|---| +| `text-only` | Baseline — plain text response | +| `large-codeblock` | Single TypeScript block with syntax highlighting | +| `many-codeblocks` | 10 fenced code blocks (~600 lines) | +| `many-small-chunks` | 200 small SSE chunks | +| `mixed-content` | Markdown with headers, code blocks, prose | +| `long-prose` | ~3000 words across 15 sections | +| `rich-markdown` | Nested lists, bold, italic, links, blockquotes | +| `giant-codeblock` | Single 200-line TypeScript block | +| `rapid-stream` | 1000 tiny SSE chunks | +| `file-links` | 32 file URI references with line anchors | + +### Metrics collected + +- **Timing:** time to first token, time to complete (prefers internal `code/chat/*` perf marks, falls back to client-side measurement) +- **Rendering:** layout count, style recalculation count, forced reflows, long tasks (>50ms) +- **Memory:** heap before/after (informational, noisy for single requests) + +### Statistics + +Results use **IQR-based outlier removal** and **median** (not mean) to handle startup jitter. The **coefficient of variation (cv)** is reported — under 15% is stable, over 15% gets a ⚠ warning. Use 5+ runs to get stable results. + +## Memory leak check + +**Script:** `scripts/chat-perf/test-chat-mem-leaks.js` +**npm:** `npm run perf:chat-leak` + +Launches one VS Code session, sends N messages sequentially, forces GC between each, and measures renderer heap and DOM node count. Uses **linear regression** on the samples to compute per-message growth rate, which is compared against a threshold. + +### Key flags + +| Flag | Default | Description | +|---|---|---| +| `--messages ` | `10` | Number of messages to send. 
More = more accurate slope. | +| `--build ` | local dev | Build to test. | +| `--threshold ` | `2` | Max per-message heap growth in MB. | +| `--verbose` | — | Print per-message heap/DOM counts. | + +### What it measures + +- **Heap growth slope** (MB/message) — linear regression over forced-GC heap samples. A leak shows as sustained positive slope. +- **DOM node growth** (nodes/message) — catches rendering leaks where elements aren't cleaned up. Healthy chat virtualizes old messages so node count plateaus. + +### Interpreting results + +- `0.3–1.0 MB/msg` — normal (V8 internal overhead, string interning) +- `>2.0 MB/msg` — likely leak, investigate retained objects +- DOM nodes stable after first message — normal (chat list virtualization working) +- DOM nodes growing linearly — rendering leak, check disposable cleanup + +## Architecture + +``` +scripts/chat-perf/ +├── common/ +│ ├── mock-llm-server.js # Mock CAPI server matching @vscode/copilot-api URL structure +│ └── utils.js # Shared: paths, env setup, stats, launch helpers +├── test-chat-perf-regression.js +└── test-chat-mem-leaks.js +``` + +### Mock server + +The mock LLM server (`common/mock-llm-server.js`) implements the full CAPI URL structure from `@vscode/copilot-api`'s `DomainService`: + +- `GET /models` — returns model metadata +- `POST /models/session` — returns `AutoModeAPIResponse` with `available_models` and `session_token` +- `POST /models/session/intent` — model router +- `POST /chat/completions` — SSE streaming response matching the scenario +- Agent, session, telemetry, and token endpoints + +The copilot extension connects to this server via `IS_SCENARIO_AUTOMATION=1` mode with `overrideCapiUrl` and `overrideProxyUrl` settings. The `vscode-api-tests` extension must be disabled (`--disable-extension=vscode.vscode-api-tests`) because it contributes a duplicate `copilot` vendor that blocks the real extension's language model provider registration. + +### Adding a scenario + +1. 
Add a new entry to the `SCENARIOS` object in `common/mock-llm-server.js` — an array of string chunks that will be streamed as SSE +2. Add the scenario ID to the `SCENARIOS` array in `common/utils.js` +3. Run: `npm run perf:chat -- --scenario your-new-scenario --runs 1 --no-baseline --verbose` diff --git a/.gitignore b/.gitignore index 421c621311626..7e5189df7aa01 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ product.overrides.json *.snap.actual *.tsbuildinfo .vscode-test +.chat-perf-data vscode-telemetry-docs/ test-output.json test/componentFixtures/.screenshots/* diff --git a/package.json b/package.json index c3a544010f214..b9067cf575740 100644 --- a/package.json +++ b/package.json @@ -79,6 +79,8 @@ "extensions-ci": "npm run gulp extensions-ci", "extensions-ci-pr": "npm run gulp extensions-ci-pr", "perf": "node scripts/code-perf.js", + "perf:chat": "node scripts/chat-perf/test-chat-perf-regression.js", + "perf:chat-leak": "node scripts/chat-perf/test-chat-mem-leaks.js", "copilot:setup": "npm --prefix extensions/copilot run setup", "copilot:get_token": "npm --prefix extensions/copilot run get_token", "update-build-ts-version": "npm install -D typescript@next && npm install -D @typescript/native-preview && (cd build && npm run typecheck)", diff --git a/scripts/chat-perf/common/mock-llm-server.js b/scripts/chat-perf/common/mock-llm-server.js new file mode 100644 index 0000000000000..1b45967d1b2f2 --- /dev/null +++ b/scripts/chat-perf/common/mock-llm-server.js @@ -0,0 +1,652 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// @ts-check + +/** + * Local mock server that implements the OpenAI Chat Completions streaming API. 
 * Used by the chat perf benchmark to replace the real LLM backend with
 * deterministic, zero-latency responses.
 *
 * Supports scenario-based responses: the `messages` array's last user message
 * content is matched against scenario IDs. Unknown scenarios get a default
 * text-only response.
 *
 * NOTE(review): the `file-links` fixture uses Math.random(), so that one
 * scenario is NOT deterministic run-to-run — confirm whether that is intended.
 */

const http = require('http');
const { EventEmitter } = require('events');

// -- Scenario fixtures -------------------------------------------------------

// Each scenario maps an ID to an ordered array of string chunks; each chunk is
// streamed as one SSE `chat.completion.chunk` event by handleChatCompletions.
// NOTE(review): generic type arguments inside the TypeScript *fixture strings*
// below (e.g. `new Map>()`, `Emitter()`) appear to have been stripped by the
// patch transfer; they are runtime data and are left byte-identical here.
/** @type {Record<string, string[]>} */
const SCENARIOS = {
	'text-only': [
		'Here is an explanation of the code you selected:\n\n',
		'The function `processItems` iterates over the input array and applies a transformation to each element. ',
		'It uses a `Map` to track previously seen values, which allows it to deduplicate results efficiently in O(n) time.\n\n',
		'The algorithm works in a single pass: for every element, it computes the transformed value, ',
		'checks membership in the set, and conditionally appends to the output array. ',
		'This is a common pattern in data processing pipelines where uniqueness constraints must be maintained.\n\n',
		'Edge cases to consider include empty arrays, duplicate transformations that produce the same key, ',
		'and items where the transform function itself is expensive.\n\n',
		'The time complexity is **O(n)** and the space complexity is **O(n)** in the worst case when all items are unique.\n',
	],
	'large-codeblock': [
		'Here is the refactored implementation:\n\n',
		'```typescript\n',
		'import { EventEmitter } from "events";\n\n',
		'interface CacheEntry {\n value: T;\n expiresAt: number;\n accessCount: number;\n}\n\n',
		'export class LRUCache {\n',
		' private readonly _map = new Map>();\n',
		' private readonly _emitter = new EventEmitter();\n\n',
		' constructor(\n private readonly _maxSize: number,\n private readonly _ttlMs: number = 60_000,\n ) {}\n\n',
		' get(key: K): V | undefined {\n const entry = this._map.get(key);\n if (!entry) { return undefined; }\n',
		' if (Date.now() > entry.expiresAt) {\n this._map.delete(key);\n this._emitter.emit("evict", key);\n return undefined;\n }\n',
		' entry.accessCount++;\n this._map.delete(key);\n this._map.set(key, entry);\n return entry.value;\n }\n\n',
		' set(key: K, value: V): void {\n if (this._map.size >= this._maxSize) {\n',
		' const oldest = this._map.keys().next().value;\n if (oldest !== undefined) {\n this._map.delete(oldest);\n this._emitter.emit("evict", oldest);\n }\n }\n',
		' this._map.set(key, { value, expiresAt: Date.now() + this._ttlMs, accessCount: 0 });\n }\n\n',
		' clear(): void { this._map.clear(); this._emitter.emit("clear"); }\n',
		' get size(): number { return this._map.size; }\n',
		' onEvict(listener: (key: K) => void): void { this._emitter.on("evict", listener); }\n}\n',
		'```\n\n',
		'The key changes:\n- Added TTL-based expiry with configurable timeout\n- LRU eviction uses Map insertion order\n- EventEmitter notifies on evictions for cache observability\n',
	],
	'many-small-chunks': (() => {
		const chunks = ['Generating detailed analysis:\n\n'];
		for (let i = 0; i < 200; i++) {
			chunks.push(`Word${i} `);
		}
		chunks.push('\n\nAnalysis complete.\n');
		return chunks;
	})(),
	'mixed-content': [
		'## Issue Found\n\n',
		'The `DisposableStore` is not being disposed in the `deactivate` path, ',
		'which can lead to memory leaks.\n\n',
		'### Current Code\n\n',
		'```typescript\nclass MyService {\n private store = new DisposableStore();\n // missing dispose!\n}\n```\n\n',
		'### Suggested Fix\n\n',
		'```typescript\nclass MyService extends Disposable {\n',
		' private readonly store = this._register(new DisposableStore());\n\n',
		' override dispose(): void {\n this.store.dispose();\n super.dispose();\n }\n}\n```\n\n',
		'This ensures the store is cleaned up when the service is disposed via the workbench lifecycle.\n',
	],

	// -- Stress-test scenarios --------------------------------------------

	// ~600 lines of code across 10 fenced blocks (10 modules x 15 functions
	// x 4 lines each) — stresses syntax highlighting, code block rendering,
	// and copy-button creation.
	'many-codeblocks': (() => {
		const chunks = ['Here are the implementations for each module:\n\n'];
		for (let i = 0; i < 10; i++) {
			chunks.push(`### Module ${i + 1}: \`handler${i}.ts\`\n\n`);
			chunks.push('```typescript\n');
			for (let j = 0; j < 15; j++) {
				chunks.push(`export function handle${i}_${j}(input: string): string {\n`);
				chunks.push(` const result = input.trim().split('').reverse().join('');\n`);
				chunks.push(` return \`[\${result}] processed by handler ${i}_${j}\`;\n`);
				chunks.push('}\n\n');
			}
			chunks.push('```\n\n');
		}
		chunks.push('All modules implement the same pattern with unique handler IDs.\n');
		return chunks;
	})(),

	// Very long prose — stresses markdown rendering, word wrapping,
	// and layout with ~3000 words of continuous text.
	'long-prose': (() => {
		const sentences = [
			'The architecture follows a layered dependency injection pattern where each service declares its dependencies through constructor parameters. ',
			'This approach ensures that circular dependencies are detected at compile time rather than at runtime, which significantly reduces debugging overhead. ',
			'When a service is instantiated, the instantiation service resolves all of its dependencies recursively, creating a directed acyclic graph of service instances. ',
			'Each service is a singleton within its scope, meaning that multiple consumers of the same service interface receive the same instance. ',
			'The workbench lifecycle manages the creation and disposal of these services through well-defined phases: creation, restoration, and eventual shutdown. ',
			'During the restoration phase, services that persist state across sessions reload their data from storage, which may involve asynchronous operations. ',
			'Contributors register their functionality through extension points, which are processed during the appropriate lifecycle phase. ',
			'This contribution model allows features to be added without modifying the core workbench code, maintaining a clean separation of concerns. ',
		];
		const chunks = ['# Detailed Architecture Analysis\n\n'];
		for (let para = 0; para < 15; para++) {
			chunks.push(`## Section ${para + 1}: ${['Overview', 'Design Patterns', 'Service Layer', 'Event System', 'State Management', 'Error Handling', 'Performance', 'Testing', 'Deployment', 'Monitoring', 'Security', 'Extensibility', 'Compatibility', 'Migration', 'Future Work'][para]}\n\n`);
			// 25 sentences per section, cycling through the 8 fixtures above.
			for (let s = 0; s < 25; s++) {
				chunks.push(sentences[s % sentences.length]);
			}
			chunks.push('\n\n');
		}
		return chunks;
	})(),

	// Deeply nested markdown — headers, ordered/unordered lists, bold,
	// italic, inline code, links, blockquotes. Exercises the full
	// markdown renderer pipeline.
	'rich-markdown': (() => {
		const chunks = ['# Comprehensive Code Review Report\n\n'];
		chunks.push('> **Summary**: Found 12 issues across 4 severity levels.\n\n');
		for (let section = 0; section < 6; section++) {
			chunks.push(`## ${section + 1}. ${['Critical Issues', 'Performance Concerns', 'Code Style', 'Documentation Gaps', 'Test Coverage', 'Security Review'][section]}\n\n`);
			for (let item = 0; item < 5; item++) {
				chunks.push(`${item + 1}. **Issue ${section * 5 + item + 1}**: \`${['useState', 'useEffect', 'useMemo', 'useCallback', 'useRef'][item]}\` in \`src/components/Widget${item}.tsx\`\n`);
				chunks.push(` - Severity: ${['[Critical]', '[Warning]', '[Info]', '[Suggestion]', '[Note]'][item]}\n`);
				chunks.push(` - The current implementation uses *unnecessary re-renders* due to missing dependency arrays.\n`);
				chunks.push(` - See [React docs](https://react.dev/reference) and the [\`useMemo\` guide](https://react.dev/reference/react/useMemo).\n`);
				chunks.push(` - Fix: wrap in \`useCallback\` or extract to a ***separate memoized component***.\n\n`);
			}
			chunks.push('---\n\n');
		}
		chunks.push('> *Report generated automatically. Please review all suggestions before applying.*\n');
		return chunks;
	})(),

	// A huge single code block (~200 lines) — stresses the syntax
	// highlighter and scroll virtualization within a code block.
	'giant-codeblock': (() => {
		const chunks = ['Here is the complete implementation:\n\n```typescript\n'];
		chunks.push('import { Disposable, DisposableStore } from "vs/base/common/lifecycle";\n');
		chunks.push('import { Emitter, Event } from "vs/base/common/event";\n');
		chunks.push('import { URI } from "vs/base/common/uri";\n\n');
		for (let i = 0; i < 40; i++) {
			chunks.push(`export class Service${i} extends Disposable {\n`);
			chunks.push(` private readonly _onDidChange = this._register(new Emitter());\n`);
			chunks.push(` readonly onDidChange: Event = this._onDidChange.event;\n\n`);
			chunks.push(` private _value: string = '';\n`);
			chunks.push(` get value(): string { return this._value; }\n\n`);
			chunks.push(` async update(uri: URI): Promise {\n`);
			chunks.push(` this._value = uri.toString();\n`);
			chunks.push(` this._onDidChange.fire();\n`);
			chunks.push(` }\n`);
			chunks.push('}\n\n');
		}
		chunks.push('```\n\nThis defines 40 service classes following the standard VS Code pattern.\n');
		return chunks;
	})(),

	// 1000 very small chunks — stresses the streaming SSE pipeline
	// and incremental DOM updates with high chunk frequency.
	'rapid-stream': (() => {
		const chunks = [];
		for (let i = 0; i < 1000; i++) {
			chunks.push(`w${i} `);
		}
		return chunks;
	})(),

	// Many file URI references — stresses link detection, file
	// resolution, path rendering, hover providers, and inline
	// anchor widget creation.
	// NOTE(review): line numbers are randomized (Math.random), making this
	// fixture non-deterministic across runs — see module header note.
	'file-links': (() => {
		const files = [
			'src/vs/workbench/contrib/chat/browser/chatListRenderer.ts',
			'src/vs/workbench/contrib/chat/common/chatService/chatServiceImpl.ts',
			'src/vs/workbench/contrib/chat/browser/widget/input/chatInputPart.ts',
			'src/vs/workbench/contrib/chat/common/chatPerf.ts',
			'src/vs/base/common/lifecycle.ts',
			'src/vs/base/common/event.ts',
			'src/vs/platform/instantiation/common/instantiation.ts',
			'src/vs/workbench/services/extensions/common/abstractExtensionService.ts',
			'src/vs/workbench/api/common/extHostLanguageModels.ts',
			'src/vs/workbench/contrib/chat/common/languageModels.ts',
			'src/vs/editor/browser/widget/codeEditor/editor.ts',
			'src/vs/workbench/browser/parts/editor/editorGroupView.ts',
		];
		const chunks = ['I found references to the disposable pattern across the following files:\n\n'];
		for (let i = 0; i < files.length; i++) {
			const line = Math.floor(Math.random() * 500) + 1;
			chunks.push(`${i + 1}. [${files[i]}](${files[i]}#L${line}) — `);
			chunks.push(`Line ${line}: uses \`DisposableStore\` with ${Math.floor(Math.random() * 10) + 1} registrations\n`);
		}
		chunks.push('\nAdditionally, the following files import from `vs/base/common/lifecycle`:\n\n');
		for (let i = 0; i < 20; i++) {
			const depth = ['base', 'platform', 'editor', 'workbench'][i % 4];
			const area = ['common', 'browser', 'node', 'electron-browser'][i % 4];
			const name = ['service', 'provider', 'contribution', 'handler', 'manager'][i % 5];
			const file = `src/vs/${depth}/${area}/${name}${i}.ts`;
			chunks.push(`- [${file}](${file}#L${i * 10 + 5})`);
			chunks.push(` — imports \`Disposable\`, \`DisposableStore\`\n`);
		}
		chunks.push('\nTotal: 32 files reference the disposable pattern.\n');
		return chunks;
	})(),
};

// Scenario served when the request names no known scenario ID.
const DEFAULT_SCENARIO = 'text-only';

// -- SSE chunk builder -------------------------------------------------------

// Model ID reported in every chunk and in the /models metadata below.
const MODEL = 'gpt-4o-2024-08-06';

/**
 * Build one `chat.completion.chunk` SSE payload.
 *
 * @param {string} content delta text (ignored when `finish` is true)
 * @param {number} index FIXME(review): this parameter is currently ignored —
 *   `choices[0].index` is hardcoded to 0 below. All call sites pass 0 today,
 *   so behavior is unaffected, but either use `index` or drop the parameter.
 * @param {boolean} finish when true, emits an empty delta with
 *   `finish_reason: 'stop'` to terminate the stream
 */
function makeChunk(content, index, finish) {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: finish ? {} : { content },
			finish_reason: finish ? 'stop' : null,
			content_filter_results: {},
		}],
		usage: null,
	};
}

/**
 * Build the first SSE chunk of a stream: carries the assistant role and an
 * empty content delta, mirroring the OpenAI streaming handshake.
 */
function makeInitialChunk() {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: { role: 'assistant', content: '' },
			finish_reason: null,
			content_filter_results: {},
		}],
		usage: null,
	};
}

// -- Request handler ---------------------------------------------------------

/**
 * Route a single HTTP request to the matching mock CAPI endpoint.
 * Covers health, token, telemetry, model metadata/session/intent, agents,
 * and the streaming completion endpoints; unknown requests get `{}` so the
 * client never errors out.
 *
 * @param {http.IncomingMessage} req
 * @param {http.ServerResponse} res
 */
function handleRequest(req, res) {
	const contentLength = req.headers['content-length'] || '0';
	const ts = new Date().toISOString().slice(11, -1); // HH:MM:SS.mmm
	console.log(`[mock-llm] ${ts} ${req.method} ${req.url} (${contentLength} bytes)`);

	// CORS — allow everything; this server only ever binds to 127.0.0.1.
	res.setHeader('Access-Control-Allow-Origin', '*');
	res.setHeader('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS');
	res.setHeader('Access-Control-Allow-Headers', '*');
	if (req.method === 'OPTIONS') { res.writeHead(204); res.end(); return; }

	const url = new URL(req.url || '/', `http://${req.headers.host}`);
	const path = url.pathname;
	// Small helpers shared by every branch below.
	const json = (/** @type {number} */ status, /** @type {any} */ data) => {
		res.writeHead(status, { 'Content-Type': 'application/json' });
		res.end(JSON.stringify(data));
	};
	const readBody = () => new Promise(resolve => {
		let body = '';
		req.on('data', chunk => { body += chunk; });
		req.on('end', () => resolve(body));
	});

	// -- Health -------------------------------------------------------
	if (path === '/health') { res.writeHead(200); res.end('ok'); return; }

	// -- Token endpoints (DomainService.tokenURL / tokenNoAuthURL) ----
	// /copilot_internal/v2/token, /copilot_internal/v2/nltoken
	if (path.startsWith('/copilot_internal/')) {
		if (path.includes('/token') || path.includes('/nltoken')) {
			json(200, {
				token: 'perf-benchmark-fake-token',
				expires_at: Math.floor(Date.now() / 1000) + 3600,
				refresh_in: 1800,
				sku: 'free_limited_copilot',
				individual: true,
				copilot_plan: 'free',
				// Point the client back at this server for all follow-ups.
				endpoints: {
					api: `http://${req.headers.host}`,
					proxy: `http://${req.headers.host}`,
				},
			});
		} else {
			// /copilot_internal/user, /copilot_internal/content_exclusion, etc.
			json(200, {});
		}
		return;
	}

	// -- Telemetry (DomainService.telemetryURL) ----------------------
	if (path === '/telemetry') { json(200, {}); return; }

	// -- Model Router (DomainService.capiModelRouterURL = /models/session/intent) --
	// The automode service POSTs here to get the best model for a request.
	if (path === '/models/session/intent' && req.method === 'POST') {
		readBody().then(() => {
			json(200, { model: MODEL });
		});
		return;
	}

	// -- Auto Models / Model Session (DomainService.capiAutoModelURL = /models/session) --
	// Returns AutoModeAPIResponse: { available_models, session_token, expires_at }
	if (path === '/models/session' && req.method === 'POST') {
		readBody().then(() => {
			json(200, {
				available_models: [MODEL, 'gpt-4o-mini'],
				session_token: 'perf-session-token-' + Date.now(),
				expires_at: Math.floor(Date.now() / 1000) + 3600,
				discounted_costs: {},
			});
		});
		return;
	}

	// -- Models (DomainService.capiModelsURL = /models) --------------
	if (path === '/models' && req.method === 'GET') {
		json(200, {
			data: [
				{
					id: MODEL,
					name: 'GPT-4o (Mock)',
					version: '2024-05-13',
					vendor: 'copilot',
					model_picker_enabled: true,
					is_chat_default: true,
					is_chat_fallback: true,
					billing: { is_premium: false, multiplier: 0 },
					capabilities: {
						type: 'chat',
						family: 'gpt-4o',
						tokenizer: 'o200k_base',
						limits: {
							max_prompt_tokens: 128000,
							max_output_tokens: 16384,
							max_context_window_tokens: 128000,
						},
						supports: {
							streaming: true,
							tool_calls: true,
							parallel_tool_calls: true,
							vision: false,
						},
					},
					supported_endpoints: ['/chat/completions'],
				},
				{
					id: 'gpt-4o-mini',
					name: 'GPT-4o mini (Mock)',
					version: '2024-07-18',
					vendor: 'copilot',
					model_picker_enabled: false,
					is_chat_default: false,
					is_chat_fallback: false,
					billing: { is_premium: false, multiplier: 0 },
					capabilities: {
						type: 'chat',
						family: 'gpt-4o-mini',
						tokenizer: 'o200k_base',
						limits: {
							max_prompt_tokens: 128000,
							max_output_tokens: 16384,
							max_context_window_tokens: 128000,
						},
						supports: {
							streaming: true,
							tool_calls: true,
							parallel_tool_calls: true,
							vision: false,
						},
					},
					supported_endpoints: ['/chat/completions'],
				},
			],
		});
		return;
	}

	// -- Model by ID (DomainService.capiModelsURL/{id}) --------------
	if (path.startsWith('/models/') && req.method === 'GET') {
		const modelId = path.split('/models/')[1]?.split('/')[0];
		if (path.endsWith('/policy')) {
			json(200, { state: 'accepted', terms: '' });
			return;
		}
		json(200, {
			id: modelId || MODEL,
			name: 'GPT-4o (Mock)',
			version: '2024-05-13',
			vendor: 'copilot',
			model_picker_enabled: true,
			is_chat_default: true,
			is_chat_fallback: true,
			capabilities: {
				type: 'chat',
				family: 'gpt-4o',
				tokenizer: 'o200k_base',
				limits: { max_prompt_tokens: 128000, max_output_tokens: 16384, max_context_window_tokens: 128000 },
				supports: { streaming: true, tool_calls: true, parallel_tool_calls: true, vision: false },
			},
		});
		return;
	}

	// -- Agents (DomainService.remoteAgentsURL = /agents) -------------
	if (path.startsWith('/agents')) {
		// /agents/sessions — CopilotSessions
		if (path.includes('/sessions')) {
			json(200, { sessions: [], total_count: 0, page_size: 20, page_number: 1 });
		}
		// /agents/swe/models — CCAModelsList
		else if (path.includes('/swe/models')) {
			json(200, {
				data: [{
					id: MODEL, name: 'GPT-4o (Mock)', vendor: 'copilot',
					capabilities: { type: 'chat', family: 'gpt-4o', supports: { streaming: true } }
				}]
			});
		}
		// /agents/swe/... — agent jobs, etc.
		else if (path.includes('/swe/')) {
			json(200, {});
		}
		// /agents — list agents
		else {
			json(200, { agents: [] });
		}
		return;
	}

	// -- Chat Completions (DomainService.capiChatURL = /chat/completions) --
	if (path === '/chat/completions' && req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => handleChatCompletions(body, res));
		return;
	}

	// -- Responses API (DomainService.capiResponsesURL = /responses) --
	// NOTE(review): served in Chat Completions SSE format, not the Responses
	// API event format — sufficient for the benchmark client, but not faithful.
	if (path === '/responses' && req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => handleChatCompletions(body, res));
		return;
	}

	// -- Messages API (DomainService.capiMessagesURL = /v1/messages) --
	if (path === '/v1/messages' && req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => handleChatCompletions(body, res));
		return;
	}

	// -- Proxy completions (/v1/engines/*/completions) ----------------
	if (path.includes('/v1/engines/') && req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => handleChatCompletions(body, res));
		return;
	}

	// -- Skills, Search, Embeddings -----------------------------------
	if (path === '/skills' || path.startsWith('/search/') || path.startsWith('/embeddings')) {
		json(200, { data: [] });
		return;
	}

	// -- Catch-all: any remaining POST with messages → chat completions
	if (req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => {
			try {
				const parsed = JSON.parse(/** @type {string} */(body));
				if (parsed.messages && Array.isArray(parsed.messages)) {
					handleChatCompletions(/** @type {string} */(body), res);
					return;
				}
			} catch { }
			json(200, {});
		});
		return;
	}

	// -- Catch-all GET → empty success --------------------------------
	json(200, {});
}

// -- Server lifecycle --------------------------------------------------------

/** Emitted when a scenario chat completion is fully served.
 */
// NOTE(review): module-level emitter shared by every server started in this
// process. startServer() below attaches a new 'scenarioCompletion' listener on
// each call and never removes it — fine for the single-server benchmark, but
// calling startServer() twice in one process would accumulate listeners and
// cross-count completions between handles. Consider scoping per server.
const serverEvents = new EventEmitter();

/**
 * Serve one streaming chat completion.
 *
 * Picks the scenario by scanning the last user message for a
 * `[scenario:<id>]` tag, writes the whole SSE stream synchronously
 * (role chunk, one chunk per fixture string, finish chunk, `[DONE]`),
 * then signals completion so waitForCompletion() can resolve.
 *
 * @param {string} body raw JSON request body (parse failures fall back to the default scenario)
 * @param {http.ServerResponse} res
 */
function handleChatCompletions(body, res) {
	let scenarioId = DEFAULT_SCENARIO;
	let isScenarioRequest = false;
	try {
		const parsed = JSON.parse(body);
		const messages = parsed.messages || [];
		// Log user messages for debugging
		const userMsgs = messages.filter((/** @type {any} */ m) => m.role === 'user');
		if (userMsgs.length > 0) {
			const lastContent = typeof userMsgs[userMsgs.length - 1].content === 'string'
				? userMsgs[userMsgs.length - 1].content.substring(0, 100)
				: '(structured)';
			const ts = new Date().toISOString().slice(11, -1);
			console.log(`[mock-llm] ${ts} → ${messages.length} msgs, last user: "${lastContent}"`);
		}
		const lastUser = [...messages].reverse().find((/** @type {any} */ m) => m.role === 'user');
		if (lastUser) {
			// Extract scenario ID from user message content; content may be a
			// plain string or an array of typed parts with `.text`.
			const content = typeof lastUser.content === 'string'
				? lastUser.content
				: Array.isArray(lastUser.content)
					? lastUser.content.map((/** @type {any} */ c) => c.text || '').join('')
					: '';
			const match = content.match(/\[scenario:([^\]]+)\]/);
			if (match && SCENARIOS[match[1]]) {
				scenarioId = match[1];
				isScenarioRequest = true;
			}
		}
	} catch { }

	const chunks = SCENARIOS[scenarioId] || SCENARIOS[DEFAULT_SCENARIO];

	res.writeHead(200, {
		'Content-Type': 'text/event-stream',
		'Cache-Control': 'no-cache',
		'Connection': 'keep-alive',
		'X-Request-Id': 'perf-benchmark-' + Date.now(),
	});

	// Initial role chunk
	res.write(`data: ${JSON.stringify(makeInitialChunk())}\n\n`);

	// Content chunks
	for (const chunk of chunks) {
		res.write(`data: ${JSON.stringify(makeChunk(chunk, 0, false))}\n\n`);
	}

	// Finish chunk
	res.write(`data: ${JSON.stringify(makeChunk('', 0, true))}\n\n`);

	// Done
	res.write('data: [DONE]\n\n');
	res.end();

	if (isScenarioRequest) {
		serverEvents.emit('scenarioCompletion');
	}
}

/**
 * Start the mock server and return a handle.
 * Binds to 127.0.0.1 only; pass 0 (default) to let the OS pick a free port.
 * The resolved handle exposes the bound url/port, close(), and counters plus
 * wait helpers for requests and scenario completions.
 * @param {number} port
 */
function startServer(port = 0) {
	return new Promise((resolve, reject) => {
		let reqCount = 0;
		let completions = 0;
		// Waiter predicates return true once satisfied and are then pruned.
		/** @type {Array<() => boolean>} */
		let requestWaiters = [];
		/** @type {Array<() => boolean>} */
		let completionWaiters = [];

		serverEvents.on('scenarioCompletion', () => {
			completions++;
			completionWaiters = completionWaiters.filter(fn => !fn());
		});

		const server = http.createServer((req, res) => {
			reqCount++;
			requestWaiters = requestWaiters.filter(fn => !fn());
			handleRequest(req, res);
		});
		server.listen(port, '127.0.0.1', () => {
			const addr = server.address();
			const actualPort = typeof addr === 'object' && addr ? addr.port : port;
			const url = `http://127.0.0.1:${actualPort}`;
			resolve({
				port: actualPort,
				url,
				close: () => /** @type {Promise<void>} */(new Promise((resolve, reject) => {
					server.close(err => err ? reject(err) : resolve(undefined));
				})),
				/** Return total request count. */
				requestCount: () => reqCount,
				/**
				 * Wait until at least `n` requests have been received.
				 * @param {number} n
				 * @param {number} timeoutMs
				 * @returns {Promise<void>}
				 */
				waitForRequests: (n, timeoutMs) => new Promise((resolve, reject) => {
					if (reqCount >= n) { resolve(); return; }
					const timer = setTimeout(() => reject(new Error(`Timed out waiting for ${n} requests (got ${reqCount})`)), timeoutMs);
					requestWaiters.push(() => {
						if (reqCount >= n) { clearTimeout(timer); resolve(); return true; }
						return false;
					});
				}),
				/** Return total scenario-completion count. */
				completionCount: () => completions,
				/**
				 * Wait until at least `n` scenario chat completions have been served.
				 * @param {number} n
				 * @param {number} timeoutMs
				 * @returns {Promise<void>}
				 */
				waitForCompletion: (n, timeoutMs) => new Promise((resolve, reject) => {
					if (completions >= n) { resolve(); return; }
					const timer = setTimeout(() => reject(new Error(`Timed out waiting for ${n} completions (got ${completions})`)), timeoutMs);
					completionWaiters.push(() => {
						if (completions >= n) { clearTimeout(timer); resolve(); return true; }
						return false;
					});
				}),
			});
		});
		server.on('error', reject);
	});
}

// Allow running standalone for testing: node scripts/mock-llm-server.js
// NOTE(review): the promise below has no .catch — a listen failure (e.g. port
// already in use) would surface as an unhandled rejection.
if (require.main === module) {
	const port = parseInt(process.argv[2] || '0', 10);
	startServer(port).then((/** @type {any} */ handle) => {
		console.log(`Mock LLM server listening at ${handle.url}`);
		console.log('Scenarios:', Object.keys(SCENARIOS).join(', '));
	});
}

module.exports = { startServer, SCENARIOS };
diff --git a/scripts/chat-perf/common/utils.js b/scripts/chat-perf/common/utils.js
new file mode 100644
index 0000000000000..671b660738ce4
--- /dev/null
+++ b/scripts/chat-perf/common/utils.js
@@ -0,0 +1,511 @@
/*---------------------------------------------------------------------------------------------
 * Copyright (c) Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// @ts-check + +/** + * Shared utilities for chat performance benchmarks and leak checks. + */ + +const path = require('path'); +const fs = require('fs'); +const os = require('os'); +const http = require('http'); +const { execSync, spawn } = require('child_process'); + +const ROOT = path.join(__dirname, '..', '..', '..'); +const DATA_DIR = path.join(ROOT, '.chat-perf-data'); + +const SCENARIOS = [ + 'text-only', + 'large-codeblock', + 'many-small-chunks', + 'mixed-content', + 'many-codeblocks', + 'long-prose', + 'rich-markdown', + 'giant-codeblock', + 'rapid-stream', + 'file-links', +]; + +// -- Electron path resolution ------------------------------------------------ + +function getElectronPath() { + const product = require(path.join(ROOT, 'product.json')); + if (process.platform === 'darwin') { + return path.join(ROOT, '.build', 'electron', `${product.nameLong}.app`, 'Contents', 'MacOS', product.nameShort); + } else if (process.platform === 'linux') { + return path.join(ROOT, '.build', 'electron', product.applicationName); + } else { + return path.join(ROOT, '.build', 'electron', `${product.nameShort}.exe`); + } +} + +/** + * Returns true if the string looks like a VS Code version or commit hash + * rather than a file path. + * @param {string} value + */ +function isVersionString(value) { + if (value === 'insiders' || value === 'stable') { return true; } + if (/^\d+\.\d+\.\d+/.test(value)) { return true; } + if (/^[0-9a-f]{7,40}$/.test(value)) { return true; } + return false; +} + +/** + * Resolve a build arg to an executable path. + * Version strings are downloaded via @vscode/test-electron. 
+ * @param {string | undefined} buildArg + * @returns {Promise} + */ +async function resolveBuild(buildArg) { + if (!buildArg) { + return getElectronPath(); + } + if (isVersionString(buildArg)) { + console.log(`[chat-perf] Downloading VS Code ${buildArg}...`); + const { downloadAndUnzipVSCode, resolveCliArgsFromVSCodeExecutablePath } = require('@vscode/test-electron'); + const exePath = await downloadAndUnzipVSCode(buildArg); + console.log(`[chat-perf] Downloaded: ${exePath}`); + + // Install the copilot extension into our shared extensions dir so it's + // available when we launch with --extensions-dir=DATA_DIR/extensions. + const extDir = path.join(DATA_DIR, 'extensions'); + fs.mkdirSync(extDir, { recursive: true }); + const [cli, ...cliArgs] = resolveCliArgsFromVSCodeExecutablePath(exePath); + const extId = 'GitHub.copilot'; + console.log(`[chat-perf] Installing ${extId} into ${extDir}...`); + const { spawnSync } = require('child_process'); + const result = spawnSync(cli, [...cliArgs, '--extensions-dir', extDir, '--install-extension', extId], { + encoding: 'utf-8', + stdio: 'pipe', + shell: process.platform === 'win32', + timeout: 120_000, + }); + if (result.status !== 0) { + console.warn(`[chat-perf] Extension install exited with ${result.status}: ${(result.stderr || '').substring(0, 500)}`); + } else { + console.log(`[chat-perf] ${extId} installed`); + } + + return exePath; + } + return path.resolve(buildArg); +} + +// -- Storage pre-seeding ----------------------------------------------------- + +/** + * Pre-seed the VS Code storage database to prevent the + * BuiltinChatExtensionEnablementMigration from disabling the copilot + * extension on fresh user data directories. 
+ * @param {string} userDataDir + */ +function preseedStorage(userDataDir) { + const globalStorageDir = path.join(userDataDir, 'User', 'globalStorage'); + fs.mkdirSync(globalStorageDir, { recursive: true }); + const dbPath = path.join(globalStorageDir, 'state.vscdb'); + execSync(`sqlite3 "${dbPath}" "CREATE TABLE IF NOT EXISTS ItemTable (key TEXT UNIQUE ON CONFLICT REPLACE, value BLOB); INSERT INTO ItemTable (key, value) VALUES ('builtinChatExtensionEnablementMigration', 'true');"`); +} + +// -- Launch helpers ---------------------------------------------------------- + +/** + * Build the environment variables for launching VS Code with the mock server. + * @param {{ url: string }} mockServer + * @param {{ isDevBuild?: boolean }} [opts] + * @returns {Record} + */ +function buildEnv(mockServer, { isDevBuild = true } = {}) { + /** @type {Record} */ + const env = { + ...process.env, + ELECTRON_ENABLE_LOGGING: '1', + IS_SCENARIO_AUTOMATION: '1', + GITHUB_PAT: 'perf-benchmark-fake-pat', + VSCODE_COPILOT_CHAT_TOKEN: Buffer.from(JSON.stringify({ + token: 'perf-benchmark-fake-token', + expires_at: Math.floor(Date.now() / 1000) + 3600, + refresh_in: 1800, + sku: 'free_limited_copilot', + individual: true, + isNoAuthUser: true, + copilot_plan: 'free', + organization_login_list: [], + endpoints: { api: mockServer.url, proxy: mockServer.url }, + })).toString('base64'), + }; + // Dev-only flags — these tell Electron to load the app from source (out/) + // instead of the packaged app. Setting them on a stable build causes it + // to fail to show a window. + if (isDevBuild) { + env.NODE_ENV = 'development'; + env.VSCODE_DEV = '1'; + env.VSCODE_CLI = '1'; + } + return env; +} + +/** + * Build the default VS Code launch args. 
+ * @param {string} userDataDir + * @param {string} extDir + * @param {string} logsDir + * @returns {string[]} + */ +function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true } = {}) { + const args = [ + ROOT, + '--skip-release-notes', + '--skip-welcome', + '--disable-telemetry', + '--disable-updates', + '--disable-workspace-trust', + `--user-data-dir=${userDataDir}`, + `--extensions-dir=${extDir}`, + `--logsPath=${logsDir}`, + '--enable-smoke-test-driver', + ]; + // vscode-api-tests only exists in the dev build + if (isDevBuild) { + args.push('--disable-extension=vscode.vscode-api-tests'); + } + if (process.platform !== 'darwin') { + args.push('--disable-gpu'); + } + return args; +} + +/** + * Write VS Code settings that point the copilot extension at the mock server. + * @param {string} userDataDir + * @param {{ url: string }} mockServer + */ +function writeSettings(userDataDir, mockServer) { + const settingsDir = path.join(userDataDir, 'User'); + fs.mkdirSync(settingsDir, { recursive: true }); + fs.writeFileSync(path.join(settingsDir, 'settings.json'), JSON.stringify({ + 'github.copilot.advanced.debug.overrideProxyUrl': mockServer.url, + 'github.copilot.advanced.debug.overrideCapiUrl': mockServer.url, + 'chat.allowAnonymousAccess': true, + // Disable MCP servers — they start async and add unpredictable + // delay that pollutes perf measurements. + 'chat.mcp.discovery.enabled': false, + 'chat.mcp.enabled': false, + 'github.copilot.chat.githubMcpServer.enabled': false, + 'github.copilot.chat.cli.mcp.enabled': false, + }, null, '\t')); +} + +/** + * Prepare a fresh run directory (clean, create, preseed, write settings). 
+ * @param {string} runId + * @param {{ url: string }} mockServer + * @returns {{ userDataDir: string, extDir: string, logsDir: string }} + */ +function prepareRunDir(runId, mockServer) { + const tmpBase = path.join(os.tmpdir(), 'vscode-chat-perf'); + const userDataDir = path.join(tmpBase, `run-${runId}`); + const extDir = path.join(DATA_DIR, 'extensions'); + const logsDir = path.join(tmpBase, 'logs', `run-${runId}`); + fs.rmSync(userDataDir, { recursive: true, force: true }); + fs.mkdirSync(userDataDir, { recursive: true }); + fs.mkdirSync(extDir, { recursive: true }); + fs.mkdirSync(logsDir, { recursive: true }); + preseedStorage(userDataDir); + writeSettings(userDataDir, mockServer); + return { userDataDir, extDir, logsDir }; +} + +// -- VS Code launch via CDP -------------------------------------------------- + +/** + * Fetch JSON from a URL. Used to probe the CDP endpoint. + * @param {string} url + * @returns {Promise} + */ +function getJson(url) { + return new Promise((resolve, reject) => { + http.get(url, res => { + let data = ''; + res.on('data', chunk => { data += chunk; }); + res.on('end', () => { + try { resolve(JSON.parse(data)); } + catch { reject(new Error(`Invalid JSON from ${url}`)); } + }); + }).on('error', reject); + }); +} + +/** + * Wait until VS Code exposes its CDP endpoint. + * @param {number} port + * @param {number} timeoutMs + * @returns {Promise} + */ +async function waitForCDP(port, timeoutMs = 60_000) { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + try { + await getJson(`http://127.0.0.1:${port}/json/version`); + return; + } catch { + await new Promise(r => setTimeout(r, 500)); + } + } + throw new Error(`Timed out waiting for CDP on port ${port}`); +} + +/** + * Find the workbench page among all CDP pages. + * For dev builds this checks for `globalThis.driver` (smoke-test driver). + * For stable builds it checks for `.monaco-workbench` in the DOM. 
+ * @param {import('playwright').Browser} browser + * @param {number} timeoutMs + * @returns {Promise} + */ +async function findWorkbenchPage(browser, timeoutMs = 60_000) { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + const pages = browser.contexts().flatMap(ctx => ctx.pages()); + for (const page of pages) { + const hasWorkbench = await page.evaluate(() => + // @ts-ignore + !!globalThis.driver?.whenWorkbenchRestored || !!document.querySelector('.monaco-workbench') + ).catch(() => false); + if (hasWorkbench) { + return page; + } + } + await new Promise(r => setTimeout(r, 500)); + } + throw new Error('Timed out waiting for the workbench page'); +} + +/** @type {number} */ +let nextPort = 19222; + +/** + * Launch VS Code via child_process and connect via CDP. + * Works with dev builds, insiders, and stable releases. + * + * @param {string} executable - Path to the VS Code executable (Electron binary or CLI) + * @param {string[]} launchArgs - Arguments to pass to the executable + * @param {Record} env - Environment variables + * @param {{ verbose?: boolean }} [opts] + * @returns {Promise<{ page: import('playwright').Page, browser: import('playwright').Browser, close: () => Promise }>} + */ +async function launchVSCode(executable, launchArgs, env, opts = {}) { + const { chromium } = require('playwright'); + const port = nextPort++; + + const args = [`--remote-debugging-port=${port}`, ...launchArgs]; + const isShell = process.platform === 'win32'; + + if (opts.verbose) { + console.log(` [launch] ${executable} ${args.slice(0, 3).join(' ')} ... (port ${port})`); + } + + const child = spawn(executable, args, { + cwd: ROOT, + env, + shell: isShell, + stdio: opts.verbose ? 
'inherit' : ['ignore', 'ignore', 'ignore'], + }); + + // Track early exit + let exitError = /** @type {Error | null} */ (null); + child.once('exit', (code, signal) => { + if (!exitError) { + exitError = new Error(`VS Code exited before CDP connected (code=${code} signal=${signal})`); + } + }); + + // Wait for CDP + try { + await waitForCDP(port); + } catch (e) { + if (exitError) { throw exitError; } + throw e; + } + + const browser = await chromium.connectOverCDP(`http://127.0.0.1:${port}`); + const page = await findWorkbenchPage(browser); + + return { + page, + browser, + close: async () => { + await browser.close().catch(() => { }); + const pid = child.pid; + if (pid) { + if (process.platform === 'win32') { + try { execSync(`taskkill /F /T /PID ${pid}`, { stdio: 'ignore' }); } + catch { } + } else { + try { execSync(`pkill -TERM -P ${pid}`, { stdio: 'ignore' }); } + catch { } + child.kill('SIGTERM'); + } + } + await new Promise(resolve => { + const timer = setTimeout(() => { + if (pid) { + try { execSync(`pkill -9 -P ${pid}`, { stdio: 'ignore' }); } + catch { } + } + child.kill('SIGKILL'); + resolve(undefined); + }, 3000); + child.once('exit', () => { clearTimeout(timer); resolve(undefined); }); + }); + // Kill crashpad handler — it self-daemonizes and outlives the + // parent. Wait briefly for it to detach, then kill by pattern. + await new Promise(r => setTimeout(r, 500)); + try { execSync('pkill -9 -f crashpad_handler.*vscode-chat-perf', { stdio: 'ignore' }); } + catch { } + }, + }; +} + +// -- Statistics -------------------------------------------------------------- + +/** + * @param {number[]} values + */ +function median(values) { + const sorted = [...values].sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 !== 0 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2; +} + +/** + * Remove outliers using IQR method. 
+ * @param {number[]} values + * @returns {number[]} + */ +function removeOutliers(values) { + if (values.length < 4) { return values; } + const sorted = [...values].sort((a, b) => a - b); + const q1 = sorted[Math.floor(sorted.length * 0.25)]; + const q3 = sorted[Math.floor(sorted.length * 0.75)]; + const iqr = q3 - q1; + const lo = q1 - 1.5 * iqr; + const hi = q3 + 1.5 * iqr; + return sorted.filter(v => v >= lo && v <= hi); +} + +/** + * Compute robust stats for a metric array. + * @param {number[]} raw + */ +function robustStats(raw) { + const valid = raw.filter(v => v >= 0); + if (valid.length === 0) { return null; } + const cleaned = removeOutliers(valid); + if (cleaned.length === 0) { return null; } + const sorted = [...cleaned].sort((a, b) => a - b); + const med = median(sorted); + const p95 = sorted[Math.min(Math.floor(sorted.length * 0.95), sorted.length - 1)]; + const mean = sorted.reduce((a, b) => a + b, 0) / sorted.length; + const variance = sorted.reduce((a, b) => a + (b - mean) ** 2, 0) / sorted.length; + const stddev = Math.sqrt(variance); + const cv = mean > 0 ? stddev / mean : 0; + return { + median: Math.round(med * 100) / 100, + p95: Math.round(p95 * 100) / 100, + min: sorted[0], + max: sorted[sorted.length - 1], + mean: Math.round(mean * 100) / 100, + stddev: Math.round(stddev * 100) / 100, + cv: Math.round(cv * 1000) / 1000, + n: sorted.length, + nOutliers: valid.length - cleaned.length, + }; +} + +/** + * Simple linear regression slope (y per unit x). + * @param {number[]} values + */ +function linearRegressionSlope(values) { + const n = values.length; + if (n < 2) { return 0; } + let sumX = 0, sumY = 0, sumXY = 0, sumX2 = 0; + for (let i = 0; i < n; i++) { + sumX += i; + sumY += values[i]; + sumXY += i * values[i]; + sumX2 += i * i; + } + return (n * sumXY - sumX * sumY) / (n * sumX2 - sumX * sumX); +} + +/** + * Format a single metric line for console output. 
+ * @param {number[]} values + * @param {string} label + * @param {string} unit + */ +function summarize(values, label, unit) { + const s = robustStats(values); + if (!s) { return ` ${label}: (no data)`; } + const cv = s.cv > 0.15 ? ` cv=${(s.cv * 100).toFixed(0)}%⚠` : ` cv=${(s.cv * 100).toFixed(0)}%`; + const outliers = s.nOutliers > 0 ? ` (${s.nOutliers} outlier${s.nOutliers > 1 ? 's' : ''} removed)` : ''; + return ` ${label}: median=${s.median}${unit}, p95=${s.p95}${unit},${cv}${outliers} [n=${s.n}]`; +} + +/** + * Compute duration between two chat perf marks. + * @param {Array<{name: string, startTime: number}>} marks + * @param {string} from + * @param {string} to + */ +function markDuration(marks, from, to) { + const fromMark = marks.find(m => m.name.endsWith('/' + from)); + const toMark = marks.find(m => m.name.endsWith('/' + to)); + if (fromMark && toMark) { + return toMark.startTime - fromMark.startTime; + } + return -1; +} + +/** @type {Array<[string, string, string]>} */ +const METRIC_DEFS = [ + ['timeToFirstToken', 'timing', 'ms'], + ['timeToComplete', 'timing', 'ms'], + ['timeToUIUpdated', 'timing', 'ms'], + ['instructionCollectionTime', 'timing', 'ms'], + ['agentInvokeTime', 'timing', 'ms'], + ['heapDelta', 'memory', 'MB'], + ['layoutCount', 'rendering', ''], + ['recalcStyleCount', 'rendering', ''], + ['forcedReflowCount', 'rendering', ''], + ['longTaskCount', 'rendering', ''], +]; + +module.exports = { + ROOT, + DATA_DIR, + SCENARIOS, + METRIC_DEFS, + getElectronPath, + isVersionString, + resolveBuild, + preseedStorage, + buildEnv, + buildArgs, + writeSettings, + prepareRunDir, + median, + removeOutliers, + robustStats, + linearRegressionSlope, + summarize, + markDuration, + launchVSCode, +}; diff --git a/scripts/chat-perf/test-chat-mem-leaks.js b/scripts/chat-perf/test-chat-mem-leaks.js new file mode 100644 index 0000000000000..b4f588a6a4362 --- /dev/null +++ b/scripts/chat-perf/test-chat-mem-leaks.js @@ -0,0 +1,229 @@ 
+/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// @ts-check + +/** + * Chat memory leak checker. + * + * Sends multiple messages in a single VS Code session and tracks renderer + * heap and DOM node count after each message with forced GC. Uses linear + * regression to detect monotonic growth that indicates a memory leak. + * + * Usage: + * npm run perf:chat-leak # 10 messages, 2MB/msg threshold + * npm run perf:chat-leak -- --messages 20 # more messages for accuracy + * npm run perf:chat-leak -- --threshold 1 # stricter (1MB/msg) + * npm run perf:chat-leak -- --build 1.115.0 # test a specific build + */ + +const fs = require('fs'); +const path = require('path'); +const { + DATA_DIR, + resolveBuild, buildEnv, buildArgs, prepareRunDir, + linearRegressionSlope, launchVSCode, +} = require('./common/utils'); + +// -- CLI args ---------------------------------------------------------------- + +function parseArgs() { + const args = process.argv.slice(2); + const opts = { + messages: 10, + verbose: false, + /** @type {string | undefined} */ + build: undefined, + leakThresholdMB: 2, + }; + for (let i = 0; i < args.length; i++) { + switch (args[i]) { + case '--messages': case '-n': opts.messages = parseInt(args[++i], 10); break; + case '--verbose': opts.verbose = true; break; + case '--build': case '-b': opts.build = args[++i]; break; + case '--threshold': opts.leakThresholdMB = parseFloat(args[++i]); break; + case '--help': case '-h': + console.log([ + 'Chat memory leak checker', + '', + 'Options:', + ' --messages Number of messages to send (default: 10)', + ' --build Path to VS Code build or version to download', + ' --threshold Max per-message heap growth in MB 
(default: 2)', + ' --verbose Print per-message details', + ].join('\n')); + process.exit(0); + } + } + return opts; +} + +// -- Leak check -------------------------------------------------------------- + +/** + * @param {string} electronPath + * @param {{ url: string, requestCount: () => number, waitForRequests: (n: number, ms: number) => Promise, completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise }} mockServer + * @param {number} messageCount + * @param {boolean} verbose + */ +async function runLeakCheck(electronPath, mockServer, messageCount, verbose) { + const { userDataDir, extDir, logsDir } = prepareRunDir('leak-check', mockServer); + + const vscode = await launchVSCode( + electronPath, + buildArgs(userDataDir, extDir, logsDir), + buildEnv(mockServer), + { verbose }, + ); + const window = vscode.page; + + try { + await window.waitForSelector('.monaco-workbench', { timeout: 60_000 }); + + const cdp = await window.context().newCDPSession(window); + await cdp.send('HeapProfiler.enable'); + + // Open chat + const chatShortcut = process.platform === 'darwin' ? 
'Control+Meta+KeyI' : 'Control+Alt+KeyI'; + await window.keyboard.press(chatShortcut); + + const CHAT_VIEW = 'div[id="workbench.panel.chat"]'; + const chatEditorSel = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`; + await window.waitForSelector(CHAT_VIEW, { timeout: 15_000 }); + await window.waitForFunction( + (sel) => Array.from(document.querySelectorAll(sel)).some(el => el.getBoundingClientRect().width > 0), + chatEditorSel, { timeout: 15_000 }, + ); + + // Wait for extension activation + const reqsBefore = mockServer.requestCount(); + try { await mockServer.waitForRequests(reqsBefore + 4, 30_000); } catch { } + await new Promise(r => setTimeout(r, 3000)); + + /** @type {number[]} */ + const heapSamples = []; + /** @type {number[]} */ + const domNodeSamples = []; + + for (let i = 0; i < messageCount; i++) { + // Force GC and measure + await cdp.send('HeapProfiler.collectGarbage'); + await new Promise(r => setTimeout(r, 200)); + const heapInfo = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); + const heapMB = Math.round(heapInfo.usedSize / 1024 / 1024 * 100) / 100; + const domNodes = await window.evaluate(() => document.querySelectorAll('*').length); + heapSamples.push(heapMB); + domNodeSamples.push(domNodes); + + if (verbose) { + console.log(` [leak] Message ${i + 1}/${messageCount}: heap=${heapMB}MB, domNodes=${domNodes}`); + } + + // Focus and type + await window.click(chatEditorSel); + await new Promise(r => setTimeout(r, 200)); + + const inputSel = await window.evaluate((editorSel) => { + const ed = document.querySelector(editorSel); + if (!ed) { throw new Error('no editor'); } + return ed.querySelector('.native-edit-context') ? 
editorSel + ' .native-edit-context' : editorSel + ' textarea'; + }, chatEditorSel); + + const msg = `[scenario:text-only] Leak check message ${i + 1}`; + await window.evaluate(({ selector, text }) => { + // @ts-ignore — globalThis.driver is injected by --enable-smoke-test-driver + if (!globalThis.driver) { throw new Error('no driver'); } + // @ts-ignore + return globalThis.driver.typeInEditor(selector, text); + }, { selector: inputSel, text: msg }); + + const compBefore = mockServer.completionCount(); + await window.keyboard.press('Enter'); + try { await mockServer.waitForCompletion(compBefore + 1, 30_000); } catch { } + + // Wait for response + const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`; + await window.waitForFunction( + (sel) => { + const responses = document.querySelectorAll(sel); + if (responses.length === 0) { return false; } + return !responses[responses.length - 1].classList.contains('chat-response-loading'); + }, + responseSelector, { timeout: 30_000 }, + ); + await new Promise(r => setTimeout(r, 500)); + } + + // Final measurement + await cdp.send('HeapProfiler.collectGarbage'); + await new Promise(r => setTimeout(r, 200)); + const finalHeap = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); + heapSamples.push(Math.round(finalHeap.usedSize / 1024 / 1024 * 100) / 100); + domNodeSamples.push(await window.evaluate(() => document.querySelectorAll('*').length)); + + if (verbose) { + console.log(` [leak] Final: heap=${heapSamples[heapSamples.length - 1]}MB, domNodes=${domNodeSamples[domNodeSamples.length - 1]}`); + } + + return { + heapSamples, + domNodeSamples, + leakPerMessageMB: Math.round(linearRegressionSlope(heapSamples) * 100) / 100, + leakPerMessageNodes: Math.round(linearRegressionSlope(domNodeSamples)), + }; + } finally { + await vscode.close(); + } +} + +// -- Main -------------------------------------------------------------------- + +async function main() { + const opts = parseArgs(); + const 
electronPath = await resolveBuild(opts.build); + + if (!fs.existsSync(electronPath)) { + console.error(`Electron not found at: ${electronPath}`); + process.exit(1); + } + + const { startServer } = require('./common/mock-llm-server'); + const mockServer = await startServer(0); + + console.log(`[chat-perf] Leak check: ${opts.messages} messages, threshold ${opts.leakThresholdMB}MB/msg`); + console.log(`[chat-perf] Build: ${electronPath}`); + console.log(''); + + const result = await runLeakCheck(electronPath, mockServer, opts.messages, opts.verbose); + + console.log('[chat-perf] =================== Leak Check Results ==================='); + console.log(''); + console.log(` Heap samples (MB): ${result.heapSamples.join(' → ')}`); + console.log(` DOM node samples: ${result.domNodeSamples.join(' → ')}`); + console.log(''); + const totalHeapDelta = Math.round((result.heapSamples[result.heapSamples.length - 1] - result.heapSamples[0]) * 100) / 100; + console.log(` Heap growth: ${result.heapSamples[0]}MB → ${result.heapSamples[result.heapSamples.length - 1]}MB (delta${totalHeapDelta}MB total)`); + console.log(` Per-message heap growth: ${result.leakPerMessageMB}MB/msg`); + console.log(` Per-message DOM growth: ${result.leakPerMessageNodes} nodes/msg`); + console.log(''); + + // Write JSON + const jsonPath = path.join(DATA_DIR, 'chat-perf-leak-results.json'); + fs.writeFileSync(jsonPath, JSON.stringify({ timestamp: new Date().toISOString(), ...result }, null, 2)); + console.log(`[chat-perf] Results written to ${jsonPath}`); + + const leaked = result.leakPerMessageMB > opts.leakThresholdMB; + console.log(''); + if (leaked) { + console.log(`[chat-perf] LEAK DETECTED — ${result.leakPerMessageMB}MB/msg exceeds ${opts.leakThresholdMB}MB/msg threshold`); + } else { + console.log(`[chat-perf] No leak detected (${result.leakPerMessageMB}MB/msg < ${opts.leakThresholdMB}MB/msg)`); + } + + await mockServer.close(); + process.exit(leaked ? 
1 : 0); +} + +main().catch(err => { console.error(err); process.exit(1); }); diff --git a/scripts/chat-perf/test-chat-perf-regression.js b/scripts/chat-perf/test-chat-perf-regression.js new file mode 100644 index 0000000000000..e3f33a6665f84 --- /dev/null +++ b/scripts/chat-perf/test-chat-perf-regression.js @@ -0,0 +1,598 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// @ts-check + +/** + * Chat performance benchmark. + * + * Uses the real copilot extension with IS_SCENARIO_AUTOMATION=1 and a local + * mock LLM server. Measures the full stack: prompt building, context + * gathering, tool resolution, rendering, GC, and layout overhead. + * + * Usage: + * npm run perf:chat # all scenarios vs 1.115.0 + * npm run perf:chat -- --runs 10 # 10 runs per scenario + * npm run perf:chat -- --scenario text-only # single scenario + * npm run perf:chat -- --no-baseline # skip baseline comparison + * npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0 + */ + +const path = require('path'); +const fs = require('fs'); +const { + ROOT, DATA_DIR, SCENARIOS, METRIC_DEFS, + resolveBuild, buildEnv, buildArgs, prepareRunDir, + robustStats, summarize, markDuration, launchVSCode, +} = require('./common/utils'); + +// -- CLI args ---------------------------------------------------------------- + +function parseArgs() { + const args = process.argv.slice(2); + const opts = { + runs: 5, + verbose: false, + /** @type {string[]} */ + scenarios: [], + /** @type {string | undefined} */ + build: undefined, + /** @type {string | undefined} */ + baseline: undefined, + /** @type {string | undefined} */ + baselineBuild: '1.115.0', + saveBaseline: false, + threshold: 0.2, + }; + for 
(let i = 0; i < args.length; i++) { + switch (args[i]) { + case '--runs': opts.runs = parseInt(args[++i], 10); break; + case '--verbose': opts.verbose = true; break; + case '--scenario': case '-s': opts.scenarios.push(args[++i]); break; + case '--build': case '-b': opts.build = args[++i]; break; + case '--baseline': opts.baseline = args[++i]; break; + case '--baseline-build': opts.baselineBuild = args[++i]; break; + case '--no-baseline': opts.baselineBuild = undefined; break; + case '--save-baseline': opts.saveBaseline = true; break; + case '--threshold': opts.threshold = parseFloat(args[++i]); break; + case '--help': case '-h': + console.log([ + 'Chat performance benchmark', + '', + 'Options:', + ' --runs Number of runs per scenario (default: 5)', + ' --scenario Scenario to run (repeatable; default: all)', + ' --build Path to VS Code build, or a version to download', + ' (e.g. "1.110.0", "insiders", commit hash; default: local dev)', + ' --baseline Compare against a baseline JSON file', + ' --baseline-build Download a VS Code version and benchmark it as baseline', + ' (default: 1.115.0; accepts "insiders", "1.100.0", commit hash)', + ' --no-baseline Skip baseline comparison entirely', + ' --save-baseline Save results as the new baseline (requires --baseline )', + ' --threshold Regression threshold fraction (default: 0.2 = 20%)', + ' --verbose Print per-run details', + '', + 'Scenarios: ' + SCENARIOS.join(', '), + ].join('\n')); + process.exit(0); + } + } + if (opts.scenarios.length === 0) { + opts.scenarios = SCENARIOS; + } + return opts; +} + +// -- Metrics ----------------------------------------------------------------- + +/** + * @typedef {{ + * timeToUIUpdated: number, + * timeToFirstToken: number, + * timeToComplete: number, + * instructionCollectionTime: number, + * agentInvokeTime: number, + * heapUsedBefore: number, + * heapUsedAfter: number, + * heapDelta: number, + * majorGCs: number, + * minorGCs: number, + * gcDurationMs: number, + * layoutCount: 
number, + * recalcStyleCount: number, + * forcedReflowCount: number, + * longTaskCount: number, + * hasInternalMarks: boolean, + * responseHasContent: boolean, + * internalFirstToken: number, + * profilePath: string, + * tracePath: string, + * snapshotPath: string, + * }} RunMetrics + */ + +// -- Single run -------------------------------------------------------------- + +/** + * @param {string} electronPath + * @param {string} scenario + * @param {{ url: string, requestCount: () => number, waitForRequests: (n: number, ms: number) => Promise, completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise }} mockServer + * @param {boolean} verbose + * @param {string} runIndex + * @param {string} runDir - timestamped run directory for diagnostics + * @param {'baseline' | 'test'} role - whether this is a baseline or test run + * @returns {Promise} + */ +async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, runDir, role) { + const { userDataDir, extDir, logsDir } = prepareRunDir(runIndex, mockServer); + const isDevBuild = !electronPath.includes('.vscode-test'); + const buildLabel = isDevBuild ? 
'dev' : path.basename(path.dirname(path.dirname(path.dirname(electronPath)))).replace(/^vscode-/, ''); + + // Create a per-run diagnostics directory: /-/-/ + const runDiagDir = path.join(runDir, `${role}-${buildLabel}`, runIndex.replace(/^baseline-/, '')); + fs.mkdirSync(runDiagDir, { recursive: true }); + + const vscode = await launchVSCode( + electronPath, + buildArgs(userDataDir, extDir, logsDir, { isDevBuild }), + buildEnv(mockServer, { isDevBuild }), + { verbose }, + ); + const window = vscode.page; + + try { + await window.waitForSelector('.monaco-workbench', { timeout: 60_000 }); + + const cdp = await window.context().newCDPSession(window); + await cdp.send('Performance.enable'); + const heapBefore = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); + + await cdp.send('Tracing.start', { + traceConfig: { + includedCategories: ['v8.gc', 'devtools.timeline'], + recordMode: 'recordContinuously', + } + }); + const metricsBefore = await cdp.send('Performance.getMetrics'); + + // Open chat + const chatShortcut = process.platform === 'darwin' ? 
'Control+Meta+KeyI' : 'Control+Alt+KeyI'; + await window.keyboard.press(chatShortcut); + + const CHAT_VIEW = 'div[id="workbench.panel.chat"]'; + const chatEditorSel = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`; + + await window.waitForSelector(CHAT_VIEW, { timeout: 15_000 }); + await window.waitForFunction( + (selector) => Array.from(document.querySelectorAll(selector)).some(el => { + const rect = el.getBoundingClientRect(); + return rect.width > 0 && rect.height > 0; + }), + chatEditorSel, { timeout: 15_000 }, + ); + + // Dismiss dialogs + const dismissDialog = async () => { + for (const sel of ['.chat-setup-dialog', '.dialog-shadow', '.monaco-dialog-box']) { + const el = await window.$(sel); + if (el) { await window.keyboard.press('Escape'); await new Promise(r => setTimeout(r, 500)); break; } + } + }; + await dismissDialog(); + + // Wait for extension activation + const reqsBefore = mockServer.requestCount(); + try { await mockServer.waitForRequests(reqsBefore + 4, 30_000); } catch { } + if (verbose) { + console.log(` [debug] Extension active (${mockServer.requestCount() - reqsBefore} new requests)`); + } + + // Wait for model resolution + await new Promise(r => setTimeout(r, 3000)); + await dismissDialog(); + + // Focus input + await window.click(chatEditorSel); + const focusStart = Date.now(); + while (Date.now() - focusStart < 5_000) { + const focused = await window.evaluate((sel) => { + const el = document.querySelector(sel); + return el && (el.classList.contains('focused') || el.contains(document.activeElement)); + }, chatEditorSel).catch(() => false); + if (focused) { break; } + await new Promise(r => setTimeout(r, 50)); + } + + // Type message — use the smoke-test driver's typeInEditor when available + // (dev builds), fall back to pressSequentially for stable/insiders builds. 
+ const chatMessage = `[scenario:${scenario}] Explain how this code works`; + const actualInputSelector = await window.evaluate((editorSel) => { + const editor = document.querySelector(editorSel); + if (!editor) { throw new Error('Chat editor not found'); } + return editor.querySelector('.native-edit-context') ? editorSel + ' .native-edit-context' : editorSel + ' textarea'; + }, chatEditorSel); + + const hasDriver = await window.evaluate(() => + // @ts-ignore + !!globalThis.driver?.typeInEditor + ).catch(() => false); + + if (hasDriver) { + await window.evaluate(({ selector, text }) => { + // @ts-ignore + return globalThis.driver.typeInEditor(selector, text); + }, { selector: actualInputSelector, text: chatMessage }); + } else { + // Fallback: click the input element and use pressSequentially + await window.click(actualInputSelector); + await new Promise(r => setTimeout(r, 200)); + await window.locator(actualInputSelector).pressSequentially(chatMessage, { delay: 0 }); + } + + // Start CPU profiler to capture call stacks during the interaction + await cdp.send('Profiler.enable'); + await cdp.send('Profiler.start'); + + // Start polling for code/chat/* perf marks inside the renderer. + // The marks are emitted during the request and cleared immediately + // after RequestComplete in the same microtask. We poll rapidly from + // the page context to capture them before they're cleared. + await window.evaluate(() => { + // @ts-ignore + globalThis._chatPerfCapture = []; + // @ts-ignore + globalThis._chatPerfPollId = setInterval(() => { + // @ts-ignore + const marks = globalThis.MonacoPerformanceMarks?.getMarks() ?? 
[]; + for (const m of marks) { + // @ts-ignore + if (m.name.startsWith('code/chat/') && !globalThis._chatPerfCapture.some(c => c.name === m.name)) { + // @ts-ignore + globalThis._chatPerfCapture.push({ name: m.name, startTime: m.startTime }); + } + } + }, 16); // poll every frame (~60fps) + }); + + // Submit + const completionsBefore = mockServer.completionCount(); + const submitTime = Date.now(); + await window.keyboard.press('Enter'); + + // Wait for mock server to serve the response + try { await mockServer.waitForCompletion(completionsBefore + 1, 60_000); } catch { } + const firstResponseTime = Date.now(); + + // Wait for DOM response to settle + await dismissDialog(); + const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`; + await window.waitForFunction( + (sel) => { + const responses = document.querySelectorAll(sel); + if (responses.length === 0) { return false; } + return !responses[responses.length - 1].classList.contains('chat-response-loading'); + }, + responseSelector, { timeout: 30_000 }, + ); + const responseCompleteTime = Date.now(); + + // Stop CPU profiler and save the profile + const { profile } = /** @type {any} */ (await cdp.send('Profiler.stop')); + const profilePath = path.join(runDiagDir, 'profile.cpuprofile'); + fs.writeFileSync(profilePath, JSON.stringify(profile)); + if (verbose) { + console.log(` [debug] CPU profile saved to ${profilePath}`); + } + + const responseInfo = await window.evaluate((sel) => { + const responses = document.querySelectorAll(sel); + const last = responses[responses.length - 1]; + if (!last) { return { hasContent: false, text: '' }; } + const text = last.textContent || ''; + return { hasContent: text.trim().length > 0, text: text.substring(0, 200) }; + }, responseSelector); + + if (verbose) { + console.log(` [debug] Response content (first 200 chars): ${responseInfo.text}`); + console.log(` [debug] Client-side timing: firstResponse=${firstResponseTime - submitTime}ms, 
complete=${responseCompleteTime - submitTime}ms`); + } + + // Collect perf marks from our polling capture and stop the poll + const chatMarks = await window.evaluate(() => { + // @ts-ignore + clearInterval(globalThis._chatPerfPollId); + // @ts-ignore + const marks = globalThis._chatPerfCapture ?? []; + // @ts-ignore + delete globalThis._chatPerfCapture; + // @ts-ignore + delete globalThis._chatPerfPollId; + return marks; + }); + if (verbose && chatMarks.length > 0) { + console.log(` [debug] chatMarks (${chatMarks.length}): ${chatMarks.map((/** @type {any} */ m) => m.name.split('/').slice(-1)[0]).join(', ')}`); + } + + const heapAfter = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); + /** @type {Array} */ + const traceEvents = []; + cdp.on('Tracing.dataCollected', (/** @type {any} */ data) => { traceEvents.push(...data.value); }); + await cdp.send('Tracing.end'); + await new Promise(r => setTimeout(r, 500)); + const metricsAfter = await cdp.send('Performance.getMetrics'); + + // Save performance trace (Chrome DevTools format) + const tracePath = path.join(runDiagDir, 'trace.json'); + fs.writeFileSync(tracePath, JSON.stringify({ traceEvents })); + + // Take heap snapshot + const snapshotPath = path.join(runDiagDir, 'heap.heapsnapshot'); + await cdp.send('HeapProfiler.enable'); + const snapshotChunks = /** @type {string[]} */ ([]); + cdp.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => { + snapshotChunks.push(params.chunk); + }); + await cdp.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false }); + fs.writeFileSync(snapshotPath, snapshotChunks.join('')); + + // Parse timing — always use client-side Date.now() for timeToFirstToken + // and timeToComplete so cross-build comparisons use the same method. + // Internal marks are reported separately for diagnostics. 
+ const timeToUIUpdated = markDuration(chatMarks, 'request/start', 'request/uiUpdated'); + const timeToFirstToken = firstResponseTime - submitTime; + const timeToComplete = responseCompleteTime - submitTime; + const instructionCollectionTime = markDuration(chatMarks, 'request/willCollectInstructions', 'request/didCollectInstructions'); + const agentInvokeTime = markDuration(chatMarks, 'agent/willInvoke', 'agent/didInvoke'); + // Internal-mark TTFT (more precise, but only available on dev builds) + const internalFirstToken = markDuration(chatMarks, 'request/start', 'request/firstToken'); + + // Parse GC/long tasks + let majorGCs = 0, minorGCs = 0, gcDurationMs = 0; + for (const event of traceEvents) { + if (event.cat === 'v8.gc' || event.name === 'V8.GCFinalizeMC' || event.name === 'V8.GCScavenger') { + if (event.name?.includes('MC') || event.name?.includes('Major') || event.name === 'MajorGC') { majorGCs++; } + else if (event.name?.includes('Scavenger') || event.name?.includes('Minor') || event.name === 'MinorGC') { minorGCs++; } + if (event.dur) { gcDurationMs += event.dur / 1000; } + } + } + let longTaskCount = 0; + for (const event of traceEvents) { + if (event.name === 'RunTask' && event.dur && event.dur > 50_000) { longTaskCount++; } + } + + /** @param {any} r @param {string} name */ + function getMetric(r, name) { + const e = r.metrics?.find((/** @type {any} */ m) => m.name === name); + return e ? 
e.value : 0; + } + + return { + timeToUIUpdated, timeToFirstToken, timeToComplete, instructionCollectionTime, agentInvokeTime, + heapUsedBefore: Math.round(heapBefore.usedSize / 1024 / 1024), + heapUsedAfter: Math.round(heapAfter.usedSize / 1024 / 1024), + heapDelta: Math.round((heapAfter.usedSize - heapBefore.usedSize) / 1024 / 1024), + majorGCs, minorGCs, + gcDurationMs: Math.round(gcDurationMs * 100) / 100, + layoutCount: getMetric(metricsAfter, 'LayoutCount') - getMetric(metricsBefore, 'LayoutCount'), + recalcStyleCount: getMetric(metricsAfter, 'RecalcStyleCount') - getMetric(metricsBefore, 'RecalcStyleCount'), + forcedReflowCount: getMetric(metricsAfter, 'ForcedStyleRecalcs') - getMetric(metricsBefore, 'ForcedStyleRecalcs'), + longTaskCount, + hasInternalMarks: chatMarks.length > 0, + responseHasContent: responseInfo.hasContent, + internalFirstToken, + profilePath, + tracePath, + snapshotPath, + }; + } finally { + await vscode.close(); + } +} + +// -- Main -------------------------------------------------------------------- + +async function main() { + const opts = parseArgs(); + const electronPath = await resolveBuild(opts.build); + + if (!fs.existsSync(electronPath)) { + console.error(`Electron not found at: ${electronPath}`); + console.error('Run "node build/lib/preLaunch.ts" first, or pass --build '); + process.exit(1); + } + + const { startServer } = require('./common/mock-llm-server'); + const mockServer = await startServer(0); + console.log(`[chat-perf] Mock LLM server: ${mockServer.url}`); + + // Create a timestamped run directory for all output + const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); + const runDir = path.join(DATA_DIR, runTimestamp); + fs.mkdirSync(runDir, { recursive: true }); + console.log(`[chat-perf] Output: ${runDir}`); + + // -- Baseline build -------------------------------------------------- + if (opts.baselineBuild) { + const baselineJsonPath = path.join(runDir, 
`baseline-${opts.baselineBuild}.json`); + const cachedPath = path.join(DATA_DIR, `baseline-${opts.baselineBuild}.json`); + const cachedBaseline = fs.existsSync(cachedPath) + ? JSON.parse(fs.readFileSync(cachedPath, 'utf-8')) + : null; + + if (cachedBaseline?.baselineBuildVersion === opts.baselineBuild) { + console.log(`[chat-perf] Using cached baseline for ${opts.baselineBuild}`); + fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2)); + opts.baseline = baselineJsonPath; + } else { + const baselineExePath = await resolveBuild(opts.baselineBuild); + console.log(`[chat-perf] Benchmarking baseline build (${opts.baselineBuild})...`); + /** @type {Record} */ + const baselineResults = {}; + for (const scenario of opts.scenarios) { + /** @type {RunMetrics[]} */ + const results = []; + for (let i = 0; i < opts.runs; i++) { + try { results.push(await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline')); } + catch (err) { console.error(`[chat-perf] Baseline run ${i + 1} failed: ${err}`); } + } + if (results.length > 0) { baselineResults[scenario] = results; } + } + const baselineReport = { + timestamp: new Date().toISOString(), + baselineBuildVersion: opts.baselineBuild, + platform: process.platform, + runsPerScenario: opts.runs, + scenarios: /** @type {Record} */ ({}), + }; + for (const [scenario, results] of Object.entries(baselineResults)) { + const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {} }); + for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } + baselineReport.scenarios[scenario] = sd; + } + fs.writeFileSync(baselineJsonPath, JSON.stringify(baselineReport, null, 2)); + // Cache at the top level for reuse across runs + fs.writeFileSync(cachedPath, JSON.stringify(baselineReport, null, 2)); + opts.baseline = baselineJsonPath; + } + console.log(''); + } + + // -- Run 
benchmarks -------------------------------------------------- + console.log(`[chat-perf] Electron: ${electronPath}`); + console.log(`[chat-perf] Runs per scenario: ${opts.runs}`); + console.log(`[chat-perf] Scenarios: ${opts.scenarios.join(', ')}`); + console.log(''); + + /** @type {Record} */ + const allResults = {}; + let anyFailed = false; + + for (const scenario of opts.scenarios) { + console.log(`[chat-perf] === Scenario: ${scenario} ===`); + /** @type {RunMetrics[]} */ + const results = []; + for (let i = 0; i < opts.runs; i++) { + console.log(`[chat-perf] Run ${i + 1}/${opts.runs}...`); + try { + const metrics = await runOnce(electronPath, scenario, mockServer, opts.verbose, `${scenario}-${i}`, runDir, 'test'); + results.push(metrics); + if (opts.verbose) { + const src = metrics.hasInternalMarks ? 'internal' : 'client-side'; + console.log(` [${src}] firstToken=${metrics.timeToFirstToken}ms, complete=${metrics.timeToComplete}ms, heap=delta${metrics.heapDelta}MB, longTasks=${metrics.longTaskCount}${metrics.hasInternalMarks ? 
`, internalTTFT=${metrics.internalFirstToken}ms` : ''}`); + } + } catch (err) { console.error(` Run ${i + 1} failed: ${err}`); } + } + if (results.length === 0) { console.error(`[chat-perf] All runs failed for scenario: ${scenario}`); anyFailed = true; } + else { allResults[scenario] = results; } + console.log(''); + } + + // -- Summary --------------------------------------------------------- + console.log('[chat-perf] ======================= Summary ======================='); + for (const [scenario, results] of Object.entries(allResults)) { + console.log(''); + console.log(` -- ${scenario} (${results.length} runs) --`); + console.log(''); + console.log(' Timing:'); + console.log(summarize(results.map(r => r.timeToFirstToken), ' Request → First token ', 'ms')); + console.log(summarize(results.map(r => r.timeToComplete), ' Request → Complete ', 'ms')); + console.log(''); + console.log(' Rendering:'); + console.log(summarize(results.map(r => r.layoutCount), ' Layouts ', '')); + console.log(summarize(results.map(r => r.recalcStyleCount), ' Style recalcs ', '')); + console.log(summarize(results.map(r => r.forcedReflowCount), ' Forced reflows ', '')); + console.log(summarize(results.map(r => r.longTaskCount), ' Long tasks (>50ms) ', '')); + console.log(''); + console.log(' Memory:'); + console.log(summarize(results.map(r => r.heapDelta), ' Heap delta ', 'MB')); + console.log(summarize(results.map(r => r.gcDurationMs), ' GC duration ', 'ms')); + } + + // -- JSON output ----------------------------------------------------- + const jsonPath = path.join(runDir, 'results.json'); + const jsonReport = { timestamp: new Date().toISOString(), platform: process.platform, runsPerScenario: opts.runs, scenarios: /** @type {Record} */ ({}) }; + for (const [scenario, results] of Object.entries(allResults)) { + const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, rawRuns: results }); + for (const [metric, group] of METRIC_DEFS) { 
sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } + jsonReport.scenarios[scenario] = sd; + } + fs.writeFileSync(jsonPath, JSON.stringify(jsonReport, null, 2)); + console.log(''); + console.log(`[chat-perf] Results written to ${jsonPath}`); + + // -- Save baseline --------------------------------------------------- + if (opts.saveBaseline) { + if (!opts.baseline) { console.error('[chat-perf] --save-baseline requires --baseline '); process.exit(1); } + fs.writeFileSync(opts.baseline, JSON.stringify(jsonReport, null, 2)); + console.log(`[chat-perf] Baseline saved to ${opts.baseline}`); + } + + // -- Baseline comparison --------------------------------------------- + let regressionFound = false; + if (opts.baseline && fs.existsSync(opts.baseline)) { + const baseline = JSON.parse(fs.readFileSync(opts.baseline, 'utf-8')); + console.log(''); + console.log(`[chat-perf] =========== Baseline Comparison (threshold: ${(opts.threshold * 100).toFixed(0)}%) ===========`); + console.log(`[chat-perf] Baseline: ${baseline.baselineBuildVersion || baseline.timestamp}`); + console.log(''); + + // Metrics that trigger regression failure when they exceed the threshold + const regressionMetrics = [ + // [metric, group, unit] + ['timeToFirstToken', 'timing', 'ms'], + ['timeToComplete', 'timing', 'ms'], + ['layoutCount', 'rendering', ''], + ['recalcStyleCount', 'rendering', ''], + ['forcedReflowCount', 'rendering', ''], + ['longTaskCount', 'rendering', ''], + ]; + // Informational metrics — shown in comparison but don't trigger failure + const infoMetrics = [ + ['heapDelta', 'memory', 'MB'], + ['gcDurationMs', 'memory', 'ms'], + ]; + + for (const scenario of Object.keys(jsonReport.scenarios)) { + const current = jsonReport.scenarios[scenario]; + const base = baseline.scenarios?.[scenario]; + if (!base) { console.log(` ${scenario}: (no baseline)`); continue; } + + /** @type {string[]} */ + const diffs = []; + let scenarioRegression = false; + + for (const 
[metric, group, unit] of regressionMetrics) { + const cur = current[group]?.[metric]; + const bas = base[group]?.[metric]; + if (!cur || !bas || !bas.median) { continue; } + const change = (cur.median - bas.median) / bas.median; + const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`; + const flag = change > opts.threshold ? ' ← REGRESSION' : ''; + if (change > opts.threshold) { scenarioRegression = true; regressionFound = true; } + diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct})${flag}`); + } + for (const [metric, group, unit] of infoMetrics) { + const cur = current[group]?.[metric]; + const bas = base[group]?.[metric]; + if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; } + const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0; + const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`; + diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct}) [info]`); + } + console.log(` ${scenario}: ${scenarioRegression ? 'FAIL' : 'OK'}`); + diffs.forEach(d => console.log(d)); + } + + console.log(''); + console.log(regressionFound + ? 
`[chat-perf] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold` + : `[chat-perf] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline`); + } + + if (anyFailed || regressionFound) { process.exit(1); } + await mockServer.close(); +} + +main().catch(err => { console.error(err); process.exit(1); }); From a4a562b6f046d3cc813ad2032c14785c07ea6787 Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Mon, 13 Apr 2026 19:30:53 -0700 Subject: [PATCH 02/13] PR --- .github/skills/chat-perf/SKILL.md | 38 +- .github/workflows/chat-perf.yml | 144 +++++++ scripts/chat-perf/common/utils.js | 104 +++++ .../chat-perf/test-chat-perf-regression.js | 402 +++++++++++++++++- 4 files changed, 663 insertions(+), 25 deletions(-) create mode 100644 .github/workflows/chat-perf.yml diff --git a/.github/skills/chat-perf/SKILL.md b/.github/skills/chat-perf/SKILL.md index fd45b8b6b2a7c..a110cafc2edbb 100644 --- a/.github/skills/chat-perf/SKILL.md +++ b/.github/skills/chat-perf/SKILL.md @@ -28,7 +28,7 @@ npm run perf:chat-leak -- --messages 20 --verbose ## Perf regression test -**Script:** `scripts/chat-perf/test-chat-perf-regression.js` +**Script:** `scripts/chat-perf/test-chat-perf-regression.js` **npm:** `npm run perf:chat` Launches VS Code via Playwright Electron, opens the chat panel, sends a message with a mock LLM response, and measures timing, layout, and rendering metrics. By default, downloads VS Code 1.115.0 as a baseline, benchmarks it, then benchmarks the local dev build and compares. @@ -42,6 +42,7 @@ Launches VS Code via Playwright Electron, opens the chat panel, sends a message | `--build ` | local dev | Build to test. Accepts path or version (`1.110.0`, `insiders`). | | `--baseline-build ` | `1.115.0` | Version to download and compare against. | | `--no-baseline` | — | Skip baseline comparison entirely. | +| `--resume ` | — | Resume a previous run, adding more iterations to increase confidence. 
| | `--threshold ` | `0.2` | Regression threshold (0.2 = flag if 20% slower). | | `--verbose` | — | Print per-run details including response content. | @@ -52,10 +53,37 @@ Launches VS Code via Playwright Electron, opens the chat panel, sends a message npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0 --runs 5 ``` +### Resuming a run for more confidence + +When results exceed the threshold but aren't statistically significant, the tool prints a `--resume` hint. Use it to add more iterations to an existing run: + +```bash +# Initial run with 3 iterations — may be inconclusive: +npm run perf:chat -- --scenario text-only --runs 3 + +# Add 3 more runs to the same results file (both test + baseline): +npm run perf:chat -- --resume .chat-perf-data/2026-04-14T02-15-14/results.json --runs 3 + +# Keep adding until confidence is reached: +npm run perf:chat -- --resume .chat-perf-data/2026-04-14T02-15-14/results.json --runs 5 +``` + +`--resume` loads the previous `results.json` and its associated `baseline-*.json`, runs N more iterations for both builds, merges rawRuns, recomputes stats, and re-runs the comparison. The updated files are written back in-place. You can resume multiple times — samples accumulate. + +### Statistical significance + +Regression detection uses **Welch's t-test** to avoid false positives from noisy measurements. A metric is only flagged as `REGRESSION` when it both exceeds the threshold AND is statistically significant (p < 0.05). Otherwise it's reported as `(likely noise — p=X, not significant)`. + +With typical variance (cv ≈ 20%), you need: +- **n ≥ 5** per build to detect a 35% regression at 95% confidence +- **n ≥ 10** per build to detect a 20% regression reliably + +Confidence levels reported: `high` (p < 0.01), `medium` (p < 0.05), `low` (p < 0.1), `none`. 
+ ### Exit codes -- `0` — all metrics within threshold -- `1` — regression detected or runs failed +- `0` — all metrics within threshold, or exceeding threshold but not statistically significant +- `1` — statistically significant regression detected, or all runs failed ### Scenarios @@ -80,11 +108,11 @@ npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0 --runs 5 ### Statistics -Results use **IQR-based outlier removal** and **median** (not mean) to handle startup jitter. The **coefficient of variation (cv)** is reported — under 15% is stable, over 15% gets a ⚠ warning. Use 5+ runs to get stable results. +Results use **IQR-based outlier removal** and **median** (not mean) to handle startup jitter. The **coefficient of variation (cv)** is reported — under 15% is stable, over 15% gets a ⚠ warning. Baseline comparison uses **Welch's t-test** on raw run values to determine statistical significance before flagging regressions. Use 5+ runs to get stable results. ## Memory leak check -**Script:** `scripts/chat-perf/test-chat-mem-leaks.js` +**Script:** `scripts/chat-perf/test-chat-mem-leaks.js` **npm:** `npm run perf:chat-leak` Launches one VS Code session, sends N messages sequentially, forces GC between each, and measures renderer heap and DOM node count. Uses **linear regression** on the samples to compute per-message growth rate, which is compared against a threshold. diff --git a/.github/workflows/chat-perf.yml b/.github/workflows/chat-perf.yml new file mode 100644 index 0000000000000..adebefb98479b --- /dev/null +++ b/.github/workflows/chat-perf.yml @@ -0,0 +1,144 @@ +name: Chat Performance Comparison + +on: + workflow_dispatch: + inputs: + baseline_commit: + description: 'Baseline commit SHA or version (e.g. "1.115.0", "abc1234")' + required: true + type: string + test_commit: + description: 'Test commit SHA or version (e.g. 
"main", "abc1234")' + required: true + type: string + runs: + description: 'Runs per scenario (default: 7 for statistical significance)' + required: false + type: number + default: 7 + scenarios: + description: 'Comma-separated scenario list (empty = all)' + required: false + type: string + default: '' + threshold: + description: 'Regression threshold fraction (default: 0.2 = 20%)' + required: false + type: number + default: 0.2 + +permissions: + contents: read + +concurrency: + group: chat-perf-${{ github.run_id }} + cancel-in-progress: true + +jobs: + chat-perf: + name: Chat Perf – ${{ inputs.baseline_commit }} vs ${{ inputs.test_commit }} + runs-on: ubuntu-latest + timeout-minutes: 120 + steps: + - name: Checkout test commit + uses: actions/checkout@v6 + with: + ref: ${{ inputs.test_commit }} + + - name: Setup Node.js + uses: actions/setup-node@v6 + with: + node-version-file: .nvmrc + + - name: Install system dependencies + run: | + sudo apt update -y + sudo apt install -y \ + build-essential pkg-config \ + libx11-dev libx11-xcb-dev libxkbfile-dev \ + libnotify-bin libkrb5-dev \ + xvfb sqlite3 \ + libnss3 libatk1.0-0 libatk-bridge2.0-0 \ + libcups2 libdrm2 libxcomposite1 libxdamage1 \ + libxrandr2 libgbm1 libpango-1.0-0 libcairo2 \ + libasound2 libxshmfence1 libgtk-3-0 + + - name: Install dependencies + run: npm ci + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Install build dependencies + run: npm ci + working-directory: build + + - name: Transpile source + run: npm run transpile-client + + - name: Install Playwright Chromium + run: npx playwright install chromium + + - name: Run chat perf comparison + id: perf + run: | + SCENARIO_ARGS="" + if [[ -n "${{ inputs.scenarios }}" ]]; then + IFS=',' read -ra SCENS <<< "${{ inputs.scenarios }}" + for s in "${SCENS[@]}"; do + SCENARIO_ARGS="$SCENARIO_ARGS --scenario $(echo "$s" | xargs)" + done + fi + + xvfb-run node scripts/chat-perf/test-chat-perf-regression.js \ + --baseline-build "${{ 
inputs.baseline_commit }}" \ + --build "${{ inputs.test_commit }}" \ + --runs ${{ inputs.runs }} \ + --threshold ${{ inputs.threshold }} \ + --ci \ + $SCENARIO_ARGS \ + 2>&1 | tee perf-output.log + + # Extract exit code from the script (tee masks it) + exit ${PIPESTATUS[0]} + continue-on-error: true + + - name: Write job summary + if: always() + run: | + if [[ -f .chat-perf-data/ci-summary.md ]]; then + cat .chat-perf-data/ci-summary.md >> "$GITHUB_STEP_SUMMARY" + else + echo "⚠️ No summary file generated. Check perf-output.log artifact." >> "$GITHUB_STEP_SUMMARY" + fi + + - name: Zip diagnostic outputs + if: always() + run: | + # Find the most recent timestamped run directory + RUN_DIR=$(ls -td .chat-perf-data/20*/ 2>/dev/null | head -1) + if [[ -n "$RUN_DIR" ]]; then + # Zip everything: results JSON, CPU profiles, traces, heap snapshots + cd .chat-perf-data + zip -r ../chat-perf-artifacts.zip \ + "$(basename "$RUN_DIR")"/ \ + ci-summary.md \ + baseline-*.json \ + 2>/dev/null || true + cd .. + fi + + - name: Upload perf artifacts + if: always() + uses: actions/upload-artifact@v7 + with: + name: chat-perf-${{ inputs.baseline_commit }}-vs-${{ inputs.test_commit }} + path: | + chat-perf-artifacts.zip + perf-output.log + retention-days: 30 + + - name: Fail on regression + if: steps.perf.outcome == 'failure' + run: | + echo "::error::Chat performance regression detected. See job summary for details." + exit 1 diff --git a/scripts/chat-perf/common/utils.js b/scripts/chat-perf/common/utils.js index 671b660738ce4..6e8b60ce3d888 100644 --- a/scripts/chat-perf/common/utils.js +++ b/scripts/chat-perf/common/utils.js @@ -399,6 +399,108 @@ function removeOutliers(values) { return sorted.filter(v => v >= lo && v <= hi); } +/** + * Regularized incomplete beta function I_x(a, b) via continued fraction. + * Used for computing t-distribution CDF / p-values. 
+ * @param {number} x + * @param {number} a + * @param {number} b + * @returns {number} + */ +function betaIncomplete(x, a, b) { + if (x <= 0) { return 0; } + if (x >= 1) { return 1; } + // Use symmetry relation when x > (a+1)/(a+b+2) for better convergence + if (x > (a + 1) / (a + b + 2)) { + return 1 - betaIncomplete(1 - x, b, a); + } + // Log-beta via Stirling: lnBeta(a,b) = lnGamma(a)+lnGamma(b)-lnGamma(a+b) + const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b); + const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta) / a; + // Lentz's continued fraction + const maxIter = 200; + const eps = 1e-14; + let c = 1, d = 1 - (a + b) * x / (a + 1); + if (Math.abs(d) < eps) { d = eps; } + d = 1 / d; + let result = d; + for (let m = 1; m <= maxIter; m++) { + // Even step + let num = m * (b - m) * x / ((a + 2 * m - 1) * (a + 2 * m)); + d = 1 + num * d; if (Math.abs(d) < eps) { d = eps; } d = 1 / d; + c = 1 + num / c; if (Math.abs(c) < eps) { c = eps; } + result *= d * c; + // Odd step + num = -(a + m) * (a + b + m) * x / ((a + 2 * m) * (a + 2 * m + 1)); + d = 1 + num * d; if (Math.abs(d) < eps) { d = eps; } d = 1 / d; + c = 1 + num / c; if (Math.abs(c) < eps) { c = eps; } + const delta = d * c; + result *= delta; + if (Math.abs(delta - 1) < eps) { break; } + } + return front * result; +} + +/** + * Log-gamma via Lanczos approximation. 
+ * @param {number} z + * @returns {number} + */ +function lnGamma(z) { + const g = 7; + const coef = [0.99999999999980993, 676.5203681218851, -1259.1392167224028, + 771.32342877765313, -176.61502916214059, 12.507343278686905, + -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7]; + if (z < 0.5) { + return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z); + } + z -= 1; + let x = coef[0]; + for (let i = 1; i < g + 2; i++) { x += coef[i] / (z + i); } + const t = z + g + 0.5; + return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(x); +} + +/** + * Two-tailed p-value from t-distribution. + * @param {number} t - t-statistic + * @param {number} df - degrees of freedom + * @returns {number} + */ +function tDistPValue(t, df) { + const x = df / (df + t * t); + return betaIncomplete(x, df / 2, 0.5); +} + +/** + * Welch's t-test for two independent samples (unequal variance). + * @param {number[]} a - Sample 1 (e.g., baseline values) + * @param {number[]} b - Sample 2 (e.g., current values) + * @returns {{ t: number, df: number, pValue: number, significant: boolean, confidence: string } | null} + */ +function welchTTest(a, b) { + if (a.length < 2 || b.length < 2) { return null; } + const meanA = a.reduce((s, v) => s + v, 0) / a.length; + const meanB = b.reduce((s, v) => s + v, 0) / b.length; + const varA = a.reduce((s, v) => s + (v - meanA) ** 2, 0) / (a.length - 1); + const varB = b.reduce((s, v) => s + (v - meanB) ** 2, 0) / (b.length - 1); + const seA = varA / a.length; + const seB = varB / b.length; + const seDiff = Math.sqrt(seA + seB); + if (seDiff === 0) { return null; } + const t = (meanB - meanA) / seDiff; + // Welch-Satterthwaite degrees of freedom + const df = (seA + seB) ** 2 / ((seA ** 2) / (a.length - 1) + (seB ** 2) / (b.length - 1)); + const pValue = tDistPValue(t, df); + const significant = pValue < 0.05; + let confidence; + if (pValue < 0.01) { confidence = 'high'; } + else if (pValue < 0.05) { confidence = 
'medium'; } + else if (pValue < 0.1) { confidence = 'low'; } + else { confidence = 'none'; } + return { t: Math.round(t * 100) / 100, df: Math.round(df * 10) / 10, pValue: Math.round(pValue * 1000) / 1000, significant, confidence }; +} + /** * Compute robust stats for a metric array. * @param {number[]} raw @@ -482,6 +584,7 @@ const METRIC_DEFS = [ ['instructionCollectionTime', 'timing', 'ms'], ['agentInvokeTime', 'timing', 'ms'], ['heapDelta', 'memory', 'MB'], + ['gcDurationMs', 'memory', 'ms'], ['layoutCount', 'rendering', ''], ['recalcStyleCount', 'rendering', ''], ['forcedReflowCount', 'rendering', ''], @@ -504,6 +607,7 @@ module.exports = { median, removeOutliers, robustStats, + welchTTest, linearRegressionSlope, summarize, markDuration, diff --git a/scripts/chat-perf/test-chat-perf-regression.js b/scripts/chat-perf/test-chat-perf-regression.js index e3f33a6665f84..5abf6decd2e3f 100644 --- a/scripts/chat-perf/test-chat-perf-regression.js +++ b/scripts/chat-perf/test-chat-perf-regression.js @@ -18,6 +18,7 @@ * npm run perf:chat -- --scenario text-only # single scenario * npm run perf:chat -- --no-baseline # skip baseline comparison * npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0 + * npm run perf:chat -- --resume .chat-perf-data/2026-04-14/results.json --runs 3 */ const path = require('path'); @@ -25,7 +26,7 @@ const fs = require('fs'); const { ROOT, DATA_DIR, SCENARIOS, METRIC_DEFS, resolveBuild, buildEnv, buildArgs, prepareRunDir, - robustStats, summarize, markDuration, launchVSCode, + robustStats, welchTTest, summarize, markDuration, launchVSCode, } = require('./common/utils'); // -- CLI args ---------------------------------------------------------------- @@ -35,6 +36,7 @@ function parseArgs() { const opts = { runs: 5, verbose: false, + ci: false, /** @type {string[]} */ scenarios: [], /** @type {string | undefined} */ @@ -45,6 +47,8 @@ function parseArgs() { baselineBuild: '1.115.0', saveBaseline: false, threshold: 0.2, + /** @type {string | 
undefined} */ + resume: undefined, }; for (let i = 0; i < args.length; i++) { switch (args[i]) { @@ -57,6 +61,8 @@ function parseArgs() { case '--no-baseline': opts.baselineBuild = undefined; break; case '--save-baseline': opts.saveBaseline = true; break; case '--threshold': opts.threshold = parseFloat(args[++i]); break; + case '--resume': opts.resume = args[++i]; break; + case '--ci': opts.ci = true; break; case '--help': case '-h': console.log([ 'Chat performance benchmark', @@ -71,7 +77,10 @@ function parseArgs() { ' (default: 1.115.0; accepts "insiders", "1.100.0", commit hash)', ' --no-baseline Skip baseline comparison entirely', ' --save-baseline Save results as the new baseline (requires --baseline )', + ' --resume Resume a previous run, adding more iterations to increase', + ' confidence. Merges new runs with existing rawRuns data', ' --threshold Regression threshold fraction (default: 0.2 = 20%)', + ' --ci CI mode: write Markdown summary to ci-summary.md', ' --verbose Print per-run details', '', 'Scenarios: ' + SCENARIOS.join(', '), @@ -128,7 +137,14 @@ function parseArgs() { async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, runDir, role) { const { userDataDir, extDir, logsDir } = prepareRunDir(runIndex, mockServer); const isDevBuild = !electronPath.includes('.vscode-test'); - const buildLabel = isDevBuild ? 'dev' : path.basename(path.dirname(path.dirname(path.dirname(electronPath)))).replace(/^vscode-/, ''); + // Extract a clean build label from the path. + // Dev: .build/electron/Code - OSS.app/.../Code - OSS → "dev" + // Stable: .vscode-test/vscode-darwin-arm64-1.115.0/Visual Studio Code.app/.../Electron → "1.115.0" + let buildLabel = 'dev'; + if (!isDevBuild) { + const vscodeTestMatch = electronPath.match(/vscode-test\/vscode-[^/]*?-(\d+\.\d+\.\d+)/); + buildLabel = vscodeTestMatch ? 
vscodeTestMatch[1] : path.basename(electronPath); + } // Create a per-run diagnostics directory: /-/-/ const runDiagDir = path.join(runDir, `${role}-${buildLabel}`, runIndex.replace(/^baseline-/, '')); @@ -337,16 +353,14 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru await cdp.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false }); fs.writeFileSync(snapshotPath, snapshotChunks.join('')); - // Parse timing — always use client-side Date.now() for timeToFirstToken - // and timeToComplete so cross-build comparisons use the same method. - // Internal marks are reported separately for diagnostics. + // Parse timing — prefer internal code/chat/* marks (precise, in-process) + // with client-side Date.now() as fallback for older builds without marks. const timeToUIUpdated = markDuration(chatMarks, 'request/start', 'request/uiUpdated'); - const timeToFirstToken = firstResponseTime - submitTime; + const internalFirstToken = markDuration(chatMarks, 'request/start', 'request/firstToken'); + const timeToFirstToken = internalFirstToken >= 0 ? internalFirstToken : (firstResponseTime - submitTime); const timeToComplete = responseCompleteTime - submitTime; const instructionCollectionTime = markDuration(chatMarks, 'request/willCollectInstructions', 'request/didCollectInstructions'); const agentInvokeTime = markDuration(chatMarks, 'agent/willInvoke', 'agent/didInvoke'); - // Internal-mark TTFT (more precise, but only available on dev builds) - const internalFirstToken = markDuration(chatMarks, 'request/start', 'request/firstToken'); // Parse GC/long tasks let majorGCs = 0, minorGCs = 0, gcDurationMs = 0; @@ -391,10 +405,291 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru } } +// -- CI summary generation --------------------------------------------------- + +/** + * Generate a detailed Markdown summary table for CI. + * Printed to stdout and written to ci-summary.md. 
+ * + * @param {Record} jsonReport + * @param {Record | null} baseline + * @param {{ threshold: number, runs: number, baselineBuild?: string, build?: string }} opts + */ +function generateCISummary(jsonReport, baseline, opts) { + const baseLabel = opts.baselineBuild || 'baseline'; + const testLabel = opts.build || 'dev (local)'; + const allMetrics = [ + ['timeToFirstToken', 'timing', 'ms'], + ['timeToComplete', 'timing', 'ms'], + ['layoutCount', 'rendering', ''], + ['recalcStyleCount', 'rendering', ''], + ['forcedReflowCount', 'rendering', ''], + ['longTaskCount', 'rendering', ''], + ['heapDelta', 'memory', 'MB'], + ['gcDurationMs', 'memory', 'ms'], + ]; + const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'layoutCount', 'recalcStyleCount', 'forcedReflowCount', 'longTaskCount']); + + const lines = []; + const scenarios = Object.keys(jsonReport.scenarios); + + lines.push(`# Chat Performance Comparison`); + lines.push(''); + lines.push(`| | |`); + lines.push(`|---|---|`); + lines.push(`| **Baseline** | \`${baseLabel}\` |`); + lines.push(`| **Test** | \`${testLabel}\` |`); + lines.push(`| **Runs per scenario** | ${opts.runs} |`); + lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`); + lines.push(`| **Scenarios** | ${scenarios.length} |`); + lines.push(`| **Platform** | ${process.platform} / ${process.arch} |`); + lines.push(''); + + // Overall status + let totalRegressions = 0; + let totalImprovements = 0; + + // Per-scenario tables + for (const scenario of scenarios) { + const current = jsonReport.scenarios[scenario]; + const base = baseline?.scenarios?.[scenario]; + + lines.push(`## ${scenario}`); + lines.push(''); + + if (!base) { + lines.push('> No baseline data for this scenario.'); + lines.push(''); + + // Show absolute values + lines.push('| Metric | Value | StdDev | CV | n |'); + lines.push('|--------|------:|-------:|---:|--:|'); + for (const [metric, group, unit] of allMetrics) { + const cur = 
current[group]?.[metric]; + if (!cur) { continue; } + lines.push(`| ${metric} | ${cur.median}${unit} | \xb1${cur.stddev}${unit} | ${(cur.cv * 100).toFixed(0)}% | ${cur.n} |`); + } + lines.push(''); + continue; + } + + lines.push(`| Metric | Baseline | Test | Change | p-value | Verdict |`); + lines.push(`|--------|----------|------|--------|---------|---------|`); + + for (const [metric, group, unit] of allMetrics) { + const cur = current[group]?.[metric]; + const bas = base[group]?.[metric]; + if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; } + + const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0; + const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`; + const isRegressionMetric = regressionMetricNames.has(metric); + + // t-test + const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); + const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); + const ttest = welchTTest(basRaw, curRaw); + const pStr = ttest ? 
`${ttest.pValue}` : 'n/a'; + + let verdict = ''; + if (isRegressionMetric) { + if (change > opts.threshold) { + if (!ttest) { + verdict = 'REGRESSION'; + totalRegressions++; + } else if (ttest.significant) { + verdict = 'REGRESSION'; + totalRegressions++; + } else { + verdict = 'noise'; + } + } else if (change < -opts.threshold && ttest?.significant) { + verdict = 'improved'; + totalImprovements++; + } else { + verdict = 'ok'; + } + } else { + verdict = 'info'; + } + + const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`; + const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`; + lines.push(`| ${metric} | ${basStr} | ${curStr} | ${pct} | ${pStr} | ${verdict} |`); + } + lines.push(''); + } + + // Grand summary + lines.push('## Summary'); + lines.push(''); + if (totalRegressions > 0) { + lines.push(`**${totalRegressions} regression(s) detected** across ${scenarios.length} scenario(s).`); + } else if (totalImprovements > 0) { + lines.push(`**No regressions.** ${totalImprovements} improvement(s) detected.`); + } else { + lines.push(`**No significant changes** across ${scenarios.length} scenario(s).`); + } + lines.push(''); + + // Raw data per scenario + lines.push('
Raw run data'); + lines.push(''); + for (const scenario of scenarios) { + const current = jsonReport.scenarios[scenario]; + lines.push(`### ${scenario}`); + lines.push(''); + lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | Heap Delta (MB) | Internal Marks |'); + lines.push('|----:|----------:|--------------:|--------:|--------------:|----------------:|:--------------:|'); + const runs = current.rawRuns || []; + for (let i = 0; i < runs.length; i++) { + const r = runs[i]; + lines.push(`| ${i + 1} | ${r.timeToFirstToken} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`); + } + lines.push(''); + } + if (baseline) { + for (const scenario of scenarios) { + const base = baseline.scenarios?.[scenario]; + if (!base) { continue; } + lines.push(`### ${scenario} (baseline)`); + lines.push(''); + lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | Heap Delta (MB) | Internal Marks |'); + lines.push('|----:|----------:|--------------:|--------:|--------------:|----------------:|:--------------:|'); + const runs = base.rawRuns || []; + for (let i = 0; i < runs.length; i++) { + const r = runs[i]; + lines.push(`| ${i + 1} | ${r.timeToFirstToken} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`); + } + lines.push(''); + } + } + lines.push('
'); + lines.push(''); + + return lines.join('\n'); +} + // -- Main -------------------------------------------------------------------- async function main() { const opts = parseArgs(); + + const { startServer } = require('./common/mock-llm-server'); + const mockServer = await startServer(0); + console.log(`[chat-perf] Mock LLM server: ${mockServer.url}`); + + // -- Resume mode -------------------------------------------------------- + if (opts.resume) { + if (!fs.existsSync(opts.resume)) { + console.error(`[chat-perf] Resume file not found: ${opts.resume}`); + process.exit(1); + } + const prevResults = JSON.parse(fs.readFileSync(opts.resume, 'utf-8')); + const prevDir = path.dirname(opts.resume); + + // Find the associated baseline JSON in the same directory + const baselineFiles = fs.readdirSync(prevDir).filter((/** @type {string} */ f) => f.startsWith('baseline-') && f.endsWith('.json')); + const baselineFile = baselineFiles.length > 0 ? path.join(prevDir, baselineFiles[0]) : null; + const prevBaseline = baselineFile ? JSON.parse(fs.readFileSync(baselineFile, 'utf-8')) : null; + + // Determine which scenarios to resume (default: all from previous run) + const resumeScenarios = opts.scenarios.length > 0 + ? opts.scenarios.filter(s => prevResults.scenarios?.[s]) + : Object.keys(prevResults.scenarios || {}); + + if (resumeScenarios.length === 0) { + console.error('[chat-perf] No matching scenarios found in previous results'); + process.exit(1); + } + + const testElectron = await resolveBuild(opts.build); + const baselineVersion = prevBaseline?.baselineBuildVersion; + const baselineElectron = baselineVersion ? 
await resolveBuild(baselineVersion) : null; + + const runsToAdd = opts.runs; + console.log(`[chat-perf] Resuming from: ${opts.resume}`); + console.log(`[chat-perf] Adding ${runsToAdd} runs per scenario`); + console.log(`[chat-perf] Scenarios: ${resumeScenarios.join(', ')}`); + if (prevBaseline) { + console.log(`[chat-perf] Baseline: ${baselineVersion} (${prevBaseline.scenarios?.[resumeScenarios[0]]?.rawRuns?.length || 0} existing runs)`); + } + console.log(''); + + for (const scenario of resumeScenarios) { + console.log(`[chat-perf] === Resuming: ${scenario} ===`); + const prevTestRuns = prevResults.scenarios[scenario]?.rawRuns || []; + const prevBaseRuns = prevBaseline?.scenarios?.[scenario]?.rawRuns || []; + + // Run additional test iterations + console.log(`[chat-perf] Test build (${prevTestRuns.length} existing + ${runsToAdd} new)`); + for (let i = 0; i < runsToAdd; i++) { + const runIdx = `${scenario}-resume-${prevTestRuns.length + i}`; + console.log(`[chat-perf] Run ${i + 1}/${runsToAdd}...`); + try { + const m = await runOnce(testElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'test'); + prevTestRuns.push(m); + if (opts.verbose) { + const src = m.hasInternalMarks ? 
'internal' : 'client-side'; + console.log(` [${src}] firstToken=${m.timeToFirstToken}ms, complete=${m.timeToComplete}ms`); + } + } catch (err) { console.error(` Run ${i + 1} failed: ${err}`); } + } + + // Run additional baseline iterations + if (baselineElectron && prevBaseline?.scenarios?.[scenario]) { + console.log(`[chat-perf] Baseline build (${prevBaseRuns.length} existing + ${runsToAdd} new)`); + for (let i = 0; i < runsToAdd; i++) { + const runIdx = `baseline-${scenario}-resume-${prevBaseRuns.length + i}`; + console.log(`[chat-perf] Run ${i + 1}/${runsToAdd}...`); + try { + const m = await runOnce(baselineElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'baseline'); + prevBaseRuns.push(m); + } catch (err) { console.error(` Run ${i + 1} failed: ${err}`); } + } + } + + // Recompute stats with merged data + const sd = /** @type {any} */ ({ runs: prevTestRuns.length, timing: {}, memory: {}, rendering: {}, rawRuns: prevTestRuns }); + for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(prevTestRuns.map((/** @type {any} */ r) => r[metric])); } + prevResults.scenarios[scenario] = sd; + + if (prevBaseline?.scenarios?.[scenario]) { + const bsd = /** @type {any} */ ({ runs: prevBaseRuns.length, timing: {}, memory: {}, rendering: {}, rawRuns: prevBaseRuns }); + for (const [metric, group] of METRIC_DEFS) { bsd[group][metric] = robustStats(prevBaseRuns.map((/** @type {any} */ r) => r[metric])); } + prevBaseline.scenarios[scenario] = bsd; + } + console.log(`[chat-perf] Merged: test n=${prevTestRuns.length}${prevBaseRuns.length > 0 ? 
`, baseline n=${prevBaseRuns.length}` : ''}`); + console.log(''); + } + + // Write updated files back + prevResults.runsPerScenario = Math.max(prevResults.runsPerScenario || 0, ...Object.values(prevResults.scenarios).map((/** @type {any} */ s) => s.runs)); + prevResults.lastResumed = new Date().toISOString(); + fs.writeFileSync(opts.resume, JSON.stringify(prevResults, null, 2)); + console.log(`[chat-perf] Updated results: ${opts.resume}`); + + if (prevBaseline && baselineFile) { + prevBaseline.lastResumed = new Date().toISOString(); + fs.writeFileSync(baselineFile, JSON.stringify(prevBaseline, null, 2)); + // Also update cached baseline + const cachedPath = path.join(DATA_DIR, path.basename(baselineFile)); + fs.writeFileSync(cachedPath, JSON.stringify(prevBaseline, null, 2)); + console.log(`[chat-perf] Updated baseline: ${baselineFile}`); + } + + // -- Re-run comparison with merged data -------------------------------- + opts.baseline = baselineFile || undefined; + const jsonReport = prevResults; + jsonReport._resultsPath = opts.resume; + + // Fall through to comparison logic below + await printComparison(jsonReport, opts); + await mockServer.close(); + return; + } + + // -- Normal (non-resume) flow ------------------------------------------- const electronPath = await resolveBuild(opts.build); if (!fs.existsSync(electronPath)) { @@ -403,10 +698,6 @@ async function main() { process.exit(1); } - const { startServer } = require('./common/mock-llm-server'); - const mockServer = await startServer(0); - console.log(`[chat-perf] Mock LLM server: ${mockServer.url}`); - // Create a timestamped run directory for all output const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); const runDir = path.join(DATA_DIR, runTimestamp); @@ -417,7 +708,7 @@ async function main() { if (opts.baselineBuild) { const baselineJsonPath = path.join(runDir, `baseline-${opts.baselineBuild}.json`); const cachedPath = path.join(DATA_DIR, 
`baseline-${opts.baselineBuild}.json`); - const cachedBaseline = fs.existsSync(cachedPath) + const cachedBaseline = !opts.ci && fs.existsSync(cachedPath) ? JSON.parse(fs.readFileSync(cachedPath, 'utf-8')) : null; @@ -447,7 +738,7 @@ async function main() { scenarios: /** @type {Record} */ ({}), }; for (const [scenario, results] of Object.entries(baselineResults)) { - const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {} }); + const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, rawRuns: results }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } baselineReport.scenarios[scenario] = sd; } @@ -519,6 +810,7 @@ async function main() { jsonReport.scenarios[scenario] = sd; } fs.writeFileSync(jsonPath, JSON.stringify(jsonReport, null, 2)); + jsonReport._resultsPath = jsonPath; console.log(''); console.log(`[chat-perf] Results written to ${jsonPath}`); @@ -530,7 +822,20 @@ async function main() { } // -- Baseline comparison --------------------------------------------- + await printComparison(jsonReport, opts); + + if (anyFailed) { process.exit(1); } + await mockServer.close(); +} + +/** + * Print baseline comparison and exit with code 1 if regressions found. + * @param {Record} jsonReport + * @param {{ baseline?: string, threshold: number, ci?: boolean, runs?: number, baselineBuild?: string, build?: string }} opts + */ +async function printComparison(jsonReport, opts) { let regressionFound = false; + let inconclusiveFound = false; if (opts.baseline && fs.existsSync(opts.baseline)) { const baseline = JSON.parse(fs.readFileSync(opts.baseline, 'utf-8')); console.log(''); @@ -569,8 +874,29 @@ async function main() { if (!cur || !bas || !bas.median) { continue; } const change = (cur.median - bas.median) / bas.median; const pct = `${change > 0 ? 
'+' : ''}${(change * 100).toFixed(1)}%`; - const flag = change > opts.threshold ? ' ← REGRESSION' : ''; - if (change > opts.threshold) { scenarioRegression = true; regressionFound = true; } + + // Statistical significance via Welch's t-test on raw run values + const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); + const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); + const ttest = welchTTest(basRaw, curRaw); + + let flag = ''; + if (change > opts.threshold) { + if (!ttest) { + flag = ' ← REGRESSION (n too small for significance test)'; + scenarioRegression = true; + regressionFound = true; + } else if (ttest.significant) { + flag = ` ← REGRESSION (p=${ttest.pValue}, ${ttest.confidence} confidence)`; + scenarioRegression = true; + regressionFound = true; + } else { + flag = ` (likely noise — p=${ttest.pValue}, not significant)`; + inconclusiveFound = true; + } + } else if (ttest && change > 0 && ttest.significant && ttest.confidence === 'high') { + flag = ` (significant increase, p=${ttest.pValue})`; + } diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct})${flag}`); } for (const [metric, group, unit] of infoMetrics) { @@ -587,12 +913,48 @@ async function main() { console.log(''); console.log(regressionFound - ? `[chat-perf] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold` - : `[chat-perf] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline`); + ? `[chat-perf] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold with statistical significance` + : `[chat-perf] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline (or not statistically significant)`); + + if (inconclusiveFound && !regressionFound) { + // Find the results.json path to suggest in the hint + const resultsPath = Object.keys(jsonReport.scenarios).length > 0 + ? 
(jsonReport._resultsPath || opts.resume || 'path/to/results.json') + : 'path/to/results.json'; + console.log(''); + console.log('[chat-perf] Some metrics exceeded the threshold but were not statistically significant.'); + console.log('[chat-perf] To increase confidence, add more runs with --resume:'); + console.log(`[chat-perf] npm run perf:chat -- --resume ${resultsPath} --runs 3`); + } } - if (anyFailed || regressionFound) { process.exit(1); } - await mockServer.close(); + // -- CI summary ------------------------------------------------------ + if (opts.ci) { + const ciBaseline = opts.baseline && fs.existsSync(opts.baseline) + ? JSON.parse(fs.readFileSync(opts.baseline, 'utf-8')) + : null; + const summary = generateCISummary(jsonReport, ciBaseline, { + threshold: opts.threshold, + runs: jsonReport.runsPerScenario || opts.runs, + baselineBuild: ciBaseline?.baselineBuildVersion || opts.baselineBuild, + build: opts.build, + }); + + // Write to file for GitHub Actions $GITHUB_STEP_SUMMARY + const summaryPath = path.join(DATA_DIR, 'ci-summary.md'); + fs.writeFileSync(summaryPath, summary); + console.log(`[chat-perf] CI summary written to ${summaryPath}`); + + // Also print the full summary table to stdout + console.log(''); + console.log('=================================================================='); + console.log(' CHAT PERF COMPARISON RESULTS '); + console.log('=================================================================='); + console.log(''); + console.log(summary); + } + + if (regressionFound) { process.exit(1); } } main().catch(err => { console.error(err); process.exit(1); }); From 2e1b4a725ec1c5accc567c203d0729a6a523c8a7 Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Tue, 14 Apr 2026 11:42:27 -0700 Subject: [PATCH 03/13] wip --- .github/workflows/chat-perf.yml | 59 +- scripts/chat-perf/common/mock-llm-server.js | 668 ++++++++++++++---- scripts/chat-perf/common/utils.js | 16 +- .../chat-perf/test-chat-perf-regression.js | 39 +- 4 files changed, 636 
insertions(+), 146 deletions(-) diff --git a/.github/workflows/chat-perf.yml b/.github/workflows/chat-perf.yml index adebefb98479b..ddce98e68468b 100644 --- a/.github/workflows/chat-perf.yml +++ b/.github/workflows/chat-perf.yml @@ -1,6 +1,12 @@ name: Chat Performance Comparison on: + pull_request: + paths: + - '.github/workflows/chat-perf.yml' + schedule: + # Every Friday at 12:00 AM PT (07:00 UTC) + - cron: '0 7 * * 5' workflow_dispatch: inputs: baseline_commit: @@ -34,16 +40,22 @@ concurrency: group: chat-perf-${{ github.run_id }} cancel-in-progress: true +env: + BASELINE_COMMIT: ${{ inputs.baseline_commit || '1.115.0' }} + TEST_COMMIT: ${{ inputs.test_commit || 'main' }} + PERF_RUNS: ${{ inputs.runs || 7 }} + PERF_THRESHOLD: ${{ inputs.threshold || 0.2 }} + jobs: chat-perf: - name: Chat Perf – ${{ inputs.baseline_commit }} vs ${{ inputs.test_commit }} + name: Chat Perf – ${{ inputs.baseline_commit || '1.115.0' }} vs ${{ inputs.test_commit || 'main' }} runs-on: ubuntu-latest timeout-minutes: 120 steps: - name: Checkout test commit uses: actions/checkout@v6 with: - ref: ${{ inputs.test_commit }} + ref: ${{ env.TEST_COMMIT }} - name: Setup Node.js uses: actions/setup-node@v6 @@ -90,10 +102,10 @@ jobs: fi xvfb-run node scripts/chat-perf/test-chat-perf-regression.js \ - --baseline-build "${{ inputs.baseline_commit }}" \ - --build "${{ inputs.test_commit }}" \ - --runs ${{ inputs.runs }} \ - --threshold ${{ inputs.threshold }} \ + --baseline-build "${{ env.BASELINE_COMMIT }}" \ + --build "${{ env.TEST_COMMIT }}" \ + --runs ${{ env.PERF_RUNS }} \ + --threshold ${{ env.PERF_THRESHOLD }} \ --ci \ $SCENARIO_ARGS \ 2>&1 | tee perf-output.log @@ -102,6 +114,19 @@ jobs: exit ${PIPESTATUS[0]} continue-on-error: true + - name: Run memory leak check + id: leak + run: | + xvfb-run node scripts/chat-perf/test-chat-mem-leaks.js \ + --build "${{ env.TEST_COMMIT }}" \ + --messages 10 \ + --threshold 2 \ + --verbose \ + 2>&1 | tee leak-output.log + + exit ${PIPESTATUS[0]} + 
continue-on-error: true + - name: Write job summary if: always() run: | @@ -111,6 +136,15 @@ jobs: echo "⚠️ No summary file generated. Check perf-output.log artifact." >> "$GITHUB_STEP_SUMMARY" fi + if [[ -f .chat-perf-data/chat-perf-leak-results.json ]]; then + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "## Memory Leak Check" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo '```json' >> "$GITHUB_STEP_SUMMARY" + cat .chat-perf-data/chat-perf-leak-results.json >> "$GITHUB_STEP_SUMMARY" + echo '```' >> "$GITHUB_STEP_SUMMARY" + fi + - name: Zip diagnostic outputs if: always() run: | @@ -123,6 +157,7 @@ jobs: "$(basename "$RUN_DIR")"/ \ ci-summary.md \ baseline-*.json \ + chat-perf-leak-results.json \ 2>/dev/null || true cd .. fi @@ -131,14 +166,20 @@ jobs: if: always() uses: actions/upload-artifact@v7 with: - name: chat-perf-${{ inputs.baseline_commit }}-vs-${{ inputs.test_commit }} + name: chat-perf-${{ env.BASELINE_COMMIT }}-vs-${{ env.TEST_COMMIT }} path: | chat-perf-artifacts.zip perf-output.log + leak-output.log retention-days: 30 - name: Fail on regression - if: steps.perf.outcome == 'failure' + if: steps.perf.outcome == 'failure' || steps.leak.outcome == 'failure' run: | - echo "::error::Chat performance regression detected. See job summary for details." + if [[ "${{ steps.perf.outcome }}" == "failure" ]]; then + echo "::error::Chat performance regression detected. See job summary for details." + fi + if [[ "${{ steps.leak.outcome }}" == "failure" ]]; then + echo "::error::Chat memory leak detected. See leak-output.log for details." 
+ fi exit 1 diff --git a/scripts/chat-perf/common/mock-llm-server.js b/scripts/chat-perf/common/mock-llm-server.js index 1b45967d1b2f2..2e8429db28b83 100644 --- a/scripts/chat-perf/common/mock-llm-server.js +++ b/scripts/chat-perf/common/mock-llm-server.js @@ -16,88 +16,192 @@ */ const http = require('http'); +const path = require('path'); const { EventEmitter } = require('events'); +const ROOT = path.join(__dirname, '..', '..', '..'); + // -- Scenario fixtures ------------------------------------------------------- -/** @type {Record} */ +/** + * @typedef {{ content: string, delayMs: number }} StreamChunk + */ + +/** + * A single model turn in a multi-turn scenario. + * + * @typedef {{ + * kind: 'tool-calls', + * toolCalls: Array<{ toolNamePattern: RegExp, arguments: Record }>, + * } | { + * kind: 'content', + * chunks: StreamChunk[], + * }} ModelTurn + */ + +/** + * A multi-turn scenario — an ordered sequence of model turns. + * The mock server determines which turn to serve based on the number + * of assistant→tool round-trips already present in the conversation. + * + * @typedef {{ + * type: 'multi-turn', + * turns: ModelTurn[], + * }} MultiTurnScenario + */ + +/** + * @param {any} scenario + * @returns {scenario is MultiTurnScenario} + */ +function isMultiTurnScenario(scenario) { + return scenario && typeof scenario === 'object' && scenario.type === 'multi-turn'; +} + +/** + * Helper for building scenario chunk sequences with timing control. + */ +class ScenarioBuilder { + constructor() { + /** @type {StreamChunk[]} */ + this.chunks = []; + } + + /** + * Emit a content chunk immediately (no delay before it). + * @param {string} content + * @returns {this} + */ + emit(content) { + this.chunks.push({ content, delayMs: 0 }); + return this; + } + + /** + * Wait, then emit a content chunk — simulates network/token generation latency. 
+ * @param {number} ms - delay in milliseconds before this chunk + * @param {string} content + * @returns {this} + */ + wait(ms, content) { + this.chunks.push({ content, delayMs: ms }); + return this; + } + + /** + * Emit multiple chunks with uniform inter-chunk delay. + * @param {string[]} contents + * @param {number} [delayMs=15] - delay between each chunk (default ~1 frame) + * @returns {this} + */ + stream(contents, delayMs = 15) { + for (const content of contents) { + this.chunks.push({ content, delayMs }); + } + return this; + } + + /** + * Emit multiple chunks with no delay (burst). + * @param {string[]} contents + * @returns {this} + */ + burst(contents) { + return this.stream(contents, 0); + } + + /** @returns {StreamChunk[]} */ + build() { + return this.chunks; + } +} + +/** @type {Record} */ const SCENARIOS = { - 'text-only': [ - 'Here is an explanation of the code you selected:\n\n', - 'The function `processItems` iterates over the input array and applies a transformation to each element. ', - 'It uses a `Map` to track previously seen values, which allows it to deduplicate results efficiently in O(n) time.\n\n', - 'The algorithm works in a single pass: for every element, it computes the transformed value, ', - 'checks membership in the set, and conditionally appends to the output array. 
', - 'This is a common pattern in data processing pipelines where uniqueness constraints must be maintained.\n\n', - 'Edge cases to consider include empty arrays, duplicate transformations that produce the same key, ', - 'and items where the transform function itself is expensive.\n\n', - 'The time complexity is **O(n)** and the space complexity is **O(n)** in the worst case when all items are unique.\n', - ], - 'large-codeblock': [ - 'Here is the refactored implementation:\n\n', - '```typescript\n', - 'import { EventEmitter } from "events";\n\n', - 'interface CacheEntry {\n value: T;\n expiresAt: number;\n accessCount: number;\n}\n\n', - 'export class LRUCache {\n', - ' private readonly _map = new Map>();\n', - ' private readonly _emitter = new EventEmitter();\n\n', - ' constructor(\n private readonly _maxSize: number,\n private readonly _ttlMs: number = 60_000,\n ) {}\n\n', - ' get(key: K): V | undefined {\n const entry = this._map.get(key);\n if (!entry) { return undefined; }\n', - ' if (Date.now() > entry.expiresAt) {\n this._map.delete(key);\n this._emitter.emit("evict", key);\n return undefined;\n }\n', - ' entry.accessCount++;\n this._map.delete(key);\n this._map.set(key, entry);\n return entry.value;\n }\n\n', - ' set(key: K, value: V): void {\n if (this._map.size >= this._maxSize) {\n', - ' const oldest = this._map.keys().next().value;\n if (oldest !== undefined) {\n this._map.delete(oldest);\n this._emitter.emit("evict", oldest);\n }\n }\n', - ' this._map.set(key, { value, expiresAt: Date.now() + this._ttlMs, accessCount: 0 });\n }\n\n', - ' clear(): void { this._map.clear(); this._emitter.emit("clear"); }\n', - ' get size(): number { return this._map.size; }\n', - ' onEvict(listener: (key: K) => void): void { this._emitter.on("evict", listener); }\n}\n', - '```\n\n', - 'The key changes:\n- Added TTL-based expiry with configurable timeout\n- LRU eviction uses Map insertion order\n- EventEmitter notifies on evictions for cache observability\n', - ], + 
'text-only': new ScenarioBuilder() + .stream([ + 'Here is an explanation of the code you selected:\n\n', + 'The function `processItems` iterates over the input array and applies a transformation to each element. ', + 'It uses a `Map` to track previously seen values, which allows it to deduplicate results efficiently in O(n) time.\n\n', + 'The algorithm works in a single pass: for every element, it computes the transformed value, ', + 'checks membership in the set, and conditionally appends to the output array. ', + 'This is a common pattern in data processing pipelines where uniqueness constraints must be maintained.\n\n', + 'Edge cases to consider include empty arrays, duplicate transformations that produce the same key, ', + 'and items where the transform function itself is expensive.\n\n', + 'The time complexity is **O(n)** and the space complexity is **O(n)** in the worst case when all items are unique.\n', + ], 20) + .build(), + + 'large-codeblock': new ScenarioBuilder() + .stream([ + 'Here is the refactored implementation:\n\n', + '```typescript\n', + 'import { EventEmitter } from "events";\n\n', + 'interface CacheEntry {\n value: T;\n expiresAt: number;\n accessCount: number;\n}\n\n', + 'export class LRUCache {\n', + ' private readonly _map = new Map>();\n', + ' private readonly _emitter = new EventEmitter();\n\n', + ' constructor(\n private readonly _maxSize: number,\n private readonly _ttlMs: number = 60_000,\n ) {}\n\n', + ' get(key: K): V | undefined {\n const entry = this._map.get(key);\n if (!entry) { return undefined; }\n', + ' if (Date.now() > entry.expiresAt) {\n this._map.delete(key);\n this._emitter.emit("evict", key);\n return undefined;\n }\n', + ' entry.accessCount++;\n this._map.delete(key);\n this._map.set(key, entry);\n return entry.value;\n }\n\n', + ' set(key: K, value: V): void {\n if (this._map.size >= this._maxSize) {\n', + ' const oldest = this._map.keys().next().value;\n if (oldest !== undefined) {\n this._map.delete(oldest);\n 
this._emitter.emit("evict", oldest);\n }\n }\n', + ' this._map.set(key, { value, expiresAt: Date.now() + this._ttlMs, accessCount: 0 });\n }\n\n', + ' clear(): void { this._map.clear(); this._emitter.emit("clear"); }\n', + ' get size(): number { return this._map.size; }\n', + ' onEvict(listener: (key: K) => void): void { this._emitter.on("evict", listener); }\n}\n', + '```\n\n', + 'The key changes:\n- Added TTL-based expiry with configurable timeout\n- LRU eviction uses Map insertion order\n- EventEmitter notifies on evictions for cache observability\n', + ], 20) + .build(), + 'many-small-chunks': (() => { - const chunks = ['Generating detailed analysis:\n\n']; - for (let i = 0; i < 200; i++) { - chunks.push(`Word${i} `); - } - chunks.push('\n\nAnalysis complete.\n'); - return chunks; + const words = ['Generating detailed analysis:\n\n']; + for (let i = 0; i < 200; i++) { words.push(`Word${i} `); } + words.push('\n\nAnalysis complete.\n'); + const b = new ScenarioBuilder(); + b.stream(words, 5); + return b.build(); })(), - 'mixed-content': [ - '## Issue Found\n\n', - 'The `DisposableStore` is not being disposed in the `deactivate` path, ', - 'which can lead to memory leaks.\n\n', - '### Current Code\n\n', - '```typescript\nclass MyService {\n private store = new DisposableStore();\n // missing dispose!\n}\n```\n\n', - '### Suggested Fix\n\n', - '```typescript\nclass MyService extends Disposable {\n', - ' private readonly store = this._register(new DisposableStore());\n\n', - ' override dispose(): void {\n this.store.dispose();\n super.dispose();\n }\n}\n```\n\n', - 'This ensures the store is cleaned up when the service is disposed via the workbench lifecycle.\n', - ], + + 'mixed-content': new ScenarioBuilder() + .stream([ + '## Issue Found\n\n', + 'The `DisposableStore` is not being disposed in the `deactivate` path, ', + 'which can lead to memory leaks.\n\n', + '### Current Code\n\n', + '```typescript\nclass MyService {\n private store = new DisposableStore();\n 
// missing dispose!\n}\n```\n\n', + '### Suggested Fix\n\n', + '```typescript\nclass MyService extends Disposable {\n', + ' private readonly store = this._register(new DisposableStore());\n\n', + ' override dispose(): void {\n this.store.dispose();\n super.dispose();\n }\n}\n```\n\n', + 'This ensures the store is cleaned up when the service is disposed via the workbench lifecycle.\n', + ], 20) + .build(), // -- Stress-test scenarios -------------------------------------------- - // ~500 lines of code across 10 fenced blocks — stresses syntax - // highlighting, code block rendering, and copy-button creation. 'many-codeblocks': (() => { - const chunks = ['Here are the implementations for each module:\n\n']; + const b = new ScenarioBuilder(); + b.emit('Here are the implementations for each module:\n\n'); for (let i = 0; i < 10; i++) { - chunks.push(`### Module ${i + 1}: \`handler${i}.ts\`\n\n`); - chunks.push('```typescript\n'); + b.wait(10, `### Module ${i + 1}: \`handler${i}.ts\`\n\n`); + b.emit('```typescript\n'); + const lines = []; for (let j = 0; j < 15; j++) { - chunks.push(`export function handle${i}_${j}(input: string): string {\n`); - chunks.push(` const result = input.trim().split('').reverse().join('');\n`); - chunks.push(` return \`[\${result}] processed by handler ${i}_${j}\`;\n`); - chunks.push('}\n\n'); + lines.push(`export function handle${i}_${j}(input: string): string {\n`); + lines.push(` const result = input.trim().split('').reverse().join('');\n`); + lines.push(` return \`[\${result}] processed by handler ${i}_${j}\`;\n`); + lines.push('}\n\n'); } - chunks.push('```\n\n'); + b.stream(lines, 5); + b.emit('```\n\n'); } - chunks.push('All modules implement the same pattern with unique handler IDs.\n'); - return chunks; + b.emit('All modules implement the same pattern with unique handler IDs.\n'); + return b.build(); })(), - // Very long prose — stresses markdown rendering, word wrapping, - // and layout with ~3000 words of continuous text. 
'long-prose': (() => { const sentences = [ 'The architecture follows a layered dependency injection pattern where each service declares its dependencies through constructor parameters. ', @@ -109,74 +213,74 @@ const SCENARIOS = { 'Contributors register their functionality through extension points, which are processed during the appropriate lifecycle phase. ', 'This contribution model allows features to be added without modifying the core workbench code, maintaining a clean separation of concerns. ', ]; - const chunks = ['# Detailed Architecture Analysis\n\n']; + const b = new ScenarioBuilder(); + b.emit('# Detailed Architecture Analysis\n\n'); for (let para = 0; para < 15; para++) { - chunks.push(`## Section ${para + 1}: ${['Overview', 'Design Patterns', 'Service Layer', 'Event System', 'State Management', 'Error Handling', 'Performance', 'Testing', 'Deployment', 'Monitoring', 'Security', 'Extensibility', 'Compatibility', 'Migration', 'Future Work'][para]}\n\n`); - for (let s = 0; s < 25; s++) { - chunks.push(sentences[s % sentences.length]); - } - chunks.push('\n\n'); + b.wait(15, `## Section ${para + 1}: ${['Overview', 'Design Patterns', 'Service Layer', 'Event System', 'State Management', 'Error Handling', 'Performance', 'Testing', 'Deployment', 'Monitoring', 'Security', 'Extensibility', 'Compatibility', 'Migration', 'Future Work'][para]}\n\n`); + const paraSentences = []; + for (let s = 0; s < 25; s++) { paraSentences.push(sentences[s % sentences.length]); } + b.stream(paraSentences, 8); + b.emit('\n\n'); } - return chunks; + return b.build(); })(), - // Deeply nested markdown — headers, ordered/unordered lists, bold, - // italic, inline code, links, blockquotes. Exercises the full - // markdown renderer pipeline. 
'rich-markdown': (() => { - const chunks = ['# Comprehensive Code Review Report\n\n']; - chunks.push('> **Summary**: Found 12 issues across 4 severity levels.\n\n'); + const b = new ScenarioBuilder(); + b.emit('# Comprehensive Code Review Report\n\n'); + b.wait(15, '> **Summary**: Found 12 issues across 4 severity levels.\n\n'); for (let section = 0; section < 6; section++) { - chunks.push(`## ${section + 1}. ${['Critical Issues', 'Performance Concerns', 'Code Style', 'Documentation Gaps', 'Test Coverage', 'Security Review'][section]}\n\n`); + b.wait(10, `## ${section + 1}. ${['Critical Issues', 'Performance Concerns', 'Code Style', 'Documentation Gaps', 'Test Coverage', 'Security Review'][section]}\n\n`); for (let item = 0; item < 5; item++) { - chunks.push(`${item + 1}. **Issue ${section * 5 + item + 1}**: \`${['useState', 'useEffect', 'useMemo', 'useCallback', 'useRef'][item]}\` in \`src/components/Widget${item}.tsx\`\n`); - chunks.push(` - Severity: ${['[Critical]', '[Warning]', '[Info]', '[Suggestion]', '[Note]'][item]}\n`); - chunks.push(` - The current implementation uses *unnecessary re-renders* due to missing dependency arrays.\n`); - chunks.push(` - See [React docs](https://react.dev/reference) and the [\`useMemo\` guide](https://react.dev/reference/react/useMemo).\n`); - chunks.push(` - Fix: wrap in \`useCallback\` or extract to a ***separate memoized component***.\n\n`); + b.stream([ + `${item + 1}. 
**Issue ${section * 5 + item + 1}**: \`${['useState', 'useEffect', 'useMemo', 'useCallback', 'useRef'][item]}\` in \`src/components/Widget${item}.tsx\`\n`, + ` - Severity: ${['[Critical]', '[Warning]', '[Info]', '[Suggestion]', '[Note]'][item]}\n`, + ` - The current implementation uses *unnecessary re-renders* due to missing dependency arrays.\n`, + ` - See [React docs](https://react.dev/reference) and the [\`useMemo\` guide](https://react.dev/reference/react/useMemo).\n`, + ` - Fix: wrap in \`useCallback\` or extract to a ***separate memoized component***.\n\n`, + ], 10); } - chunks.push('---\n\n'); + b.emit('---\n\n'); } - chunks.push('> *Report generated automatically. Please review all suggestions before applying.*\n'); - return chunks; + b.emit('> *Report generated automatically. Please review all suggestions before applying.*\n'); + return b.build(); })(), - // A huge single code block (~200 lines) — stresses the syntax - // highlighter and scroll virtualization within a code block. 
'giant-codeblock': (() => { - const chunks = ['Here is the complete implementation:\n\n```typescript\n']; - chunks.push('import { Disposable, DisposableStore } from "vs/base/common/lifecycle";\n'); - chunks.push('import { Emitter, Event } from "vs/base/common/event";\n'); - chunks.push('import { URI } from "vs/base/common/uri";\n\n'); + const b = new ScenarioBuilder(); + b.emit('Here is the complete implementation:\n\n```typescript\n'); + b.stream([ + 'import { Disposable, DisposableStore } from "vs/base/common/lifecycle";\n', + 'import { Emitter, Event } from "vs/base/common/event";\n', + 'import { URI } from "vs/base/common/uri";\n\n', + ], 10); for (let i = 0; i < 40; i++) { - chunks.push(`export class Service${i} extends Disposable {\n`); - chunks.push(` private readonly _onDidChange = this._register(new Emitter());\n`); - chunks.push(` readonly onDidChange: Event = this._onDidChange.event;\n\n`); - chunks.push(` private _value: string = '';\n`); - chunks.push(` get value(): string { return this._value; }\n\n`); - chunks.push(` async update(uri: URI): Promise {\n`); - chunks.push(` this._value = uri.toString();\n`); - chunks.push(` this._onDidChange.fire();\n`); - chunks.push(` }\n`); - chunks.push('}\n\n'); + b.stream([ + `export class Service${i} extends Disposable {\n`, + ` private readonly _onDidChange = this._register(new Emitter());\n`, + ` readonly onDidChange: Event = this._onDidChange.event;\n\n`, + ` private _value: string = '';\n`, + ` get value(): string { return this._value; }\n\n`, + ` async update(uri: URI): Promise {\n`, + ` this._value = uri.toString();\n`, + ` this._onDidChange.fire();\n`, + ` }\n`, + '}\n\n', + ], 5); } - chunks.push('```\n\nThis defines 40 service classes following the standard VS Code pattern.\n'); - return chunks; + b.emit('```\n\nThis defines 40 service classes following the standard VS Code pattern.\n'); + return b.build(); })(), - // 1000 very small chunks — stresses the streaming SSE pipeline - // and incremental DOM 
updates with high chunk frequency. 'rapid-stream': (() => { - const chunks = []; - for (let i = 0; i < 1000; i++) { - chunks.push(`w${i} `); - } - return chunks; + const b = new ScenarioBuilder(); + const words = []; + for (let i = 0; i < 1000; i++) { words.push(`w${i} `); } + // Very fast inter-chunk delay to stress the streaming pipeline + b.stream(words, 2); + return b.build(); })(), - // Many file URI references — stresses link detection, file - // resolution, path rendering, hover providers, and inline - // anchor widget creation. 'file-links': (() => { const files = [ 'src/vs/workbench/contrib/chat/browser/chatListRenderer.ts', @@ -192,24 +296,114 @@ const SCENARIOS = { 'src/vs/editor/browser/widget/codeEditor/editor.ts', 'src/vs/workbench/browser/parts/editor/editorGroupView.ts', ]; - const chunks = ['I found references to the disposable pattern across the following files:\n\n']; + const b = new ScenarioBuilder(); + b.emit('I found references to the disposable pattern across the following files:\n\n'); for (let i = 0; i < files.length; i++) { const line = Math.floor(Math.random() * 500) + 1; - chunks.push(`${i + 1}. [${files[i]}](${files[i]}#L${line}) — `); - chunks.push(`Line ${line}: uses \`DisposableStore\` with ${Math.floor(Math.random() * 10) + 1} registrations\n`); + b.stream([ + `${i + 1}. 
[${files[i]}](${files[i]}#L${line}) -- `, + `Line ${line}: uses \`DisposableStore\` with ${Math.floor(Math.random() * 10) + 1} registrations\n`, + ], 15); } - chunks.push('\nAdditionally, the following files import from `vs/base/common/lifecycle`:\n\n'); + b.wait(10, '\nAdditionally, the following files import from `vs/base/common/lifecycle`:\n\n'); for (let i = 0; i < 20; i++) { const depth = ['base', 'platform', 'editor', 'workbench'][i % 4]; const area = ['common', 'browser', 'node', 'electron-browser'][i % 4]; const name = ['service', 'provider', 'contribution', 'handler', 'manager'][i % 5]; const file = `src/vs/${depth}/${area}/${name}${i}.ts`; - chunks.push(`- [${file}](${file}#L${i * 10 + 5})`); - chunks.push(` — imports \`Disposable\`, \`DisposableStore\`\n`); + b.stream([ + `- [${file}](${file}#L${i * 10 + 5})`, + ` -- imports \`Disposable\`, \`DisposableStore\`\n`, + ], 12); } - chunks.push('\nTotal: 32 files reference the disposable pattern.\n'); - return chunks; + b.emit('\nTotal: 32 files reference the disposable pattern.\n'); + return b.build(); })(), + + // -- Tool call scenarios ----------------------------------------------- + + 'tool-read-file': /** @type {MultiTurnScenario} */ ({ + type: 'multi-turn', + turns: [ + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /read.?file/i, + arguments: { + filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), + offset: 1, + limit: 50, + }, + }, + ], + }, + { + kind: 'content', + chunks: new ScenarioBuilder() + .wait(20, 'I read the file `src/vs/base/common/lifecycle.ts`. Here is my analysis:\n\n') + .stream([ + 'The `Disposable` base class provides a standard lifecycle pattern for VS Code components. 
', + 'It maintains a `DisposableStore` internally via `this._store` and exposes `this._register()` ', + 'for subclasses to track their own disposables.\n\n', + 'Key patterns:\n', + '- **`_register()`** — adds a disposable to the internal store, ensuring cleanup on `dispose()`\n', + '- **`DisposableStore`** — a collection that disposes all contained items when itself disposed\n', + '- **`MutableDisposable`** — holds a single disposable that can be swapped; the old one is disposed automatically\n\n', + 'The `toDisposable()` helper wraps a callback into an `IDisposable`, which is convenient for ', + 'one-off cleanup like removing event listeners.\n', + ], 20) + .build(), + }, + ], + }), + + 'tool-edit-file': /** @type {MultiTurnScenario} */ ({ + type: 'multi-turn', + turns: [ + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /read.?file/i, + arguments: { + filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), + offset: 1, + limit: 30, + }, + }, + ], + }, + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, + arguments: { + filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), + oldString: '// perf-benchmark-marker', + newString: '// perf-benchmark-marker (updated)', + explanation: 'Update the benchmark marker comment', + }, + }, + ], + }, + { + kind: 'content', + chunks: new ScenarioBuilder() + .wait(20, 'I have read and edited `src/vs/base/common/lifecycle.ts`.\n\n') + .stream([ + 'The changes I made:\n', + '1. Read the file to understand its structure\n', + '2. Applied the edit to update the benchmark marker comment\n\n', + 'The `Disposable` pattern in this file is the foundation of VS Code\'s lifecycle management. ', + 'All components that own resources should extend `Disposable` and register their cleanup ', + 'handlers via `this._register()`. 
This ensures proper teardown when the component is disposed.\n', + ], 20) + .build(), + }, + ], + }), }; const DEFAULT_SCENARIO = 'text-only'; @@ -255,6 +449,99 @@ function makeInitialChunk() { }; } +/** + * Build a tool-call initial chunk (role only, no content). + */ +function makeToolCallInitialChunk() { + return { + id: 'chatcmpl-perf-benchmark', + object: 'chat.completion.chunk', + created: Math.floor(Date.now() / 1000), + model: MODEL, + choices: [{ + index: 0, + delta: { role: 'assistant', content: null }, + finish_reason: null, + content_filter_results: {}, + }], + usage: null, + }; +} + +/** + * Build a tool-call function-start chunk. + * @param {number} index - tool call index + * @param {string} callId - unique call ID + * @param {string} functionName - tool function name + */ +function makeToolCallStartChunk(index, callId, functionName) { + return { + id: 'chatcmpl-perf-benchmark', + object: 'chat.completion.chunk', + created: Math.floor(Date.now() / 1000), + model: MODEL, + choices: [{ + index: 0, + delta: { + tool_calls: [{ + index, + id: callId, + type: 'function', + function: { name: functionName, arguments: '' }, + }], + }, + finish_reason: null, + content_filter_results: {}, + }], + usage: null, + }; +} + +/** + * Build a tool-call arguments chunk. + * @param {number} index - tool call index + * @param {string} argsFragment - partial JSON arguments + */ +function makeToolCallArgsChunk(index, argsFragment) { + return { + id: 'chatcmpl-perf-benchmark', + object: 'chat.completion.chunk', + created: Math.floor(Date.now() / 1000), + model: MODEL, + choices: [{ + index: 0, + delta: { + tool_calls: [{ + index, + function: { arguments: argsFragment }, + }], + }, + finish_reason: null, + content_filter_results: {}, + }], + usage: null, + }; +} + +/** + * Build a tool-call finish chunk. 
+ */ +function makeToolCallFinishChunk() { + return { + id: 'chatcmpl-perf-benchmark', + object: 'chat.completion.chunk', + created: Math.floor(Date.now() / 1000), + model: MODEL, + choices: [{ + index: 0, + delta: {}, + finish_reason: 'tool_calls', + content_filter_results: {}, + }], + usage: null, + }; +} + // -- Request handler --------------------------------------------------------- /** @@ -505,16 +792,40 @@ function handleRequest(req, res) { /** Emitted when a scenario chat completion is fully served. */ const serverEvents = new EventEmitter(); +/** @param {number} ms */ +const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms)); + +/** + * Count the number of completed assistant→tool round-trips in the conversation. + * Each round-trip = one assistant message with tool_calls followed by one or + * more tool result messages. + * @param {any[]} messages + * @returns {number} + */ +function countCompletedToolRoundTrips(messages) { + let roundTrips = 0; + for (const msg of messages) { + if (msg.role === 'assistant' && msg.tool_calls && msg.tool_calls.length > 0) { + roundTrips++; + } + } + return roundTrips; +} + /** * @param {string} body * @param {http.ServerResponse} res */ -function handleChatCompletions(body, res) { +async function handleChatCompletions(body, res) { let scenarioId = DEFAULT_SCENARIO; let isScenarioRequest = false; + /** @type {string[]} */ + let requestToolNames = []; + /** @type {any[]} */ + let messages = []; try { const parsed = JSON.parse(body); - const messages = parsed.messages || []; + messages = parsed.messages || []; // Log user messages for debugging const userMsgs = messages.filter((/** @type {any} */ m) => m.role === 'user'); if (userMsgs.length > 0) { @@ -524,6 +835,14 @@ function handleChatCompletions(body, res) { const ts = new Date().toISOString().slice(11, -1); console.log(`[mock-llm] ${ts} → ${messages.length} msgs, last user: "${lastContent}"`); } + // Extract available tool names from the request's tools 
array + const tools = parsed.tools || []; + requestToolNames = tools.map((/** @type {any} */ t) => t.function?.name).filter(Boolean); + if (requestToolNames.length > 0) { + const ts = new Date().toISOString().slice(11, -1); + console.log(`[mock-llm] ${ts} → ${requestToolNames.length} tools available: ${requestToolNames.join(', ')}`); + } + const lastUser = [...messages].reverse().find((/** @type {any} */ m) => m.role === 'user'); if (lastUser) { // Extract scenario ID from user message content @@ -540,7 +859,7 @@ function handleChatCompletions(body, res) { } } catch { } - const chunks = SCENARIOS[scenarioId] || SCENARIOS[DEFAULT_SCENARIO]; + const scenario = SCENARIOS[scenarioId] || SCENARIOS[DEFAULT_SCENARIO]; res.writeHead(200, { 'Content-Type': 'text/event-stream', @@ -549,18 +868,66 @@ function handleChatCompletions(body, res) { 'X-Request-Id': 'perf-benchmark-' + Date.now(), }); - // Initial role chunk + // Handle multi-turn scenarios — only when the request actually has tools. + // Ancillary requests (title generation, progress messages) also contain the + // [scenario:...] tag but don't send tools, so they fall through to content. 
+ if (isMultiTurnScenario(scenario) && requestToolNames.length > 0) { + const roundTrips = countCompletedToolRoundTrips(messages); + const turnIndex = Math.min(roundTrips, scenario.turns.length - 1); + const turn = scenario.turns[turnIndex]; + + const ts = new Date().toISOString().slice(11, -1); + console.log(`[mock-llm] ${ts} → multi-turn scenario ${scenarioId}, turn ${turnIndex + 1}/${scenario.turns.length} (${turn.kind}), ${roundTrips} round-trips in history`); + + if (turn.kind === 'tool-calls') { + await streamToolCalls(res, turn.toolCalls, requestToolNames, scenarioId); + return; + } + + // kind === 'content' — stream the final text response + await streamContent(res, turn.chunks, isScenarioRequest); + return; + } + + // Standard content-only scenario (or multi-turn scenario falling back for + // ancillary requests like title generation that don't include tools) + const chunks = isMultiTurnScenario(scenario) + ? getFirstContentTurn(scenario) + : /** @type {StreamChunk[]} */ (scenario); + + await streamContent(res, chunks, isScenarioRequest); +} + +/** + * Get the chunks from the first content turn of a multi-turn scenario, + * used as fallback text for ancillary requests (title generation etc). + * @param {MultiTurnScenario} scenario + * @returns {StreamChunk[]} + */ +function getFirstContentTurn(scenario) { + for (const turn of scenario.turns) { + if (turn.kind === 'content') { + return turn.chunks; + } + } + return SCENARIOS[DEFAULT_SCENARIO]; +} + +/** + * Stream content chunks as a standard SSE response. 
+ * @param {http.ServerResponse} res + * @param {StreamChunk[]} chunks + * @param {boolean} isScenarioRequest + */ +async function streamContent(res, chunks, isScenarioRequest) { res.write(`data: ${JSON.stringify(makeInitialChunk())}\n\n`); - // Content chunks for (const chunk of chunks) { - res.write(`data: ${JSON.stringify(makeChunk(chunk, 0, false))}\n\n`); + if (chunk.delayMs > 0) { await sleep(chunk.delayMs); } + res.write(`data: ${JSON.stringify(makeChunk(chunk.content, 0, false))}\n\n`); } - // Finish chunk res.write(`data: ${JSON.stringify(makeChunk('', 0, true))}\n\n`); - - // Done res.write('data: [DONE]\n\n'); res.end(); @@ -569,6 +936,45 @@ function handleChatCompletions(body, res) { } } +/** + * Stream tool call chunks as an SSE response. + * @param {http.ServerResponse} res + * @param {Array<{ toolNamePattern: RegExp, arguments: Record }>} toolCalls + * @param {string[]} requestToolNames + * @param {string} scenarioId + */ +async function streamToolCalls(res, toolCalls, requestToolNames, scenarioId) { + res.write(`data: ${JSON.stringify(makeToolCallInitialChunk())}\n\n`); + + for (let i = 0; i < toolCalls.length; i++) { + const call = toolCalls[i]; + const callId = `call_perf_${scenarioId}_${i}_${Date.now()}`; + + // Find the matching tool name from the request's tools array + let toolName = requestToolNames.find(name => call.toolNamePattern.test(name)); + if (!toolName) { + toolName = call.toolNamePattern.source.replace(/[\\.|?*+^${}()\[\]]/g, ''); + console.warn(`[mock-llm] No matching tool for pattern ${call.toolNamePattern}, using fallback: ${toolName}`); + } + + // Stream tool call: start chunk, then arguments in fragments + res.write(`data: ${JSON.stringify(makeToolCallStartChunk(i, callId, toolName))}\n\n`); + await sleep(10); + + const argsJson = JSON.stringify(call.arguments); + const fragmentSize = Math.max(20, Math.ceil(argsJson.length / 4)); + for (let pos = 0; pos < argsJson.length; pos += fragmentSize) { + const fragment = 
argsJson.slice(pos, pos + fragmentSize); + res.write(`data: ${JSON.stringify(makeToolCallArgsChunk(i, fragment))}\n\n`); + await sleep(5); + } + } + + res.write(`data: ${JSON.stringify(makeToolCallFinishChunk())}\n\n`); + res.write('data: [DONE]\n\n'); + res.end(); +} + /** * Start the mock server and return a handle. * @param {number} port diff --git a/scripts/chat-perf/common/utils.js b/scripts/chat-perf/common/utils.js index 6e8b60ce3d888..e616ecbfb5719 100644 --- a/scripts/chat-perf/common/utils.js +++ b/scripts/chat-perf/common/utils.js @@ -29,6 +29,8 @@ const SCENARIOS = [ 'giant-codeblock', 'rapid-stream', 'file-links', + 'tool-read-file', + 'tool-edit-file', ]; // -- Electron path resolution ------------------------------------------------ @@ -212,7 +214,19 @@ function prepareRunDir(runId, mockServer) { const userDataDir = path.join(tmpBase, `run-${runId}`); const extDir = path.join(DATA_DIR, 'extensions'); const logsDir = path.join(tmpBase, 'logs', `run-${runId}`); - fs.rmSync(userDataDir, { recursive: true, force: true }); + // Retry rmSync to handle ENOTEMPTY race conditions from Electron cache locks + for (let attempt = 0; attempt < 3; attempt++) { + try { + fs.rmSync(userDataDir, { recursive: true, force: true }); + break; + } catch (err) { + if (attempt < 2 && err.code === 'ENOTEMPTY') { + require('child_process').execSync(`sleep 0.5`); + } else { + throw err; + } + } + } fs.mkdirSync(userDataDir, { recursive: true }); fs.mkdirSync(extDir, { recursive: true }); fs.mkdirSync(logsDir, { recursive: true }); diff --git a/scripts/chat-perf/test-chat-perf-regression.js b/scripts/chat-perf/test-chat-perf-regression.js index 5abf6decd2e3f..90c9e81ab2275 100644 --- a/scripts/chat-perf/test-chat-perf-regression.js +++ b/scripts/chat-perf/test-chat-perf-regression.js @@ -165,6 +165,8 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru await cdp.send('Performance.enable'); const heapBefore = /** @type {any} */ (await 
cdp.send('Runtime.getHeapUsage')); + // Stop any existing tracing session (stable builds may have one active) + try { await cdp.send('Tracing.end'); await new Promise(r => setTimeout(r, 200)); } catch { } await cdp.send('Tracing.start', { traceConfig: { includedCategories: ['v8.gc', 'devtools.timeline'], @@ -713,9 +715,36 @@ async function main() { : null; if (cachedBaseline?.baselineBuildVersion === opts.baselineBuild) { - console.log(`[chat-perf] Using cached baseline for ${opts.baselineBuild}`); - fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2)); - opts.baseline = baselineJsonPath; + // Check if the cache covers all requested scenarios + const cachedScenarios = new Set(Object.keys(cachedBaseline.scenarios || {})); + const missingScenarios = opts.scenarios.filter((/** @type {string} */ s) => !cachedScenarios.has(s)); + + if (missingScenarios.length === 0) { + console.log(`[chat-perf] Using cached baseline for ${opts.baselineBuild}`); + fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2)); + opts.baseline = baselineJsonPath; + } else { + console.log(`[chat-perf] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`); + console.log(`[chat-perf] Running baseline for missing scenarios...`); + const baselineExePath = await resolveBuild(opts.baselineBuild); + for (const scenario of missingScenarios) { + /** @type {RunMetrics[]} */ + const results = []; + for (let i = 0; i < opts.runs; i++) { + try { results.push(await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline')); } + catch (err) { console.error(`[chat-perf] Baseline run ${i + 1} failed: ${err}`); } + } + if (results.length > 0) { + const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, rawRuns: results }); + for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } + 
cachedBaseline.scenarios[scenario] = sd; + } + } + cachedBaseline.runsPerScenario = opts.runs; + fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2)); + fs.writeFileSync(cachedPath, JSON.stringify(cachedBaseline, null, 2)); + opts.baseline = baselineJsonPath; + } } else { const baselineExePath = await resolveBuild(opts.baselineBuild); console.log(`[chat-perf] Benchmarking baseline build (${opts.baselineBuild})...`); @@ -803,7 +832,7 @@ async function main() { // -- JSON output ----------------------------------------------------- const jsonPath = path.join(runDir, 'results.json'); - const jsonReport = { timestamp: new Date().toISOString(), platform: process.platform, runsPerScenario: opts.runs, scenarios: /** @type {Record} */ ({}) }; + const jsonReport = /** @type {{ timestamp: string, platform: NodeJS.Platform, runsPerScenario: number, scenarios: Record, _resultsPath?: string }} */ ({ timestamp: new Date().toISOString(), platform: process.platform, runsPerScenario: opts.runs, scenarios: /** @type {Record} */ ({}) }); for (const [scenario, results] of Object.entries(allResults)) { const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, rawRuns: results }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } @@ -831,7 +860,7 @@ async function main() { /** * Print baseline comparison and exit with code 1 if regressions found. 
* @param {Record} jsonReport - * @param {{ baseline?: string, threshold: number, ci?: boolean, runs?: number, baselineBuild?: string, build?: string }} opts + * @param {{ baseline?: string, threshold: number, ci?: boolean, runs?: number, baselineBuild?: string, build?: string, resume?: string }} opts */ async function printComparison(jsonReport, opts) { let regressionFound = false; From 36ef00775dfd0672ab0cfe54e40254ab547af3f7 Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Tue, 14 Apr 2026 14:23:57 -0700 Subject: [PATCH 04/13] PR --- .github/skills/chat-perf/SKILL.md | 10 +- .gitignore | 2 +- package.json | 4 +- .../common/mock-llm-server.js | 572 ++++++++---------- .../chat-simulation/common/perf-scenarios.js | 511 ++++++++++++++++ .../common/utils.js | 99 +-- .../test-chat-mem-leaks.js | 16 +- .../test-chat-perf-regression.js | 188 ++++-- 8 files changed, 985 insertions(+), 417 deletions(-) rename scripts/{chat-perf => chat-simulation}/common/mock-llm-server.js (59%) create mode 100644 scripts/chat-simulation/common/perf-scenarios.js rename scripts/{chat-perf => chat-simulation}/common/utils.js (87%) rename scripts/{chat-perf => chat-simulation}/test-chat-mem-leaks.js (91%) rename scripts/{chat-perf => chat-simulation}/test-chat-perf-regression.js (83%) diff --git a/.github/skills/chat-perf/SKILL.md b/.github/skills/chat-perf/SKILL.md index a110cafc2edbb..9592b51649131 100644 --- a/.github/skills/chat-perf/SKILL.md +++ b/.github/skills/chat-perf/SKILL.md @@ -28,7 +28,7 @@ npm run perf:chat-leak -- --messages 20 --verbose ## Perf regression test -**Script:** `scripts/chat-perf/test-chat-perf-regression.js` +**Script:** `scripts/chat-simulation/test-chat-perf-regression.js` **npm:** `npm run perf:chat` Launches VS Code via Playwright Electron, opens the chat panel, sends a message with a mock LLM response, and measures timing, layout, and rendering metrics. 
By default, downloads VS Code 1.115.0 as a baseline, benchmarks it, then benchmarks the local dev build and compares. @@ -62,10 +62,10 @@ When results exceed the threshold but aren't statistically significant, the tool npm run perf:chat -- --scenario text-only --runs 3 # Add 3 more runs to the same results file (both test + baseline): -npm run perf:chat -- --resume .chat-perf-data/2026-04-14T02-15-14/results.json --runs 3 +npm run perf:chat -- --resume .chat-simulation-data/2026-04-14T02-15-14/results.json --runs 3 # Keep adding until confidence is reached: -npm run perf:chat -- --resume .chat-perf-data/2026-04-14T02-15-14/results.json --runs 5 +npm run perf:chat -- --resume .chat-simulation-data/2026-04-14T02-15-14/results.json --runs 5 ``` `--resume` loads the previous `results.json` and its associated `baseline-*.json`, runs N more iterations for both builds, merges rawRuns, recomputes stats, and re-runs the comparison. The updated files are written back in-place. You can resume multiple times — samples accumulate. @@ -112,7 +112,7 @@ Results use **IQR-based outlier removal** and **median** (not mean) to handle st ## Memory leak check -**Script:** `scripts/chat-perf/test-chat-mem-leaks.js` +**Script:** `scripts/chat-simulation/test-chat-mem-leaks.js` **npm:** `npm run perf:chat-leak` Launches one VS Code session, sends N messages sequentially, forces GC between each, and measures renderer heap and DOM node count. Uses **linear regression** on the samples to compute per-message growth rate, which is compared against a threshold. 
@@ -141,7 +141,7 @@ Launches one VS Code session, sends N messages sequentially, forces GC between e ## Architecture ``` -scripts/chat-perf/ +scripts/chat-simulation/ ├── common/ │ ├── mock-llm-server.js # Mock CAPI server matching @vscode/copilot-api URL structure │ └── utils.js # Shared: paths, env setup, stats, launch helpers diff --git a/.gitignore b/.gitignore index 7e5189df7aa01..ab9acd25f4555 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,7 @@ product.overrides.json *.snap.actual *.tsbuildinfo .vscode-test -.chat-perf-data +.chat-simulation-data vscode-telemetry-docs/ test-output.json test/componentFixtures/.screenshots/* diff --git a/package.json b/package.json index b9067cf575740..78d6c74a489f5 100644 --- a/package.json +++ b/package.json @@ -79,8 +79,8 @@ "extensions-ci": "npm run gulp extensions-ci", "extensions-ci-pr": "npm run gulp extensions-ci-pr", "perf": "node scripts/code-perf.js", - "perf:chat": "node scripts/chat-perf/test-chat-perf-regression.js", - "perf:chat-leak": "node scripts/chat-perf/test-chat-mem-leaks.js", + "perf:chat": "node scripts/chat-simulation/test-chat-perf-regression.js", + "perf:chat-leak": "node scripts/chat-simulation/test-chat-mem-leaks.js", "copilot:setup": "npm --prefix extensions/copilot run setup", "copilot:get_token": "npm --prefix extensions/copilot run get_token", "update-build-ts-version": "npm install -D typescript@next && npm install -D @typescript/native-preview && (cd build && npm run typecheck)", diff --git a/scripts/chat-perf/common/mock-llm-server.js b/scripts/chat-simulation/common/mock-llm-server.js similarity index 59% rename from scripts/chat-perf/common/mock-llm-server.js rename to scripts/chat-simulation/common/mock-llm-server.js index 2e8429db28b83..4df12376e90c8 100644 --- a/scripts/chat-perf/common/mock-llm-server.js +++ b/scripts/chat-simulation/common/mock-llm-server.js @@ -28,7 +28,7 @@ const ROOT = path.join(__dirname, '..', '..', '..'); */ /** - * A single model turn in a multi-turn 
scenario. + * A single turn in a multi-turn scenario. * * @typedef {{ * kind: 'tool-calls', @@ -36,17 +36,55 @@ const ROOT = path.join(__dirname, '..', '..', '..'); * } | { * kind: 'content', * chunks: StreamChunk[], - * }} ModelTurn + * } | { + * kind: 'thinking', + * thinkingChunks: StreamChunk[], + * chunks: StreamChunk[], + * } | { + * kind: 'user', + * message: string, + * }} ScenarioTurn + */ + +/** + * A scenario turn produced by the model. + * + * @typedef {{ + * kind: 'tool-calls', + * toolCalls: Array<{ toolNamePattern: RegExp, arguments: Record }>, + * } | { + * kind: 'content', + * chunks: StreamChunk[], + * } | { + * kind: 'thinking', + * thinkingChunks: StreamChunk[], + * chunks: StreamChunk[], + * }} ModelScenarioTurn + */ + +/** + * A model turn that emits content chunks. + * + * @typedef {{ + * kind: 'content', + * chunks: StreamChunk[], + * } | { + * kind: 'thinking', + * thinkingChunks: StreamChunk[], + * chunks: StreamChunk[], + * }} ContentScenarioTurn */ /** - * A multi-turn scenario — an ordered sequence of model turns. - * The mock server determines which turn to serve based on the number + * A multi-turn scenario — an ordered sequence of turns. + * The mock server determines which model turn to serve based on the number * of assistant→tool round-trips already present in the conversation. + * User turns are skipped by the server and instead injected by the test + * harness, which types them into the chat input and presses Enter. * * @typedef {{ * type: 'multi-turn', - * turns: ModelTurn[], + * turns: ScenarioTurn[], * }} MultiTurnScenario */ @@ -117,297 +155,21 @@ class ScenarioBuilder { } /** @type {Record} */ -const SCENARIOS = { - 'text-only': new ScenarioBuilder() - .stream([ - 'Here is an explanation of the code you selected:\n\n', - 'The function `processItems` iterates over the input array and applies a transformation to each element. 
', - 'It uses a `Map` to track previously seen values, which allows it to deduplicate results efficiently in O(n) time.\n\n', - 'The algorithm works in a single pass: for every element, it computes the transformed value, ', - 'checks membership in the set, and conditionally appends to the output array. ', - 'This is a common pattern in data processing pipelines where uniqueness constraints must be maintained.\n\n', - 'Edge cases to consider include empty arrays, duplicate transformations that produce the same key, ', - 'and items where the transform function itself is expensive.\n\n', - 'The time complexity is **O(n)** and the space complexity is **O(n)** in the worst case when all items are unique.\n', - ], 20) - .build(), - - 'large-codeblock': new ScenarioBuilder() - .stream([ - 'Here is the refactored implementation:\n\n', - '```typescript\n', - 'import { EventEmitter } from "events";\n\n', - 'interface CacheEntry {\n value: T;\n expiresAt: number;\n accessCount: number;\n}\n\n', - 'export class LRUCache {\n', - ' private readonly _map = new Map>();\n', - ' private readonly _emitter = new EventEmitter();\n\n', - ' constructor(\n private readonly _maxSize: number,\n private readonly _ttlMs: number = 60_000,\n ) {}\n\n', - ' get(key: K): V | undefined {\n const entry = this._map.get(key);\n if (!entry) { return undefined; }\n', - ' if (Date.now() > entry.expiresAt) {\n this._map.delete(key);\n this._emitter.emit("evict", key);\n return undefined;\n }\n', - ' entry.accessCount++;\n this._map.delete(key);\n this._map.set(key, entry);\n return entry.value;\n }\n\n', - ' set(key: K, value: V): void {\n if (this._map.size >= this._maxSize) {\n', - ' const oldest = this._map.keys().next().value;\n if (oldest !== undefined) {\n this._map.delete(oldest);\n this._emitter.emit("evict", oldest);\n }\n }\n', - ' this._map.set(key, { value, expiresAt: Date.now() + this._ttlMs, accessCount: 0 });\n }\n\n', - ' clear(): void { this._map.clear(); this._emitter.emit("clear"); 
}\n', - ' get size(): number { return this._map.size; }\n', - ' onEvict(listener: (key: K) => void): void { this._emitter.on("evict", listener); }\n}\n', - '```\n\n', - 'The key changes:\n- Added TTL-based expiry with configurable timeout\n- LRU eviction uses Map insertion order\n- EventEmitter notifies on evictions for cache observability\n', - ], 20) - .build(), - - 'many-small-chunks': (() => { - const words = ['Generating detailed analysis:\n\n']; - for (let i = 0; i < 200; i++) { words.push(`Word${i} `); } - words.push('\n\nAnalysis complete.\n'); - const b = new ScenarioBuilder(); - b.stream(words, 5); - return b.build(); - })(), - - 'mixed-content': new ScenarioBuilder() - .stream([ - '## Issue Found\n\n', - 'The `DisposableStore` is not being disposed in the `deactivate` path, ', - 'which can lead to memory leaks.\n\n', - '### Current Code\n\n', - '```typescript\nclass MyService {\n private store = new DisposableStore();\n // missing dispose!\n}\n```\n\n', - '### Suggested Fix\n\n', - '```typescript\nclass MyService extends Disposable {\n', - ' private readonly store = this._register(new DisposableStore());\n\n', - ' override dispose(): void {\n this.store.dispose();\n super.dispose();\n }\n}\n```\n\n', - 'This ensures the store is cleaned up when the service is disposed via the workbench lifecycle.\n', - ], 20) - .build(), - - // -- Stress-test scenarios -------------------------------------------- - - 'many-codeblocks': (() => { - const b = new ScenarioBuilder(); - b.emit('Here are the implementations for each module:\n\n'); - for (let i = 0; i < 10; i++) { - b.wait(10, `### Module ${i + 1}: \`handler${i}.ts\`\n\n`); - b.emit('```typescript\n'); - const lines = []; - for (let j = 0; j < 15; j++) { - lines.push(`export function handle${i}_${j}(input: string): string {\n`); - lines.push(` const result = input.trim().split('').reverse().join('');\n`); - lines.push(` return \`[\${result}] processed by handler ${i}_${j}\`;\n`); - lines.push('}\n\n'); - } - 
b.stream(lines, 5); - b.emit('```\n\n'); - } - b.emit('All modules implement the same pattern with unique handler IDs.\n'); - return b.build(); - })(), - - 'long-prose': (() => { - const sentences = [ - 'The architecture follows a layered dependency injection pattern where each service declares its dependencies through constructor parameters. ', - 'This approach ensures that circular dependencies are detected at compile time rather than at runtime, which significantly reduces debugging overhead. ', - 'When a service is instantiated, the instantiation service resolves all of its dependencies recursively, creating a directed acyclic graph of service instances. ', - 'Each service is a singleton within its scope, meaning that multiple consumers of the same service interface receive the same instance. ', - 'The workbench lifecycle manages the creation and disposal of these services through well-defined phases: creation, restoration, and eventual shutdown. ', - 'During the restoration phase, services that persist state across sessions reload their data from storage, which may involve asynchronous operations. ', - 'Contributors register their functionality through extension points, which are processed during the appropriate lifecycle phase. ', - 'This contribution model allows features to be added without modifying the core workbench code, maintaining a clean separation of concerns. 
', - ]; - const b = new ScenarioBuilder(); - b.emit('# Detailed Architecture Analysis\n\n'); - for (let para = 0; para < 15; para++) { - b.wait(15, `## Section ${para + 1}: ${['Overview', 'Design Patterns', 'Service Layer', 'Event System', 'State Management', 'Error Handling', 'Performance', 'Testing', 'Deployment', 'Monitoring', 'Security', 'Extensibility', 'Compatibility', 'Migration', 'Future Work'][para]}\n\n`); - const paraSentences = []; - for (let s = 0; s < 25; s++) { paraSentences.push(sentences[s % sentences.length]); } - b.stream(paraSentences, 8); - b.emit('\n\n'); - } - return b.build(); - })(), - - 'rich-markdown': (() => { - const b = new ScenarioBuilder(); - b.emit('# Comprehensive Code Review Report\n\n'); - b.wait(15, '> **Summary**: Found 12 issues across 4 severity levels.\n\n'); - for (let section = 0; section < 6; section++) { - b.wait(10, `## ${section + 1}. ${['Critical Issues', 'Performance Concerns', 'Code Style', 'Documentation Gaps', 'Test Coverage', 'Security Review'][section]}\n\n`); - for (let item = 0; item < 5; item++) { - b.stream([ - `${item + 1}. **Issue ${section * 5 + item + 1}**: \`${['useState', 'useEffect', 'useMemo', 'useCallback', 'useRef'][item]}\` in \`src/components/Widget${item}.tsx\`\n`, - ` - Severity: ${['[Critical]', '[Warning]', '[Info]', '[Suggestion]', '[Note]'][item]}\n`, - ` - The current implementation uses *unnecessary re-renders* due to missing dependency arrays.\n`, - ` - See [React docs](https://react.dev/reference) and the [\`useMemo\` guide](https://react.dev/reference/react/useMemo).\n`, - ` - Fix: wrap in \`useCallback\` or extract to a ***separate memoized component***.\n\n`, - ], 10); - } - b.emit('---\n\n'); - } - b.emit('> *Report generated automatically. 
Please review all suggestions before applying.*\n'); - return b.build(); - })(), - - 'giant-codeblock': (() => { - const b = new ScenarioBuilder(); - b.emit('Here is the complete implementation:\n\n```typescript\n'); - b.stream([ - 'import { Disposable, DisposableStore } from "vs/base/common/lifecycle";\n', - 'import { Emitter, Event } from "vs/base/common/event";\n', - 'import { URI } from "vs/base/common/uri";\n\n', - ], 10); - for (let i = 0; i < 40; i++) { - b.stream([ - `export class Service${i} extends Disposable {\n`, - ` private readonly _onDidChange = this._register(new Emitter());\n`, - ` readonly onDidChange: Event = this._onDidChange.event;\n\n`, - ` private _value: string = '';\n`, - ` get value(): string { return this._value; }\n\n`, - ` async update(uri: URI): Promise {\n`, - ` this._value = uri.toString();\n`, - ` this._onDidChange.fire();\n`, - ` }\n`, - '}\n\n', - ], 5); - } - b.emit('```\n\nThis defines 40 service classes following the standard VS Code pattern.\n'); - return b.build(); - })(), - - 'rapid-stream': (() => { - const b = new ScenarioBuilder(); - const words = []; - for (let i = 0; i < 1000; i++) { words.push(`w${i} `); } - // Very fast inter-chunk delay to stress the streaming pipeline - b.stream(words, 2); - return b.build(); - })(), - - 'file-links': (() => { - const files = [ - 'src/vs/workbench/contrib/chat/browser/chatListRenderer.ts', - 'src/vs/workbench/contrib/chat/common/chatService/chatServiceImpl.ts', - 'src/vs/workbench/contrib/chat/browser/widget/input/chatInputPart.ts', - 'src/vs/workbench/contrib/chat/common/chatPerf.ts', - 'src/vs/base/common/lifecycle.ts', - 'src/vs/base/common/event.ts', - 'src/vs/platform/instantiation/common/instantiation.ts', - 'src/vs/workbench/services/extensions/common/abstractExtensionService.ts', - 'src/vs/workbench/api/common/extHostLanguageModels.ts', - 'src/vs/workbench/contrib/chat/common/languageModels.ts', - 'src/vs/editor/browser/widget/codeEditor/editor.ts', - 
'src/vs/workbench/browser/parts/editor/editorGroupView.ts', - ]; - const b = new ScenarioBuilder(); - b.emit('I found references to the disposable pattern across the following files:\n\n'); - for (let i = 0; i < files.length; i++) { - const line = Math.floor(Math.random() * 500) + 1; - b.stream([ - `${i + 1}. [${files[i]}](${files[i]}#L${line}) -- `, - `Line ${line}: uses \`DisposableStore\` with ${Math.floor(Math.random() * 10) + 1} registrations\n`, - ], 15); - } - b.wait(10, '\nAdditionally, the following files import from `vs/base/common/lifecycle`:\n\n'); - for (let i = 0; i < 20; i++) { - const depth = ['base', 'platform', 'editor', 'workbench'][i % 4]; - const area = ['common', 'browser', 'node', 'electron-browser'][i % 4]; - const name = ['service', 'provider', 'contribution', 'handler', 'manager'][i % 5]; - const file = `src/vs/${depth}/${area}/${name}${i}.ts`; - b.stream([ - `- [${file}](${file}#L${i * 10 + 5})`, - ` -- imports \`Disposable\`, \`DisposableStore\`\n`, - ], 12); - } - b.emit('\nTotal: 32 files reference the disposable pattern.\n'); - return b.build(); - })(), - - // -- Tool call scenarios ----------------------------------------------- - - 'tool-read-file': /** @type {MultiTurnScenario} */ ({ - type: 'multi-turn', - turns: [ - { - kind: 'tool-calls', - toolCalls: [ - { - toolNamePattern: /read.?file/i, - arguments: { - filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), - offset: 1, - limit: 50, - }, - }, - ], - }, - { - kind: 'content', - chunks: new ScenarioBuilder() - .wait(20, 'I read the file `src/vs/base/common/lifecycle.ts`. Here is my analysis:\n\n') - .stream([ - 'The `Disposable` base class provides a standard lifecycle pattern for VS Code components. 
', - 'It maintains a `DisposableStore` internally via `this._store` and exposes `this._register()` ', - 'for subclasses to track their own disposables.\n\n', - 'Key patterns:\n', - '- **`_register()`** — adds a disposable to the internal store, ensuring cleanup on `dispose()`\n', - '- **`DisposableStore`** — a collection that disposes all contained items when itself disposed\n', - '- **`MutableDisposable`** — holds a single disposable that can be swapped; the old one is disposed automatically\n\n', - 'The `toDisposable()` helper wraps a callback into an `IDisposable`, which is convenient for ', - 'one-off cleanup like removing event listeners.\n', - ], 20) - .build(), - }, - ], - }), - - 'tool-edit-file': /** @type {MultiTurnScenario} */ ({ - type: 'multi-turn', - turns: [ - { - kind: 'tool-calls', - toolCalls: [ - { - toolNamePattern: /read.?file/i, - arguments: { - filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), - offset: 1, - limit: 30, - }, - }, - ], - }, - { - kind: 'tool-calls', - toolCalls: [ - { - toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, - arguments: { - filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), - oldString: '// perf-benchmark-marker', - newString: '// perf-benchmark-marker (updated)', - explanation: 'Update the benchmark marker comment', - }, - }, - ], - }, - { - kind: 'content', - chunks: new ScenarioBuilder() - .wait(20, 'I have read and edited `src/vs/base/common/lifecycle.ts`.\n\n') - .stream([ - 'The changes I made:\n', - '1. Read the file to understand its structure\n', - '2. Applied the edit to update the benchmark marker comment\n\n', - 'The `Disposable` pattern in this file is the foundation of VS Code\'s lifecycle management. ', - 'All components that own resources should extend `Disposable` and register their cleanup ', - 'handlers via `this._register()`. 
This ensures proper teardown when the component is disposed.\n', - ], 20) - .build(), - }, - ], - }), -}; +const SCENARIOS = /** @type {Record} */ ({}); const DEFAULT_SCENARIO = 'text-only'; +/** + * @returns {StreamChunk[]} + */ +function getDefaultScenarioChunks() { + const scenario = SCENARIOS[DEFAULT_SCENARIO]; + if (isMultiTurnScenario(scenario)) { + throw new Error(`Default scenario '${DEFAULT_SCENARIO}' must be content-only`); + } + return scenario; +} + // -- SSE chunk builder ------------------------------------------------------- const MODEL = 'gpt-4o-2024-08-06'; @@ -542,6 +304,47 @@ function makeToolCallFinishChunk() { }; } +/** + * Build a thinking (chain-of-thought summary) chunk. + * Uses the `cot_summary` field in the delta, matching the Copilot API wire format. + * @param {string} text - thinking text fragment + */ +function makeThinkingChunk(text) { + return { + id: 'chatcmpl-perf-benchmark', + object: 'chat.completion.chunk', + created: Math.floor(Date.now() / 1000), + model: MODEL, + choices: [{ + index: 0, + delta: { cot_summary: text }, + finish_reason: null, + content_filter_results: {}, + }], + usage: null, + }; +} + +/** + * Build a thinking ID chunk (sent after thinking text to close the block). + * @param {string} cotId - unique chain-of-thought ID + */ +function makeThinkingIdChunk(cotId) { + return { + id: 'chatcmpl-perf-benchmark', + object: 'chat.completion.chunk', + created: Math.floor(Date.now() / 1000), + model: MODEL, + choices: [{ + index: 0, + delta: { cot_id: cotId }, + finish_reason: null, + content_filter_results: {}, + }], + usage: null, + }; +} + // -- Request handler --------------------------------------------------------- /** @@ -796,20 +599,54 @@ const serverEvents = new EventEmitter(); const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms)); /** - * Count the number of completed assistant→tool round-trips in the conversation. 
- * Each round-trip = one assistant message with tool_calls followed by one or - * more tool result messages. + * Count the number of model turns already completed in the conversation. + * A model turn is one of: + * - An assistant message with tool_calls (tool-calls turn) + * - An assistant message with content but no tool_calls (content/thinking turn) + * The first assistant message after each user message counts as a new model + * turn. User turns in the scenario are detected by counting user messages + * beyond the initial one. * @param {any[]} messages * @returns {number} */ -function countCompletedToolRoundTrips(messages) { - let roundTrips = 0; +function countCompletedModelTurns(messages) { + let turns = 0; for (const msg of messages) { - if (msg.role === 'assistant' && msg.tool_calls && msg.tool_calls.length > 0) { - roundTrips++; + if (msg.role === 'assistant') { + turns++; } } - return roundTrips; + return turns; +} + +/** + * Count user messages in the conversation (including the initial one). + * @param {any[]} messages + * @returns {number} + */ +function countUserMessages(messages) { + return messages.filter((/** @type {any} */ m) => m.role === 'user').length; +} + +/** + * Compute the model-turn index for the current request given the scenario's + * turn list. User turns are skipped (they're handled by the test harness) + * and do not consume a model turn index. + * + * The algorithm counts completed assistant messages in the conversation + * history (each one = one served model turn), then maps that to the + * n-th model turn in the scenario (skipping user turns). 
+ * + * @param {ScenarioTurn[]} turns + * @param {any[]} messages + * @returns {{ turn: ModelScenarioTurn, turnIndex: number }} + */ +function resolveCurrentTurn(turns, messages) { + const completedModelTurns = countCompletedModelTurns(messages); + // Build the model-only turn list (skip user turns) + const modelTurns = /** @type {ModelScenarioTurn[]} */ (turns.filter(t => t.kind !== 'user')); + const idx = Math.min(completedModelTurns, modelTurns.length - 1); + return { turn: modelTurns[idx], turnIndex: idx }; } /** @@ -843,18 +680,20 @@ async function handleChatCompletions(body, res) { console.log(`[mock-llm] ${ts} → ${requestToolNames.length} tools available: ${requestToolNames.join(', ')}`); } - const lastUser = [...messages].reverse().find((/** @type {any} */ m) => m.role === 'user'); - if (lastUser) { - // Extract scenario ID from user message content - const content = typeof lastUser.content === 'string' - ? lastUser.content - : Array.isArray(lastUser.content) - ? lastUser.content.map((/** @type {any} */ c) => c.text || '').join('') + // Search all user messages for the scenario tag (not just the last one, + // since follow-up user messages in multi-turn scenarios won't have it). + for (const msg of messages) { + if (msg.role !== 'user') { continue; } + const content = typeof msg.content === 'string' + ? msg.content + : Array.isArray(msg.content) + ? msg.content.map((/** @type {any} */ c) => c.text || '').join('') : ''; const match = content.match(/\[scenario:([^\]]+)\]/); if (match && SCENARIOS[match[1]]) { scenarioId = match[1]; isScenarioRequest = true; + break; } } } catch { } @@ -872,18 +711,22 @@ async function handleChatCompletions(body, res) { // Ancillary requests (title generation, progress messages) also contain the // [scenario:...] tag but don't send tools, so they fall through to content. 
if (isMultiTurnScenario(scenario) && requestToolNames.length > 0) { - const roundTrips = countCompletedToolRoundTrips(messages); - const turnIndex = Math.min(roundTrips, scenario.turns.length - 1); - const turn = scenario.turns[turnIndex]; + const { turn, turnIndex } = resolveCurrentTurn(scenario.turns, messages); + const modelTurnCount = scenario.turns.filter(t => t.kind !== 'user').length; const ts = new Date().toISOString().slice(11, -1); - console.log(`[mock-llm] ${ts} → multi-turn scenario ${scenarioId}, turn ${turnIndex + 1}/${scenario.turns.length} (${turn.kind}), ${roundTrips} round-trips in history`); + console.log(`[mock-llm] ${ts} → multi-turn scenario ${scenarioId}, model turn ${turnIndex + 1}/${modelTurnCount} (${turn.kind}), ${countCompletedModelTurns(messages)} completed turns in history`); if (turn.kind === 'tool-calls') { await streamToolCalls(res, turn.toolCalls, requestToolNames, scenarioId); return; } + if (turn.kind === 'thinking') { + await streamThinkingThenContent(res, turn.thinkingChunks, turn.chunks, isScenarioRequest); + return; + } + // kind === 'content' — stream the final text response await streamContent(res, turn.chunks, isScenarioRequest); return; @@ -905,12 +748,19 @@ async function handleChatCompletions(body, res) { * @returns {StreamChunk[]} */ function getFirstContentTurn(scenario) { + /** @type {ContentScenarioTurn | undefined} */ + let contentTurn; for (const turn of scenario.turns) { if (turn.kind === 'content') { - return turn.chunks; + contentTurn = turn; + break; + } + if (turn.kind === 'thinking') { + contentTurn = turn; + break; } } - return SCENARIOS[DEFAULT_SCENARIO]; + return contentTurn?.chunks ?? getDefaultScenarioChunks(); } /** @@ -936,6 +786,44 @@ async function streamContent(res, chunks, isScenarioRequest) { } } +/** + * Stream thinking chunks followed by content chunks as an SSE response. 
+ * Thinking is emitted as `cot_summary` deltas, then a `cot_id` to close the + * thinking block, followed by standard content deltas. + * @param {http.ServerResponse} res + * @param {StreamChunk[]} thinkingChunks + * @param {StreamChunk[]} contentChunks + * @param {boolean} isScenarioRequest + */ +async function streamThinkingThenContent(res, thinkingChunks, contentChunks, isScenarioRequest) { + res.write(`data: ${JSON.stringify(makeInitialChunk())}\n\n`); + + // Stream thinking text + for (const chunk of thinkingChunks) { + if (chunk.delayMs > 0) { await sleep(chunk.delayMs); } + res.write(`data: ${JSON.stringify(makeThinkingChunk(chunk.content))}\n\n`); + } + + // Close thinking block with ID + const cotId = `cot_perf_${Date.now()}`; + res.write(`data: ${JSON.stringify(makeThinkingIdChunk(cotId))}\n\n`); + await sleep(10); + + // Stream content + for (const chunk of contentChunks) { + if (chunk.delayMs > 0) { await sleep(chunk.delayMs); } + res.write(`data: ${JSON.stringify(makeChunk(chunk.content, 0, false))}\n\n`); + } + + res.write(`data: ${JSON.stringify(makeChunk('', 0, true))}\n\n`); + res.write('data: [DONE]\n\n'); + res.end(); + + if (isScenarioRequest) { + serverEvents.emit('scenarioCompletion'); + } +} + /** * Stream tool call chunks as an SSE response. * @param {http.ServerResponse} res @@ -1048,6 +936,8 @@ function startServer(port = 0) { // Allow running standalone for testing: node scripts/mock-llm-server.js if (require.main === module) { + const { registerPerfScenarios } = require('./perf-scenarios'); + registerPerfScenarios(); const port = parseInt(process.argv[2] || '0', 10); startServer(port).then((/** @type {any} */ handle) => { console.log(`Mock LLM server listening at ${handle.url}`); @@ -1055,4 +945,56 @@ if (require.main === module) { }); } -module.exports = { startServer, SCENARIOS }; +/** + * Get the user follow-up messages for a scenario, in order. 
+ * Returns an array of { message, afterModelTurn } objects where afterModelTurn + * is the 0-based index of the model turn after which this user message should + * be injected. + * @param {string} scenarioId + * @returns {Array<{ message: string, afterModelTurn: number }>} + */ +function getUserTurns(scenarioId) { + const scenario = SCENARIOS[scenarioId]; + if (!isMultiTurnScenario(scenario)) { return []; } + const result = []; + let modelTurnsSeen = 0; + for (const turn of scenario.turns) { + if (turn.kind === 'user') { + result.push({ message: turn.message, afterModelTurn: modelTurnsSeen }); + } else { + modelTurnsSeen++; + } + } + return result; +} + +/** + * Get the total number of model turns (non-user turns) in a scenario. + * @param {string} scenarioId + * @returns {number} + */ +function getModelTurnCount(scenarioId) { + const scenario = SCENARIOS[scenarioId]; + if (!isMultiTurnScenario(scenario)) { return 1; } + return scenario.turns.filter(t => t.kind !== 'user').length; +} + +/** + * Register a scenario dynamically. Test files call this to add + * scenarios that are only relevant to them. + * @param {string} id - unique scenario identifier + * @param {StreamChunk[] | MultiTurnScenario} definition - scenario data + */ +function registerScenario(id, definition) { + SCENARIOS[id] = definition; +} + +/** + * Return the IDs of all currently registered scenarios. 
+ * @returns {string[]} + */ +function getScenarioIds() { + return Object.keys(SCENARIOS); +} + +module.exports = { startServer, SCENARIOS, ScenarioBuilder, registerScenario, getScenarioIds, getUserTurns, getModelTurnCount }; diff --git a/scripts/chat-simulation/common/perf-scenarios.js b/scripts/chat-simulation/common/perf-scenarios.js new file mode 100644 index 0000000000000..ce46effc816e4 --- /dev/null +++ b/scripts/chat-simulation/common/perf-scenarios.js @@ -0,0 +1,511 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// @ts-check + +/** + * Built-in scenario definitions for chat performance benchmarks and leak checks. + * + * Each test file imports this module and calls `registerScenario()` for the + * scenarios it needs, keeping scenario ownership close to the test that uses it. + */ + +const path = require('path'); +const { ScenarioBuilder, registerScenario } = require('./mock-llm-server'); + +const ROOT = path.join(__dirname, '..', '..', '..'); + +// -- Content-only scenarios --------------------------------------------------- + +/** @type {Record} */ +const CONTENT_SCENARIOS = { + 'text-only': new ScenarioBuilder() + .stream([ + 'Here is an explanation of the code you selected:\n\n', + 'The function `processItems` iterates over the input array and applies a transformation to each element. ', + 'It uses a `Map` to track previously seen values, which allows it to deduplicate results efficiently in O(n) time.\n\n', + 'The algorithm works in a single pass: for every element, it computes the transformed value, ', + 'checks membership in the set, and conditionally appends to the output array. 
', + 'This is a common pattern in data processing pipelines where uniqueness constraints must be maintained.\n\n', + 'Edge cases to consider include empty arrays, duplicate transformations that produce the same key, ', + 'and items where the transform function itself is expensive.\n\n', + 'The time complexity is **O(n)** and the space complexity is **O(n)** in the worst case when all items are unique.\n', + ], 20) + .build(), + + 'large-codeblock': new ScenarioBuilder() + .stream([ + 'Here is the refactored implementation:\n\n', + '```typescript\n', + 'import { EventEmitter } from "events";\n\n', + 'interface CacheEntry {\n value: T;\n expiresAt: number;\n accessCount: number;\n}\n\n', + 'export class LRUCache {\n', + ' private readonly _map = new Map>();\n', + ' private readonly _emitter = new EventEmitter();\n\n', + ' constructor(\n private readonly _maxSize: number,\n private readonly _ttlMs: number = 60_000,\n ) {}\n\n', + ' get(key: K): V | undefined {\n const entry = this._map.get(key);\n if (!entry) { return undefined; }\n', + ' if (Date.now() > entry.expiresAt) {\n this._map.delete(key);\n this._emitter.emit("evict", key);\n return undefined;\n }\n', + ' entry.accessCount++;\n this._map.delete(key);\n this._map.set(key, entry);\n return entry.value;\n }\n\n', + ' set(key: K, value: V): void {\n if (this._map.size >= this._maxSize) {\n', + ' const oldest = this._map.keys().next().value;\n if (oldest !== undefined) {\n this._map.delete(oldest);\n this._emitter.emit("evict", oldest);\n }\n }\n', + ' this._map.set(key, { value, expiresAt: Date.now() + this._ttlMs, accessCount: 0 });\n }\n\n', + ' clear(): void { this._map.clear(); this._emitter.emit("clear"); }\n', + ' get size(): number { return this._map.size; }\n', + ' onEvict(listener: (key: K) => void): void { this._emitter.on("evict", listener); }\n}\n', + '```\n\n', + 'The key changes:\n- Added TTL-based expiry with configurable timeout\n- LRU eviction uses Map insertion order\n- EventEmitter notifies on 
evictions for cache observability\n', + ], 20) + .build(), + + 'many-small-chunks': (() => { + const words = ['Generating detailed analysis:\n\n']; + for (let i = 0; i < 200; i++) { words.push(`Word${i} `); } + words.push('\n\nAnalysis complete.\n'); + const b = new ScenarioBuilder(); + b.stream(words, 5); + return b.build(); + })(), + + 'mixed-content': new ScenarioBuilder() + .stream([ + '## Issue Found\n\n', + 'The `DisposableStore` is not being disposed in the `deactivate` path, ', + 'which can lead to memory leaks.\n\n', + '### Current Code\n\n', + '```typescript\nclass MyService {\n private store = new DisposableStore();\n // missing dispose!\n}\n```\n\n', + '### Suggested Fix\n\n', + '```typescript\nclass MyService extends Disposable {\n', + ' private readonly store = this._register(new DisposableStore());\n\n', + ' override dispose(): void {\n this.store.dispose();\n super.dispose();\n }\n}\n```\n\n', + 'This ensures the store is cleaned up when the service is disposed via the workbench lifecycle.\n', + ], 20) + .build(), + + // -- Stress-test scenarios -------------------------------------------- + + 'many-codeblocks': (() => { + const b = new ScenarioBuilder(); + b.emit('Here are the implementations for each module:\n\n'); + for (let i = 0; i < 10; i++) { + b.wait(10, `### Module ${i + 1}: \`handler${i}.ts\`\n\n`); + b.emit('```typescript\n'); + const lines = []; + for (let j = 0; j < 15; j++) { + lines.push(`export function handle${i}_${j}(input: string): string {\n`); + lines.push(` const result = input.trim().split('').reverse().join('');\n`); + lines.push(` return \`[\${result}] processed by handler ${i}_${j}\`;\n`); + lines.push('}\n\n'); + } + b.stream(lines, 5); + b.emit('```\n\n'); + } + b.emit('All modules implement the same pattern with unique handler IDs.\n'); + return b.build(); + })(), + + 'long-prose': (() => { + const sentences = [ + 'The architecture follows a layered dependency injection pattern where each service declares its 
dependencies through constructor parameters. ', + 'This approach ensures that circular dependencies are detected at compile time rather than at runtime, which significantly reduces debugging overhead. ', + 'When a service is instantiated, the instantiation service resolves all of its dependencies recursively, creating a directed acyclic graph of service instances. ', + 'Each service is a singleton within its scope, meaning that multiple consumers of the same service interface receive the same instance. ', + 'The workbench lifecycle manages the creation and disposal of these services through well-defined phases: creation, restoration, and eventual shutdown. ', + 'During the restoration phase, services that persist state across sessions reload their data from storage, which may involve asynchronous operations. ', + 'Contributors register their functionality through extension points, which are processed during the appropriate lifecycle phase. ', + 'This contribution model allows features to be added without modifying the core workbench code, maintaining a clean separation of concerns. ', + ]; + const b = new ScenarioBuilder(); + b.emit('# Detailed Architecture Analysis\n\n'); + for (let para = 0; para < 15; para++) { + b.wait(15, `## Section ${para + 1}: ${['Overview', 'Design Patterns', 'Service Layer', 'Event System', 'State Management', 'Error Handling', 'Performance', 'Testing', 'Deployment', 'Monitoring', 'Security', 'Extensibility', 'Compatibility', 'Migration', 'Future Work'][para]}\n\n`); + const paraSentences = []; + for (let s = 0; s < 25; s++) { paraSentences.push(sentences[s % sentences.length]); } + b.stream(paraSentences, 8); + b.emit('\n\n'); + } + return b.build(); + })(), + + 'rich-markdown': (() => { + const b = new ScenarioBuilder(); + b.emit('# Comprehensive Code Review Report\n\n'); + b.wait(15, '> **Summary**: Found 12 issues across 4 severity levels.\n\n'); + for (let section = 0; section < 6; section++) { + b.wait(10, `## ${section + 1}. 
${['Critical Issues', 'Performance Concerns', 'Code Style', 'Documentation Gaps', 'Test Coverage', 'Security Review'][section]}\n\n`); + for (let item = 0; item < 5; item++) { + b.stream([ + `${item + 1}. **Issue ${section * 5 + item + 1}**: \`${['useState', 'useEffect', 'useMemo', 'useCallback', 'useRef'][item]}\` in \`src/components/Widget${item}.tsx\`\n`, + ` - Severity: ${['[Critical]', '[Warning]', '[Info]', '[Suggestion]', '[Note]'][item]}\n`, + ` - The current implementation uses *unnecessary re-renders* due to missing dependency arrays.\n`, + ` - See [React docs](https://react.dev/reference) and the [\`useMemo\` guide](https://react.dev/reference/react/useMemo).\n`, + ` - Fix: wrap in \`useCallback\` or extract to a ***separate memoized component***.\n\n`, + ], 10); + } + b.emit('---\n\n'); + } + b.emit('> *Report generated automatically. Please review all suggestions before applying.*\n'); + return b.build(); + })(), + + 'giant-codeblock': (() => { + const b = new ScenarioBuilder(); + b.emit('Here is the complete implementation:\n\n```typescript\n'); + b.stream([ + 'import { Disposable, DisposableStore } from "vs/base/common/lifecycle";\n', + 'import { Emitter, Event } from "vs/base/common/event";\n', + 'import { URI } from "vs/base/common/uri";\n\n', + ], 10); + for (let i = 0; i < 40; i++) { + b.stream([ + `export class Service${i} extends Disposable {\n`, + ` private readonly _onDidChange = this._register(new Emitter());\n`, + ` readonly onDidChange: Event = this._onDidChange.event;\n\n`, + ` private _value: string = '';\n`, + ` get value(): string { return this._value; }\n\n`, + ` async update(uri: URI): Promise {\n`, + ` this._value = uri.toString();\n`, + ` this._onDidChange.fire();\n`, + ` }\n`, + '}\n\n', + ], 5); + } + b.emit('```\n\nThis defines 40 service classes following the standard VS Code pattern.\n'); + return b.build(); + })(), + + 'rapid-stream': (() => { + const b = new ScenarioBuilder(); + const words = []; + for (let i = 0; i < 1000; 
i++) { words.push(`w${i} `); } + // Very fast inter-chunk delay to stress the streaming pipeline + b.stream(words, 2); + return b.build(); + })(), + + 'file-links': (() => { + const files = [ + 'src/vs/workbench/contrib/chat/browser/chatListRenderer.ts', + 'src/vs/workbench/contrib/chat/common/chatService/chatServiceImpl.ts', + 'src/vs/workbench/contrib/chat/browser/widget/input/chatInputPart.ts', + 'src/vs/workbench/contrib/chat/common/chatPerf.ts', + 'src/vs/base/common/lifecycle.ts', + 'src/vs/base/common/event.ts', + 'src/vs/platform/instantiation/common/instantiation.ts', + 'src/vs/workbench/services/extensions/common/abstractExtensionService.ts', + 'src/vs/workbench/api/common/extHostLanguageModels.ts', + 'src/vs/workbench/contrib/chat/common/languageModels.ts', + 'src/vs/editor/browser/widget/codeEditor/editor.ts', + 'src/vs/workbench/browser/parts/editor/editorGroupView.ts', + ]; + const b = new ScenarioBuilder(); + b.emit('I found references to the disposable pattern across the following files:\n\n'); + for (let i = 0; i < files.length; i++) { + const line = Math.floor(Math.random() * 500) + 1; + b.stream([ + `${i + 1}. 
[${files[i]}](${files[i]}#L${line}) -- `, + `Line ${line}: uses \`DisposableStore\` with ${Math.floor(Math.random() * 10) + 1} registrations\n`, + ], 15); + } + b.wait(10, '\nAdditionally, the following files import from `vs/base/common/lifecycle`:\n\n'); + for (let i = 0; i < 20; i++) { + const depth = ['base', 'platform', 'editor', 'workbench'][i % 4]; + const area = ['common', 'browser', 'node', 'electron-browser'][i % 4]; + const name = ['service', 'provider', 'contribution', 'handler', 'manager'][i % 5]; + const file = `src/vs/${depth}/${area}/${name}${i}.ts`; + b.stream([ + `- [${file}](${file}#L${i * 10 + 5})`, + ` -- imports \`Disposable\`, \`DisposableStore\`\n`, + ], 12); + } + b.emit('\nTotal: 32 files reference the disposable pattern.\n'); + return b.build(); + })(), +}; + +// -- Tool call scenarios ------------------------------------------------------ + +/** @type {Record} */ +const TOOL_CALL_SCENARIOS = { + // Stress test: read 8 files across multiple tool-call rounds, simulating + // a real agent gathering context before answering. 
+ 'tool-read-file': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ((() => { + const filesToRead = [ + 'src/vs/base/common/lifecycle.ts', + 'src/vs/base/common/event.ts', + 'src/vs/base/common/uri.ts', + 'src/vs/base/common/errors.ts', + 'src/vs/base/common/async.ts', + 'src/vs/base/common/strings.ts', + 'src/vs/base/common/arrays.ts', + 'src/vs/base/common/types.ts', + ]; + // Round 1: parallel read of first 4 files + // Round 2: parallel read of next 4 files + // Round 3: final content response + return { + type: 'multi-turn', + turns: [ + { + kind: 'tool-calls', + toolCalls: filesToRead.slice(0, 4).map(f => ({ + toolNamePattern: /read.?file/i, + arguments: { filePath: path.join(ROOT, f), startLine: 1, endLine: 50 }, + })), + }, + { + kind: 'tool-calls', + toolCalls: filesToRead.slice(4).map(f => ({ + toolNamePattern: /read.?file/i, + arguments: { filePath: path.join(ROOT, f), startLine: 1, endLine: 50 }, + })), + }, + { + kind: 'content', + chunks: new ScenarioBuilder() + .wait(20, '## Analysis of VS Code Base Utilities\n\n') + .stream([ + 'I read 8 core utility files from `src/vs/base/common/`. Here is a summary:\n\n', + '### lifecycle.ts\n', + 'The `Disposable` base class provides the standard lifecycle pattern. Components register cleanup ', + 'handlers via `this._register()` which are automatically disposed when the parent is disposed.\n\n', + '### event.ts\n', + 'The `Emitter` class implements the observer pattern. 
`Event.once()`, `Event.map()`, and `Event.filter()` ', + 'provide functional combinators for composing event streams.\n\n', + '### uri.ts\n', + '`URI` is an immutable representation of a resource identifier with scheme, authority, path, query, and fragment.\n\n', + '### errors.ts\n', + 'Central error handling with `onUnexpectedError()` and `isCancellationError()` for distinguishing user cancellation.\n\n', + '### async.ts\n', + '`Throttler`, `Delayer`, `RunOnceScheduler`, and `Queue` manage async operation scheduling and deduplication.\n\n', + '### strings.ts\n', + 'String utilities including `format()`, `escape()`, `startsWith()`, and `endsWith()` for common string operations.\n\n', + '### arrays.ts\n', + 'Array helpers like `coalesce()`, `groupBy()`, `distinct()`, and binary search implementations.\n\n', + '### types.ts\n', + 'Type guards and assertion helpers: `isString()`, `isNumber()`, `assertType()`, `assertIsDefined()`.\n', + ], 15) + .build(), + }, + ], + }; + })()), + + // Stress test: read 3 files then apply edits to 2 of them, simulating + // a real agent reading context and making multiple edits. 
+ 'tool-edit-file': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ((() => { + const readFiles = [ + 'src/vs/base/common/lifecycle.ts', + 'src/vs/base/common/event.ts', + 'src/vs/base/common/errors.ts', + ]; + return { + type: 'multi-turn', + turns: [ + // Round 1: read all 3 files in parallel + { + kind: 'tool-calls', + toolCalls: readFiles.map(f => ({ + toolNamePattern: /read.?file/i, + arguments: { filePath: path.join(ROOT, f), startLine: 1, endLine: 40 }, + })), + }, + // Round 2: edit 2 files in parallel + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, + arguments: { + filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), + oldString: '// perf-benchmark-marker', + newString: '// perf-benchmark-marker (updated)', + explanation: 'Update the benchmark marker comment in lifecycle.ts', + }, + }, + { + toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, + arguments: { + filePath: path.join(ROOT, 'src/vs/base/common/event.ts'), + oldString: '// perf-benchmark-marker', + newString: '// perf-benchmark-marker (updated)', + explanation: 'Update the benchmark marker comment in event.ts', + }, + }, + ], + }, + // Round 3: final content + { + kind: 'content', + chunks: new ScenarioBuilder() + .wait(20, '## Edits Applied\n\n') + .stream([ + 'I read 3 files and applied edits to 2 of them:\n\n', + '### Files read:\n', + '1. `src/vs/base/common/lifecycle.ts` — Disposable pattern and lifecycle management\n', + '2. `src/vs/base/common/event.ts` — Event emitter and observer pattern\n', + '3. `src/vs/base/common/errors.ts` — Error handling utilities\n\n', + '### Edits applied:\n', + '1. **lifecycle.ts** — Updated the benchmark marker comment\n', + '2. **event.ts** — Updated the benchmark marker comment\n\n', + 'Both files follow the standard VS Code pattern of using `Disposable` as a base class ', + 'with `_register()` for lifecycle management. 
The edits were minimal and localized.\n', + ], 20) + .build(), + }, + ], + }; + })()), +}; + +// -- Multi-turn user conversation scenarios ----------------------------------- + +/** @type {Record} */ +const MULTI_TURN_SCENARIOS = { + 'thinking-response': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ({ + type: 'multi-turn', + turns: [ + { + kind: 'thinking', + thinkingChunks: new ScenarioBuilder() + .stream([ + 'Let me analyze this code carefully. ', + 'The user is asking about the lifecycle pattern in VS Code. ', + 'I should look at the Disposable base class and how it manages cleanup. ', + 'The key methods are _register(), dispose(), and the DisposableStore pattern. ', + 'I need to read the file first to give an accurate explanation.', + ], 15) + .build(), + chunks: new ScenarioBuilder() + .wait(20, 'I\'ll start by reading the file to understand its structure.\n\n') + .stream([ + 'The `Disposable` base class in `lifecycle.ts` provides a standard pattern ', + 'for managing resources. It uses a `DisposableStore` internally to track ', + 'all registered disposables and clean them up on `dispose()`.\n', + ], 20) + .build(), + }, + ], + }), + + 'multi-turn-user': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ({ + type: 'multi-turn', + turns: [ + // Turn 1: Model reads a file + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /read.?file/i, + arguments: { + filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), + offset: 1, + limit: 50, + }, + }, + ], + }, + // Turn 2: Model responds with analysis + { + kind: 'content', + chunks: new ScenarioBuilder() + .wait(20, 'I\'ve read the file. Here\'s what I found:\n\n') + .stream([ + 'The `Disposable` class is the base for lifecycle management. ', + 'It internally holds a `DisposableStore` via `this._store`. 
', + 'Subclasses call `this._register()` to track their own disposables.\n\n', + 'Would you like me to explain any specific part in more detail?\n', + ], 20) + .build(), + }, + // Turn 3: User follow-up (injected by test harness, not served by mock) + { + kind: 'user', + message: 'Yes, explain the MutableDisposable pattern', + }, + // Turn 4: Model responds with thinking, then content + { + kind: 'thinking', + thinkingChunks: new ScenarioBuilder() + .stream([ + 'The user wants to understand MutableDisposable specifically. ', + 'Let me recall the key aspects: it holds a single disposable that can be swapped. ', + 'When a new value is set, the old one is automatically disposed. ', + 'This is useful for things like event listener subscriptions that need to be replaced.', + ], 10) + .build(), + chunks: new ScenarioBuilder() + .wait(15, '## MutableDisposable\n\n') + .stream([ + '`MutableDisposable` holds a **single disposable** that can be swapped at any time. ', + 'When you set a new value via `.value = newDisposable`, the previous value is automatically disposed.\n\n', + 'This is perfect for:\n', + '- **Event listeners** that need to be re-subscribed when configuration changes\n', + '- **Editor decorations** that are replaced when content updates\n', + '- **Watchers** that switch targets dynamically\n\n', + '```typescript\n', + 'class MyService extends Disposable {\n', + ' private readonly _listener = this._register(new MutableDisposable());\n\n', + ' updateTarget(editor: ICodeEditor): void {\n', + ' // Old listener is automatically disposed\n', + ' this._listener.value = editor.onDidChangeModel(() => {\n', + ' this._handleModelChange();\n', + ' });\n', + ' }\n', + '}\n', + '```\n\n', + 'The key benefit is that you never forget to dispose the old subscription.\n', + ], 15) + .build(), + }, + // Turn 5: Second user follow-up + { + kind: 'user', + message: 'Can you also show me DisposableMap?', + }, + // Turn 6: Final response + { + kind: 'content', + chunks: new 
ScenarioBuilder() + .wait(20, '## DisposableMap\n\n') + .stream([ + '`DisposableMap` extends `Map` with automatic disposal semantics:\n\n', + '- When a key is **overwritten**, the old value is disposed\n', + '- When a key is **deleted**, the value is disposed\n', + '- When the map itself is **disposed**, all values are disposed\n\n', + '```typescript\n', + 'class ToolManager extends Disposable {\n', + ' private readonly _tools = this._register(new DisposableMap());\n\n', + ' registerTool(id: string, tool: IDisposable): void {\n', + ' this._tools.set(id, tool); // auto-disposes previous tool with same id\n', + ' }\n', + '}\n', + '```\n\n', + 'This is commonly used for managing collections of disposable resources keyed by ID.\n', + ], 15) + .build(), + }, + ], + }), +}; + +// -- Registration helper ------------------------------------------------------ + +/** + * Register all built-in perf scenarios into the mock LLM server. + * Call this from your test file before starting the server. + */ +function registerPerfScenarios() { + for (const [id, def] of Object.entries(CONTENT_SCENARIOS)) { + registerScenario(id, def); + } + for (const [id, def] of Object.entries(TOOL_CALL_SCENARIOS)) { + registerScenario(id, def); + } + for (const [id, def] of Object.entries(MULTI_TURN_SCENARIOS)) { + registerScenario(id, def); + } +} + +module.exports = { registerPerfScenarios, CONTENT_SCENARIOS, TOOL_CALL_SCENARIOS, MULTI_TURN_SCENARIOS }; diff --git a/scripts/chat-perf/common/utils.js b/scripts/chat-simulation/common/utils.js similarity index 87% rename from scripts/chat-perf/common/utils.js rename to scripts/chat-simulation/common/utils.js index e616ecbfb5719..c7b120a219393 100644 --- a/scripts/chat-perf/common/utils.js +++ b/scripts/chat-simulation/common/utils.js @@ -16,22 +16,7 @@ const http = require('http'); const { execSync, spawn } = require('child_process'); const ROOT = path.join(__dirname, '..', '..', '..'); -const DATA_DIR = path.join(ROOT, '.chat-perf-data'); - -const 
SCENARIOS = [ - 'text-only', - 'large-codeblock', - 'many-small-chunks', - 'mixed-content', - 'many-codeblocks', - 'long-prose', - 'rich-markdown', - 'giant-codeblock', - 'rapid-stream', - 'file-links', - 'tool-read-file', - 'tool-edit-file', -]; +const DATA_DIR = path.join(ROOT, '.chat-simulation-data'); // -- Electron path resolution ------------------------------------------------ @@ -58,6 +43,22 @@ function isVersionString(value) { return false; } +/** + * Get the built-in extensions directory for a VS Code executable. + * @param {string} exePath + * @returns {string | undefined} + */ +function getBuiltinExtensionsDir(exePath) { + if (process.platform === 'darwin') { + const appDir = exePath.split('/Contents/')[0]; + return path.join(appDir, 'Contents', 'Resources', 'app', 'extensions'); + } else if (process.platform === 'linux') { + return path.join(path.dirname(exePath), 'resources', 'app', 'extensions'); + } else { + return path.join(path.dirname(exePath), 'resources', 'app', 'extensions'); + } +} + /** * Resolve a build arg to an executable path. * Version strings are downloaded via @vscode/test-electron. @@ -69,29 +70,40 @@ async function resolveBuild(buildArg) { return getElectronPath(); } if (isVersionString(buildArg)) { - console.log(`[chat-perf] Downloading VS Code ${buildArg}...`); + console.log(`[chat-simulation] Downloading VS Code ${buildArg}...`); const { downloadAndUnzipVSCode, resolveCliArgsFromVSCodeExecutablePath } = require('@vscode/test-electron'); const exePath = await downloadAndUnzipVSCode(buildArg); - console.log(`[chat-perf] Downloaded: ${exePath}`); - - // Install the copilot extension into our shared extensions dir so it's - // available when we launch with --extensions-dir=DATA_DIR/extensions. 
- const extDir = path.join(DATA_DIR, 'extensions'); - fs.mkdirSync(extDir, { recursive: true }); - const [cli, ...cliArgs] = resolveCliArgsFromVSCodeExecutablePath(exePath); - const extId = 'GitHub.copilot'; - console.log(`[chat-perf] Installing ${extId} into ${extDir}...`); - const { spawnSync } = require('child_process'); - const result = spawnSync(cli, [...cliArgs, '--extensions-dir', extDir, '--install-extension', extId], { - encoding: 'utf-8', - stdio: 'pipe', - shell: process.platform === 'win32', - timeout: 120_000, - }); - if (result.status !== 0) { - console.warn(`[chat-perf] Extension install exited with ${result.status}: ${(result.stderr || '').substring(0, 500)}`); + console.log(`[chat-simulation] Downloaded: ${exePath}`); + + // Check if copilot is already bundled as a built-in extension + // (recent Insiders/Stable builds ship it in the app's extensions/ dir). + const builtinExtDir = getBuiltinExtensionsDir(exePath); + const hasCopilotBuiltin = builtinExtDir && fs.existsSync(builtinExtDir) + && fs.readdirSync(builtinExtDir).some(e => e === 'copilot'); + + if (hasCopilotBuiltin) { + console.log(`[chat-simulation] Copilot is bundled as a built-in extension`); } else { - console.log(`[chat-perf] ${extId} installed`); + // Install copilot-chat from the marketplace into our shared + // extensions dir so it's available when we launch with + // --extensions-dir=DATA_DIR/extensions. 
+ const extDir = path.join(DATA_DIR, 'extensions'); + fs.mkdirSync(extDir, { recursive: true }); + const [cli, ...cliArgs] = resolveCliArgsFromVSCodeExecutablePath(exePath); + const extId = 'GitHub.copilot-chat'; + console.log(`[chat-simulation] Installing ${extId} into ${extDir}...`); + const { spawnSync } = require('child_process'); + const result = spawnSync(cli, [...cliArgs, '--extensions-dir', extDir, '--install-extension', extId], { + encoding: 'utf-8', + stdio: 'pipe', + shell: process.platform === 'win32', + timeout: 120_000, + }); + if (result.status !== 0) { + console.warn(`[chat-simulation] Extension install exited with ${result.status}: ${(result.stderr || '').substring(0, 500)}`); + } else { + console.log(`[chat-simulation] ${extId} installed`); + } } return exePath; @@ -111,7 +123,7 @@ function preseedStorage(userDataDir) { const globalStorageDir = path.join(userDataDir, 'User', 'globalStorage'); fs.mkdirSync(globalStorageDir, { recursive: true }); const dbPath = path.join(globalStorageDir, 'state.vscdb'); - execSync(`sqlite3 "${dbPath}" "CREATE TABLE IF NOT EXISTS ItemTable (key TEXT UNIQUE ON CONFLICT REPLACE, value BLOB); INSERT INTO ItemTable (key, value) VALUES ('builtinChatExtensionEnablementMigration', 'true');"`); + execSync(`sqlite3 "${dbPath}" "CREATE TABLE IF NOT EXISTS ItemTable (key TEXT UNIQUE ON CONFLICT REPLACE, value BLOB); INSERT INTO ItemTable (key, value) VALUES ('builtinChatExtensionEnablementMigration', 'true'); INSERT INTO ItemTable (key, value) VALUES ('chat.tools.global.autoApprove.optIn', 'true');"`); } // -- Launch helpers ---------------------------------------------------------- @@ -200,6 +212,9 @@ function writeSettings(userDataDir, mockServer) { 'chat.mcp.enabled': false, 'github.copilot.chat.githubMcpServer.enabled': false, 'github.copilot.chat.cli.mcp.enabled': false, + // Auto-approve all tool invocations (YOLO mode) so tool call + // scenarios don't block on confirmation dialogs. 
+ 'chat.tools.global.autoApprove': true, }, null, '\t')); } @@ -210,7 +225,7 @@ function writeSettings(userDataDir, mockServer) { * @returns {{ userDataDir: string, extDir: string, logsDir: string }} */ function prepareRunDir(runId, mockServer) { - const tmpBase = path.join(os.tmpdir(), 'vscode-chat-perf'); + const tmpBase = path.join(os.tmpdir(), 'vscode-chat-simulation'); const userDataDir = path.join(tmpBase, `run-${runId}`); const extDir = path.join(DATA_DIR, 'extensions'); const logsDir = path.join(tmpBase, 'logs', `run-${runId}`); @@ -220,10 +235,11 @@ function prepareRunDir(runId, mockServer) { fs.rmSync(userDataDir, { recursive: true, force: true }); break; } catch (err) { - if (attempt < 2 && err.code === 'ENOTEMPTY') { + const error = /** @type {NodeJS.ErrnoException} */ (err); + if (attempt < 2 && error.code === 'ENOTEMPTY') { require('child_process').execSync(`sleep 0.5`); } else { - throw err; + throw error; } } } @@ -380,7 +396,7 @@ async function launchVSCode(executable, launchArgs, env, opts = {}) { // Kill crashpad handler — it self-daemonizes and outlives the // parent. Wait briefly for it to detach, then kill by pattern. 
await new Promise(r => setTimeout(r, 500)); - try { execSync('pkill -9 -f crashpad_handler.*vscode-chat-perf', { stdio: 'ignore' }); } + try { execSync('pkill -9 -f crashpad_handler.*vscode-chat-simulation', { stdio: 'ignore' }); } catch { } }, }; @@ -608,7 +624,6 @@ const METRIC_DEFS = [ module.exports = { ROOT, DATA_DIR, - SCENARIOS, METRIC_DEFS, getElectronPath, isVersionString, diff --git a/scripts/chat-perf/test-chat-mem-leaks.js b/scripts/chat-simulation/test-chat-mem-leaks.js similarity index 91% rename from scripts/chat-perf/test-chat-mem-leaks.js rename to scripts/chat-simulation/test-chat-mem-leaks.js index b4f588a6a4362..c11b416b90ca2 100644 --- a/scripts/chat-perf/test-chat-mem-leaks.js +++ b/scripts/chat-simulation/test-chat-mem-leaks.js @@ -190,15 +190,17 @@ async function main() { } const { startServer } = require('./common/mock-llm-server'); + const { registerPerfScenarios } = require('./common/perf-scenarios'); + registerPerfScenarios(); const mockServer = await startServer(0); - console.log(`[chat-perf] Leak check: ${opts.messages} messages, threshold ${opts.leakThresholdMB}MB/msg`); - console.log(`[chat-perf] Build: ${electronPath}`); + console.log(`[chat-simulation] Leak check: ${opts.messages} messages, threshold ${opts.leakThresholdMB}MB/msg`); + console.log(`[chat-simulation] Build: ${electronPath}`); console.log(''); const result = await runLeakCheck(electronPath, mockServer, opts.messages, opts.verbose); - console.log('[chat-perf] =================== Leak Check Results ==================='); + console.log('[chat-simulation] =================== Leak Check Results ==================='); console.log(''); console.log(` Heap samples (MB): ${result.heapSamples.join(' → ')}`); console.log(` DOM node samples: ${result.domNodeSamples.join(' → ')}`); @@ -210,16 +212,16 @@ async function main() { console.log(''); // Write JSON - const jsonPath = path.join(DATA_DIR, 'chat-perf-leak-results.json'); + const jsonPath = path.join(DATA_DIR, 
'chat-simulation-leak-results.json'); fs.writeFileSync(jsonPath, JSON.stringify({ timestamp: new Date().toISOString(), ...result }, null, 2)); - console.log(`[chat-perf] Results written to ${jsonPath}`); + console.log(`[chat-simulation] Results written to ${jsonPath}`); const leaked = result.leakPerMessageMB > opts.leakThresholdMB; console.log(''); if (leaked) { - console.log(`[chat-perf] LEAK DETECTED — ${result.leakPerMessageMB}MB/msg exceeds ${opts.leakThresholdMB}MB/msg threshold`); + console.log(`[chat-simulation] LEAK DETECTED — ${result.leakPerMessageMB}MB/msg exceeds ${opts.leakThresholdMB}MB/msg threshold`); } else { - console.log(`[chat-perf] No leak detected (${result.leakPerMessageMB}MB/msg < ${opts.leakThresholdMB}MB/msg)`); + console.log(`[chat-simulation] No leak detected (${result.leakPerMessageMB}MB/msg < ${opts.leakThresholdMB}MB/msg)`); } await mockServer.close(); diff --git a/scripts/chat-perf/test-chat-perf-regression.js b/scripts/chat-simulation/test-chat-perf-regression.js similarity index 83% rename from scripts/chat-perf/test-chat-perf-regression.js rename to scripts/chat-simulation/test-chat-perf-regression.js index 90c9e81ab2275..2dff251c20f8a 100644 --- a/scripts/chat-perf/test-chat-perf-regression.js +++ b/scripts/chat-simulation/test-chat-perf-regression.js @@ -18,16 +18,18 @@ * npm run perf:chat -- --scenario text-only # single scenario * npm run perf:chat -- --no-baseline # skip baseline comparison * npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0 - * npm run perf:chat -- --resume .chat-perf-data/2026-04-14/results.json --runs 3 + * npm run perf:chat -- --resume .chat-simulation-data/2026-04-14/results.json --runs 3 */ const path = require('path'); const fs = require('fs'); const { - ROOT, DATA_DIR, SCENARIOS, METRIC_DEFS, + DATA_DIR, METRIC_DEFS, resolveBuild, buildEnv, buildArgs, prepareRunDir, robustStats, welchTTest, summarize, markDuration, launchVSCode, } = require('./common/utils'); +const { getUserTurns, 
getScenarioIds } = require('./common/mock-llm-server'); +const { registerPerfScenarios } = require('./common/perf-scenarios'); // -- CLI args ---------------------------------------------------------------- @@ -83,13 +85,13 @@ function parseArgs() { ' --ci CI mode: write Markdown summary to ci-summary.md', ' --verbose Print per-run details', '', - 'Scenarios: ' + SCENARIOS.join(', '), + 'Scenarios: ' + getScenarioIds().join(', '), ].join('\n')); process.exit(0); } } if (opts.scenarios.length === 0) { - opts.scenarios = SCENARIOS; + opts.scenarios = getScenarioIds(); } return opts; } @@ -156,6 +158,7 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru buildEnv(mockServer, { isDevBuild }), { verbose }, ); + activeVSCode = vscode; const window = vscode.page; try { @@ -294,7 +297,79 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru }, responseSelector, { timeout: 30_000 }, ); - const responseCompleteTime = Date.now(); + let responseCompleteTime = Date.now(); + + // -- User turn injection loop ----------------------------------------- + // For multi-turn scenarios with user follow-ups, type each follow-up + // message and wait for the model's response to settle. 
+ const userTurns = getUserTurns(scenario); + for (let ut = 0; ut < userTurns.length; ut++) { + const userTurn = userTurns[ut]; + if (verbose) { + console.log(` [debug] User follow-up ${ut + 1}/${userTurns.length}: "${userTurn.message}"`); + } + + // Brief pause to let the UI settle between turns + await new Promise(r => setTimeout(r, 500)); + + // Focus the chat input + await window.click(chatEditorSel); + const utFocusStart = Date.now(); + while (Date.now() - utFocusStart < 3_000) { + const focused = await window.evaluate((sel) => { + const el = document.querySelector(sel); + return el && (el.classList.contains('focused') || el.contains(document.activeElement)); + }, chatEditorSel).catch(() => false); + if (focused) { break; } + await new Promise(r => setTimeout(r, 50)); + } + + // Type the follow-up message + if (hasDriver) { + await window.evaluate(({ selector, text }) => { + // @ts-ignore + return globalThis.driver.typeInEditor(selector, text); + }, { selector: actualInputSelector, text: userTurn.message }); + } else { + await window.click(actualInputSelector); + await new Promise(r => setTimeout(r, 200)); + await window.locator(actualInputSelector).pressSequentially(userTurn.message, { delay: 0 }); + } + + // Note current response count before submitting + const responseCountBefore = await window.evaluate((sel) => { + return document.querySelectorAll(sel).length; + }, responseSelector); + + // Submit follow-up + const utCompBefore = mockServer.completionCount(); + await window.keyboard.press('Enter'); + + // Wait for mock server to serve the response for this turn + try { await mockServer.waitForCompletion(utCompBefore + 1, 60_000); } catch { } + + // Wait for a new response element to appear and settle + await dismissDialog(); + await window.waitForFunction( + ({ sel, prevCount }) => { + const responses = document.querySelectorAll(sel); + if (responses.length <= prevCount) { return false; } + return !responses[responses.length - 
1].classList.contains('chat-response-loading'); + }, + { sel: responseSelector, prevCount: responseCountBefore }, + { timeout: 30_000 }, + ); + responseCompleteTime = Date.now(); + + if (verbose) { + const utResponseInfo = await window.evaluate((sel) => { + const responses = document.querySelectorAll(sel); + const last = responses[responses.length - 1]; + return last ? (last.textContent || '').substring(0, 150) : '(empty)'; + }, responseSelector); + console.log(` [debug] Follow-up response (first 150 chars): ${utResponseInfo}`); + } + } // Stop CPU profiler and save the profile const { profile } = /** @type {any} */ (await cdp.send('Profiler.stop')); @@ -403,6 +478,7 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru snapshotPath, }; } finally { + activeVSCode = null; await vscode.close(); } } @@ -572,19 +648,41 @@ function generateCISummary(jsonReport, baseline, opts) { return lines.join('\n'); } +// -- Cleanup on SIGINT/SIGTERM ----------------------------------------------- + +/** @type {{ close: () => Promise } | null} */ +let activeVSCode = null; +/** @type {{ close: () => Promise } | null} */ +let activeMockServer = null; + +function installSignalHandlers() { + const cleanup = async () => { + console.log('\n[chat-simulation] Caught interrupt, cleaning up...'); + try { await activeVSCode?.close(); } catch { } + try { await activeMockServer?.close(); } catch { } + process.exit(130); + }; + process.on('SIGINT', cleanup); + process.on('SIGTERM', cleanup); +} + // -- Main -------------------------------------------------------------------- async function main() { + registerPerfScenarios(); const opts = parseArgs(); + installSignalHandlers(); + const { startServer } = require('./common/mock-llm-server'); const mockServer = await startServer(0); - console.log(`[chat-perf] Mock LLM server: ${mockServer.url}`); + activeMockServer = mockServer; + console.log(`[chat-simulation] Mock LLM server: ${mockServer.url}`); // -- Resume mode 
-------------------------------------------------------- if (opts.resume) { if (!fs.existsSync(opts.resume)) { - console.error(`[chat-perf] Resume file not found: ${opts.resume}`); + console.error(`[chat-simulation] Resume file not found: ${opts.resume}`); process.exit(1); } const prevResults = JSON.parse(fs.readFileSync(opts.resume, 'utf-8')); @@ -601,7 +699,7 @@ async function main() { : Object.keys(prevResults.scenarios || {}); if (resumeScenarios.length === 0) { - console.error('[chat-perf] No matching scenarios found in previous results'); + console.error('[chat-simulation] No matching scenarios found in previous results'); process.exit(1); } @@ -610,24 +708,24 @@ async function main() { const baselineElectron = baselineVersion ? await resolveBuild(baselineVersion) : null; const runsToAdd = opts.runs; - console.log(`[chat-perf] Resuming from: ${opts.resume}`); - console.log(`[chat-perf] Adding ${runsToAdd} runs per scenario`); - console.log(`[chat-perf] Scenarios: ${resumeScenarios.join(', ')}`); + console.log(`[chat-simulation] Resuming from: ${opts.resume}`); + console.log(`[chat-simulation] Adding ${runsToAdd} runs per scenario`); + console.log(`[chat-simulation] Scenarios: ${resumeScenarios.join(', ')}`); if (prevBaseline) { - console.log(`[chat-perf] Baseline: ${baselineVersion} (${prevBaseline.scenarios?.[resumeScenarios[0]]?.rawRuns?.length || 0} existing runs)`); + console.log(`[chat-simulation] Baseline: ${baselineVersion} (${prevBaseline.scenarios?.[resumeScenarios[0]]?.rawRuns?.length || 0} existing runs)`); } console.log(''); for (const scenario of resumeScenarios) { - console.log(`[chat-perf] === Resuming: ${scenario} ===`); + console.log(`[chat-simulation] === Resuming: ${scenario} ===`); const prevTestRuns = prevResults.scenarios[scenario]?.rawRuns || []; const prevBaseRuns = prevBaseline?.scenarios?.[scenario]?.rawRuns || []; // Run additional test iterations - console.log(`[chat-perf] Test build (${prevTestRuns.length} existing + ${runsToAdd} 
new)`); + console.log(`[chat-simulation] Test build (${prevTestRuns.length} existing + ${runsToAdd} new)`); for (let i = 0; i < runsToAdd; i++) { const runIdx = `${scenario}-resume-${prevTestRuns.length + i}`; - console.log(`[chat-perf] Run ${i + 1}/${runsToAdd}...`); + console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`); try { const m = await runOnce(testElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'test'); prevTestRuns.push(m); @@ -640,10 +738,10 @@ async function main() { // Run additional baseline iterations if (baselineElectron && prevBaseline?.scenarios?.[scenario]) { - console.log(`[chat-perf] Baseline build (${prevBaseRuns.length} existing + ${runsToAdd} new)`); + console.log(`[chat-simulation] Baseline build (${prevBaseRuns.length} existing + ${runsToAdd} new)`); for (let i = 0; i < runsToAdd; i++) { const runIdx = `baseline-${scenario}-resume-${prevBaseRuns.length + i}`; - console.log(`[chat-perf] Run ${i + 1}/${runsToAdd}...`); + console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`); try { const m = await runOnce(baselineElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'baseline'); prevBaseRuns.push(m); @@ -661,7 +759,7 @@ async function main() { for (const [metric, group] of METRIC_DEFS) { bsd[group][metric] = robustStats(prevBaseRuns.map((/** @type {any} */ r) => r[metric])); } prevBaseline.scenarios[scenario] = bsd; } - console.log(`[chat-perf] Merged: test n=${prevTestRuns.length}${prevBaseRuns.length > 0 ? `, baseline n=${prevBaseRuns.length}` : ''}`); + console.log(`[chat-simulation] Merged: test n=${prevTestRuns.length}${prevBaseRuns.length > 0 ? 
`, baseline n=${prevBaseRuns.length}` : ''}`); console.log(''); } @@ -669,7 +767,7 @@ async function main() { prevResults.runsPerScenario = Math.max(prevResults.runsPerScenario || 0, ...Object.values(prevResults.scenarios).map((/** @type {any} */ s) => s.runs)); prevResults.lastResumed = new Date().toISOString(); fs.writeFileSync(opts.resume, JSON.stringify(prevResults, null, 2)); - console.log(`[chat-perf] Updated results: ${opts.resume}`); + console.log(`[chat-simulation] Updated results: ${opts.resume}`); if (prevBaseline && baselineFile) { prevBaseline.lastResumed = new Date().toISOString(); @@ -677,7 +775,7 @@ async function main() { // Also update cached baseline const cachedPath = path.join(DATA_DIR, path.basename(baselineFile)); fs.writeFileSync(cachedPath, JSON.stringify(prevBaseline, null, 2)); - console.log(`[chat-perf] Updated baseline: ${baselineFile}`); + console.log(`[chat-simulation] Updated baseline: ${baselineFile}`); } // -- Re-run comparison with merged data -------------------------------- @@ -704,7 +802,7 @@ async function main() { const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); const runDir = path.join(DATA_DIR, runTimestamp); fs.mkdirSync(runDir, { recursive: true }); - console.log(`[chat-perf] Output: ${runDir}`); + console.log(`[chat-simulation] Output: ${runDir}`); // -- Baseline build -------------------------------------------------- if (opts.baselineBuild) { @@ -720,19 +818,19 @@ async function main() { const missingScenarios = opts.scenarios.filter((/** @type {string} */ s) => !cachedScenarios.has(s)); if (missingScenarios.length === 0) { - console.log(`[chat-perf] Using cached baseline for ${opts.baselineBuild}`); + console.log(`[chat-simulation] Using cached baseline for ${opts.baselineBuild}`); fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2)); opts.baseline = baselineJsonPath; } else { - console.log(`[chat-perf] Cached baseline missing scenarios: 
${missingScenarios.join(', ')}`); - console.log(`[chat-perf] Running baseline for missing scenarios...`); + console.log(`[chat-simulation] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`); + console.log(`[chat-simulation] Running baseline for missing scenarios...`); const baselineExePath = await resolveBuild(opts.baselineBuild); for (const scenario of missingScenarios) { /** @type {RunMetrics[]} */ const results = []; for (let i = 0; i < opts.runs; i++) { try { results.push(await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline')); } - catch (err) { console.error(`[chat-perf] Baseline run ${i + 1} failed: ${err}`); } + catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); } } if (results.length > 0) { const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, rawRuns: results }); @@ -747,7 +845,7 @@ async function main() { } } else { const baselineExePath = await resolveBuild(opts.baselineBuild); - console.log(`[chat-perf] Benchmarking baseline build (${opts.baselineBuild})...`); + console.log(`[chat-simulation] Benchmarking baseline build (${opts.baselineBuild})...`); /** @type {Record} */ const baselineResults = {}; for (const scenario of opts.scenarios) { @@ -755,7 +853,7 @@ async function main() { const results = []; for (let i = 0; i < opts.runs; i++) { try { results.push(await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline')); } - catch (err) { console.error(`[chat-perf] Baseline run ${i + 1} failed: ${err}`); } + catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); } } if (results.length > 0) { baselineResults[scenario] = results; } } @@ -780,9 +878,9 @@ async function main() { } // -- Run benchmarks -------------------------------------------------- - console.log(`[chat-perf] Electron: ${electronPath}`); - 
console.log(`[chat-perf] Runs per scenario: ${opts.runs}`); - console.log(`[chat-perf] Scenarios: ${opts.scenarios.join(', ')}`); + console.log(`[chat-simulation] Electron: ${electronPath}`); + console.log(`[chat-simulation] Runs per scenario: ${opts.runs}`); + console.log(`[chat-simulation] Scenarios: ${opts.scenarios.join(', ')}`); console.log(''); /** @type {Record} */ @@ -790,11 +888,11 @@ async function main() { let anyFailed = false; for (const scenario of opts.scenarios) { - console.log(`[chat-perf] === Scenario: ${scenario} ===`); + console.log(`[chat-simulation] === Scenario: ${scenario} ===`); /** @type {RunMetrics[]} */ const results = []; for (let i = 0; i < opts.runs; i++) { - console.log(`[chat-perf] Run ${i + 1}/${opts.runs}...`); + console.log(`[chat-simulation] Run ${i + 1}/${opts.runs}...`); try { const metrics = await runOnce(electronPath, scenario, mockServer, opts.verbose, `${scenario}-${i}`, runDir, 'test'); results.push(metrics); @@ -804,13 +902,13 @@ async function main() { } } catch (err) { console.error(` Run ${i + 1} failed: ${err}`); } } - if (results.length === 0) { console.error(`[chat-perf] All runs failed for scenario: ${scenario}`); anyFailed = true; } + if (results.length === 0) { console.error(`[chat-simulation] All runs failed for scenario: ${scenario}`); anyFailed = true; } else { allResults[scenario] = results; } console.log(''); } // -- Summary --------------------------------------------------------- - console.log('[chat-perf] ======================= Summary ======================='); + console.log('[chat-simulation] ======================= Summary ======================='); for (const [scenario, results] of Object.entries(allResults)) { console.log(''); console.log(` -- ${scenario} (${results.length} runs) --`); @@ -841,13 +939,13 @@ async function main() { fs.writeFileSync(jsonPath, JSON.stringify(jsonReport, null, 2)); jsonReport._resultsPath = jsonPath; console.log(''); - console.log(`[chat-perf] Results written to 
${jsonPath}`); + console.log(`[chat-simulation] Results written to ${jsonPath}`); // -- Save baseline --------------------------------------------------- if (opts.saveBaseline) { - if (!opts.baseline) { console.error('[chat-perf] --save-baseline requires --baseline '); process.exit(1); } + if (!opts.baseline) { console.error('[chat-simulation] --save-baseline requires --baseline '); process.exit(1); } fs.writeFileSync(opts.baseline, JSON.stringify(jsonReport, null, 2)); - console.log(`[chat-perf] Baseline saved to ${opts.baseline}`); + console.log(`[chat-simulation] Baseline saved to ${opts.baseline}`); } // -- Baseline comparison --------------------------------------------- @@ -868,8 +966,8 @@ async function printComparison(jsonReport, opts) { if (opts.baseline && fs.existsSync(opts.baseline)) { const baseline = JSON.parse(fs.readFileSync(opts.baseline, 'utf-8')); console.log(''); - console.log(`[chat-perf] =========== Baseline Comparison (threshold: ${(opts.threshold * 100).toFixed(0)}%) ===========`); - console.log(`[chat-perf] Baseline: ${baseline.baselineBuildVersion || baseline.timestamp}`); + console.log(`[chat-simulation] =========== Baseline Comparison (threshold: ${(opts.threshold * 100).toFixed(0)}%) ===========`); + console.log(`[chat-simulation] Baseline: ${baseline.baselineBuildVersion || baseline.timestamp}`); console.log(''); // Metrics that trigger regression failure when they exceed the threshold @@ -942,8 +1040,8 @@ async function printComparison(jsonReport, opts) { console.log(''); console.log(regressionFound - ? `[chat-perf] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold with statistical significance` - : `[chat-perf] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline (or not statistically significant)`); + ? 
`[chat-simulation] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold with statistical significance` + : `[chat-simulation] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline (or not statistically significant)`); if (inconclusiveFound && !regressionFound) { // Find the results.json path to suggest in the hint @@ -951,9 +1049,9 @@ async function printComparison(jsonReport, opts) { ? (jsonReport._resultsPath || opts.resume || 'path/to/results.json') : 'path/to/results.json'; console.log(''); - console.log('[chat-perf] Some metrics exceeded the threshold but were not statistically significant.'); - console.log('[chat-perf] To increase confidence, add more runs with --resume:'); - console.log(`[chat-perf] npm run perf:chat -- --resume ${resultsPath} --runs 3`); + console.log('[chat-simulation] Some metrics exceeded the threshold but were not statistically significant.'); + console.log('[chat-simulation] To increase confidence, add more runs with --resume:'); + console.log(`[chat-simulation] npm run perf:chat -- --resume ${resultsPath} --runs 3`); } } @@ -972,7 +1070,7 @@ async function printComparison(jsonReport, opts) { // Write to file for GitHub Actions $GITHUB_STEP_SUMMARY const summaryPath = path.join(DATA_DIR, 'ci-summary.md'); fs.writeFileSync(summaryPath, summary); - console.log(`[chat-perf] CI summary written to ${summaryPath}`); + console.log(`[chat-simulation] CI summary written to ${summaryPath}`); // Also print the full summary table to stdout console.log(''); From bcdcda3db45a6349956620776fedc92a32012bb5 Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Tue, 14 Apr 2026 14:48:52 -0700 Subject: [PATCH 05/13] PR --- .github/workflows/chat-perf.yml | 4 +- build/filters.ts | 1 + .../chat-simulation/common/perf-scenarios.js | 36 ++--- scripts/chat-simulation/common/utils.js | 17 +++ scripts/chat-simulation/config.jsonc | 19 +++ scripts/chat-simulation/fixtures/arrays.ts | 84 +++++++++++ 
scripts/chat-simulation/fixtures/async.ts | 132 ++++++++++++++++++ scripts/chat-simulation/fixtures/errors.ts | 88 ++++++++++++ scripts/chat-simulation/fixtures/event.ts | 109 +++++++++++++++ scripts/chat-simulation/fixtures/lifecycle.ts | 127 +++++++++++++++++ scripts/chat-simulation/fixtures/strings.ts | 75 ++++++++++ scripts/chat-simulation/fixtures/types.ts | 92 ++++++++++++ scripts/chat-simulation/fixtures/uri.ts | 85 +++++++++++ .../chat-simulation/test-chat-mem-leaks.js | 10 +- .../test-chat-perf-regression.js | 90 +++++++++--- 15 files changed, 925 insertions(+), 44 deletions(-) create mode 100644 scripts/chat-simulation/config.jsonc create mode 100644 scripts/chat-simulation/fixtures/arrays.ts create mode 100644 scripts/chat-simulation/fixtures/async.ts create mode 100644 scripts/chat-simulation/fixtures/errors.ts create mode 100644 scripts/chat-simulation/fixtures/event.ts create mode 100644 scripts/chat-simulation/fixtures/lifecycle.ts create mode 100644 scripts/chat-simulation/fixtures/strings.ts create mode 100644 scripts/chat-simulation/fixtures/types.ts create mode 100644 scripts/chat-simulation/fixtures/uri.ts diff --git a/.github/workflows/chat-perf.yml b/.github/workflows/chat-perf.yml index ddce98e68468b..ae841e55f0f13 100644 --- a/.github/workflows/chat-perf.yml +++ b/.github/workflows/chat-perf.yml @@ -71,9 +71,9 @@ jobs: libnotify-bin libkrb5-dev \ xvfb sqlite3 \ libnss3 libatk1.0-0 libatk-bridge2.0-0 \ - libcups2 libdrm2 libxcomposite1 libxdamage1 \ + libcups2t64 libdrm2 libxcomposite1 libxdamage1 \ libxrandr2 libgbm1 libpango-1.0-0 libcairo2 \ - libasound2 libxshmfence1 libgtk-3-0 + libasound2t64 libxshmfence1 libgtk-3-0 - name: Install dependencies run: npm ci diff --git a/build/filters.ts b/build/filters.ts index d4ea9c8db730d..27c79749e3c60 100644 --- a/build/filters.ts +++ b/build/filters.ts @@ -162,6 +162,7 @@ export const copyrightFilter = Object.freeze([ '**', '!**/*.desktop', '!**/*.json', + '!**/*.jsonc', '!**/*.jsonl', 
'!**/*.html', '!**/*.template', diff --git a/scripts/chat-simulation/common/perf-scenarios.js b/scripts/chat-simulation/common/perf-scenarios.js index ce46effc816e4..eaaf615a76f39 100644 --- a/scripts/chat-simulation/common/perf-scenarios.js +++ b/scripts/chat-simulation/common/perf-scenarios.js @@ -15,7 +15,7 @@ const path = require('path'); const { ScenarioBuilder, registerScenario } = require('./mock-llm-server'); -const ROOT = path.join(__dirname, '..', '..', '..'); +const FIXTURES_DIR = path.join(__dirname, '..', 'fixtures'); // -- Content-only scenarios --------------------------------------------------- @@ -232,14 +232,14 @@ const TOOL_CALL_SCENARIOS = { // a real agent gathering context before answering. 'tool-read-file': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ((() => { const filesToRead = [ - 'src/vs/base/common/lifecycle.ts', - 'src/vs/base/common/event.ts', - 'src/vs/base/common/uri.ts', - 'src/vs/base/common/errors.ts', - 'src/vs/base/common/async.ts', - 'src/vs/base/common/strings.ts', - 'src/vs/base/common/arrays.ts', - 'src/vs/base/common/types.ts', + 'lifecycle.ts', + 'event.ts', + 'uri.ts', + 'errors.ts', + 'async.ts', + 'strings.ts', + 'arrays.ts', + 'types.ts', ]; // Round 1: parallel read of first 4 files // Round 2: parallel read of next 4 files @@ -251,14 +251,14 @@ const TOOL_CALL_SCENARIOS = { kind: 'tool-calls', toolCalls: filesToRead.slice(0, 4).map(f => ({ toolNamePattern: /read.?file/i, - arguments: { filePath: path.join(ROOT, f), startLine: 1, endLine: 50 }, + arguments: { filePath: path.join(FIXTURES_DIR, f), startLine: 1, endLine: 50 }, })), }, { kind: 'tool-calls', toolCalls: filesToRead.slice(4).map(f => ({ toolNamePattern: /read.?file/i, - arguments: { filePath: path.join(ROOT, f), startLine: 1, endLine: 50 }, + arguments: { filePath: path.join(FIXTURES_DIR, f), startLine: 1, endLine: 50 }, })), }, { @@ -296,9 +296,9 @@ const TOOL_CALL_SCENARIOS = { // a real agent reading context and making multiple edits. 
'tool-edit-file': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ((() => { const readFiles = [ - 'src/vs/base/common/lifecycle.ts', - 'src/vs/base/common/event.ts', - 'src/vs/base/common/errors.ts', + 'lifecycle.ts', + 'event.ts', + 'errors.ts', ]; return { type: 'multi-turn', @@ -308,7 +308,7 @@ const TOOL_CALL_SCENARIOS = { kind: 'tool-calls', toolCalls: readFiles.map(f => ({ toolNamePattern: /read.?file/i, - arguments: { filePath: path.join(ROOT, f), startLine: 1, endLine: 40 }, + arguments: { filePath: path.join(FIXTURES_DIR, f), startLine: 1, endLine: 40 }, })), }, // Round 2: edit 2 files in parallel @@ -318,7 +318,7 @@ const TOOL_CALL_SCENARIOS = { { toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, arguments: { - filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), + filePath: path.join(FIXTURES_DIR, 'lifecycle.ts'), oldString: '// perf-benchmark-marker', newString: '// perf-benchmark-marker (updated)', explanation: 'Update the benchmark marker comment in lifecycle.ts', @@ -327,7 +327,7 @@ const TOOL_CALL_SCENARIOS = { { toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, arguments: { - filePath: path.join(ROOT, 'src/vs/base/common/event.ts'), + filePath: path.join(FIXTURES_DIR, 'event.ts'), oldString: '// perf-benchmark-marker', newString: '// perf-benchmark-marker (updated)', explanation: 'Update the benchmark marker comment in event.ts', @@ -399,7 +399,7 @@ const MULTI_TURN_SCENARIOS = { { toolNamePattern: /read.?file/i, arguments: { - filePath: path.join(ROOT, 'src/vs/base/common/lifecycle.ts'), + filePath: path.join(FIXTURES_DIR, 'lifecycle.ts'), offset: 1, limit: 50, }, diff --git a/scripts/chat-simulation/common/utils.js b/scripts/chat-simulation/common/utils.js index c7b120a219393..1b313aa86f68c 100644 --- a/scripts/chat-simulation/common/utils.js +++ b/scripts/chat-simulation/common/utils.js @@ -18,6 +18,22 @@ const { execSync, spawn } = require('child_process'); const ROOT = path.join(__dirname, '..', 
'..', '..'); const DATA_DIR = path.join(ROOT, '.chat-simulation-data'); +// -- Config loading ---------------------------------------------------------- + +/** @param {string} text */ +function stripJsoncComments(text) { return text.replace(/\/\/.*/g, '').replace(/\/\*[\s\S]*?\*\//g, ''); } + +/** + * Load a namespaced section from config.jsonc. + * @param {string} section - Top-level key (e.g. 'perfRegression', 'memLeaks') + * @returns {Record} + */ +function loadConfig(section) { + const raw = fs.readFileSync(path.join(__dirname, '..', 'config.jsonc'), 'utf-8'); + const config = JSON.parse(stripJsoncComments(raw)); + return config[section] ?? {}; +} + // -- Electron path resolution ------------------------------------------------ function getElectronPath() { @@ -625,6 +641,7 @@ module.exports = { ROOT, DATA_DIR, METRIC_DEFS, + loadConfig, getElectronPath, isVersionString, resolveBuild, diff --git a/scripts/chat-simulation/config.jsonc b/scripts/chat-simulation/config.jsonc new file mode 100644 index 0000000000000..ec758bbef11df --- /dev/null +++ b/scripts/chat-simulation/config.jsonc @@ -0,0 +1,19 @@ +{ + "perfRegression": { + // VS Code version, "insiders", or a commit hash (7-40 hex chars) + "baselineBuild": "1.115.0", + + // Number of benchmark iterations per scenario + "runsPerScenario": 5, + + // Fraction above baseline that triggers a regression (0.2 = 20%) + "regressionThreshold": 0.2 + }, + "memLeaks": { + // Number of chat messages to send during the leak check + "messages": 10, + + // Max acceptable heap growth per message in MB + "leakThresholdMB": 2 + } +} diff --git a/scripts/chat-simulation/fixtures/arrays.ts b/scripts/chat-simulation/fixtures/arrays.ts new file mode 100644 index 0000000000000..6a871b43e0ce3 --- /dev/null +++ b/scripts/chat-simulation/fixtures/arrays.ts @@ -0,0 +1,84 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. 
+ * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// perf-benchmark-marker + +/** + * Fixture for chat-simulation benchmarks. + * Simplified from src/vs/base/common/arrays.ts for stable perf testing. + */ + +export function coalesce(array: ReadonlyArray): T[] { + return array.filter((e): e is T => e !== undefined && e !== null); +} + +export function groupBy(data: ReadonlyArray, groupFn: (element: T) => string): { [key: string]: T[] } { + const result: { [key: string]: T[] } = {}; + for (const element of data) { + const key = groupFn(element); + (result[key] ??= []).push(element); + } + return result; +} + +export function distinct(array: ReadonlyArray, keyFn: (t: T) => any = t => t): T[] { + const seen = new Set(); + return array.filter(element => { + const key = keyFn(element); + if (seen.has(key)) { return false; } + seen.add(key); + return true; + }); +} + +export function firstOrDefault(array: ReadonlyArray): T | undefined; +export function firstOrDefault(array: ReadonlyArray, defaultValue: T): T; +export function firstOrDefault(array: ReadonlyArray, defaultValue?: T): T | undefined { + return array.length > 0 ? array[0] : defaultValue; +} + +export function lastOrDefault(array: ReadonlyArray): T | undefined; +export function lastOrDefault(array: ReadonlyArray, defaultValue: T): T; +export function lastOrDefault(array: ReadonlyArray, defaultValue?: T): T | undefined { + return array.length > 0 ? 
array[array.length - 1] : defaultValue; +} + +export function binarySearch(array: ReadonlyArray, key: T, comparator: (a: T, b: T) => number): number { + let low = 0; + let high = array.length - 1; + while (low <= high) { + const mid = ((low + high) / 2) | 0; + const comp = comparator(array[mid], key); + if (comp < 0) { low = mid + 1; } + else if (comp > 0) { high = mid - 1; } + else { return mid; } + } + return -(low + 1); +} + +export function insertSorted(array: T[], element: T, comparator: (a: T, b: T) => number): void { + const idx = binarySearch(array, element, comparator); + const insertIdx = idx < 0 ? ~idx : idx; + array.splice(insertIdx, 0, element); +} + +export function flatten(arr: T[][]): T[] { + return ([] as T[]).concat(...arr); +} + +export function range(to: number): number[]; +export function range(from: number, to: number): number[]; +export function range(arg: number, to?: number): number[] { + const from = to !== undefined ? arg : 0; + const end = to !== undefined ? to : arg; + const result: number[] = []; + for (let i = from; i < end; i++) { result.push(i); } + return result; +} + +export function tail(array: T[]): [T[], T] { + if (array.length === 0) { throw new Error('Invalid tail call'); } + return [array.slice(0, array.length - 1), array[array.length - 1]]; +} diff --git a/scripts/chat-simulation/fixtures/async.ts b/scripts/chat-simulation/fixtures/async.ts new file mode 100644 index 0000000000000..7964eea892ece --- /dev/null +++ b/scripts/chat-simulation/fixtures/async.ts @@ -0,0 +1,132 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// perf-benchmark-marker + +/** + * Fixture for chat-simulation benchmarks. 
+ * Simplified from src/vs/base/common/async.ts for stable perf testing. + */ + +import { IDisposable } from './lifecycle'; +import { CancellationError } from './errors'; + +export class Throttler { + private activePromise: Promise | null = null; + private queuedPromiseFactory: (() => Promise) | null = null; + + queue(promiseFactory: () => Promise): Promise { + if (this.activePromise) { + this.queuedPromiseFactory = promiseFactory; + return this.activePromise as Promise; + } + this.activePromise = promiseFactory(); + return this.activePromise.finally(() => { + this.activePromise = null; + if (this.queuedPromiseFactory) { + const factory = this.queuedPromiseFactory; + this.queuedPromiseFactory = null; + return this.queue(factory); + } + }); + } +} + +export class Delayer implements IDisposable { + private timeout: any; + private task: (() => T | Promise) | null = null; + + constructor(public defaultDelay: number) { } + + trigger(task: () => T | Promise, delay: number = this.defaultDelay): Promise { + this.task = task; + this.cancelTimeout(); + return new Promise((resolve, reject) => { + this.timeout = setTimeout(() => { + this.timeout = null; + try { resolve(this.task!()); } catch (e) { reject(e); } + this.task = null; + }, delay); + }); + } + + private cancelTimeout(): void { + if (this.timeout !== null) { + clearTimeout(this.timeout); + this.timeout = null; + } + } + + dispose(): void { + this.cancelTimeout(); + } +} + +export class RunOnceScheduler implements IDisposable { + private runner: (() => void) | null; + private timeout: any; + + constructor(runner: () => void, private delay: number) { + this.runner = runner; + } + + schedule(delay = this.delay): void { + this.cancel(); + this.timeout = setTimeout(() => { + this.timeout = null; + this.runner?.(); + }, delay); + } + + cancel(): void { + if (this.timeout !== null) { + clearTimeout(this.timeout); + this.timeout = null; + } + } + + isScheduled(): boolean { return this.timeout !== null; } + + dispose(): void 
{ + this.cancel(); + this.runner = null; + } +} + +export class Queue { + private readonly queue: Array<() => Promise> = []; + private running = false; + + async enqueue(factory: () => Promise): Promise { + return new Promise((resolve, reject) => { + this.queue.push(() => factory().then(resolve, reject)); + if (!this.running) { this.processQueue(); } + }); + } + + private async processQueue(): Promise { + this.running = true; + while (this.queue.length > 0) { + const task = this.queue.shift()!; + await task(); + } + this.running = false; + } + + get size(): number { return this.queue.length; } +} + +export function timeout(millis: number): Promise { + return new Promise(resolve => setTimeout(resolve, millis)); +} + +export async function retry(task: () => Promise, delay: number, retries: number): Promise { + let lastError: Error | undefined; + for (let i = 0; i < retries; i++) { + try { return await task(); } + catch (error) { lastError = error as Error; await timeout(delay); } + } + throw lastError; +} diff --git a/scripts/chat-simulation/fixtures/errors.ts b/scripts/chat-simulation/fixtures/errors.ts new file mode 100644 index 0000000000000..0446dbb79a69f --- /dev/null +++ b/scripts/chat-simulation/fixtures/errors.ts @@ -0,0 +1,88 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// perf-benchmark-marker + +/** + * Fixture for chat-simulation benchmarks. + * Simplified from src/vs/base/common/errors.ts for stable perf testing. 
+ */ + +export interface ErrorListenerCallback { + (error: any): void; +} + +export interface ErrorListenerUnbind { + (): void; +} + +const _errorListeners: ErrorListenerCallback[] = []; + +export function setUnexpectedErrorHandler(handler: ErrorListenerCallback): void { + _errorListeners.length = 0; + _errorListeners.push(handler); +} + +export function onUnexpectedError(e: any): void { + if (!isCancellationError(e)) { + for (const listener of _errorListeners) { + try { listener(e); } catch { } + } + } +} + +export function onUnexpectedExternalError(e: any): void { + if (!isCancellationError(e)) { + for (const listener of _errorListeners) { + try { listener(e); } catch { } + } + } +} + +export function transformErrorForSerialization(error: any): any { + if (error instanceof Error) { + const { name, message, stack } = error; + return { $isError: true, name, message, stack }; + } + return error; +} + +const canceledName = 'Canceled'; + +export function isCancellationError(error: any): boolean { + if (error instanceof CancellationError) { return true; } + return error instanceof Error && error.name === canceledName && error.message === canceledName; +} + +export class CancellationError extends Error { + constructor() { + super(canceledName); + this.name = this.message; + } +} + +export class NotSupportedError extends Error { + constructor(message?: string) { + super(message || 'NotSupported'); + } +} + +export class NotImplementedError extends Error { + constructor(message?: string) { + super(message || 'NotImplemented'); + } +} + +export class IllegalArgumentError extends Error { + constructor(message?: string) { + super(message || 'Illegal argument'); + } +} + +export class BugIndicatingError extends Error { + constructor(message?: string) { + super(message || 'Bug Indicating Error'); + } +} diff --git a/scripts/chat-simulation/fixtures/event.ts b/scripts/chat-simulation/fixtures/event.ts new file mode 100644 index 0000000000000..6186e9e7042d9 --- /dev/null +++ 
b/scripts/chat-simulation/fixtures/event.ts @@ -0,0 +1,109 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// perf-benchmark-marker + +/** + * Fixture for chat-simulation benchmarks. + * Simplified from src/vs/base/common/event.ts for stable perf testing. + */ + +import { IDisposable, DisposableStore } from './lifecycle'; + +export interface Event { + (listener: (e: T) => any, thisArgs?: any, disposables?: IDisposable[]): IDisposable; +} + +export namespace Event { + export const None: Event = () => ({ dispose() { } }); + + export function once(event: Event): Event { + return (listener, thisArgs?, disposables?) => { + let didFire = false; + const result = event(e => { + if (didFire) { return; } + didFire = true; + return listener.call(thisArgs, e); + }, null, disposables); + if (didFire) { result.dispose(); } + return result; + }; + } + + export function map(event: Event, map: (i: I) => O): Event { + return (listener, thisArgs?, disposables?) => + event(i => listener.call(thisArgs, map(i)), null, disposables); + } + + export function filter(event: Event, filter: (e: T) => boolean): Event { + return (listener, thisArgs?, disposables?) => + event(e => filter(e) && listener.call(thisArgs, e), null, disposables); + } + + export function debounce(event: Event, merge: (last: T | undefined, e: T) => T, delay: number = 100): Event { + let subscription: IDisposable; + let output: T | undefined; + let handle: any; + return (listener, thisArgs?, disposables?) 
=> { + subscription = event(cur => { + output = merge(output, cur); + clearTimeout(handle); + handle = setTimeout(() => { + const e = output!; + output = undefined; + listener.call(thisArgs, e); + }, delay); + }); + return { dispose() { subscription.dispose(); clearTimeout(handle); } }; + }; + } +} + +export class Emitter { + private readonly _listeners = new Set<(e: T) => void>(); + private _disposed = false; + + readonly event: Event = (listener: (e: T) => void) => { + if (this._disposed) { return { dispose() { } }; } + this._listeners.add(listener); + return { + dispose: () => { this._listeners.delete(listener); } + }; + }; + + fire(event: T): void { + if (this._disposed) { return; } + for (const listener of [...this._listeners]) { + try { listener(event); } catch { } + } + } + + dispose(): void { + if (this._disposed) { return; } + this._disposed = true; + this._listeners.clear(); + } + + get hasListeners(): boolean { return this._listeners.size > 0; } +} + +export class PauseableEmitter extends Emitter { + private _isPaused = false; + private _queue: T[] = []; + + pause(): void { this._isPaused = true; } + + resume(): void { + this._isPaused = false; + while (this._queue.length > 0) { + super.fire(this._queue.shift()!); + } + } + + override fire(event: T): void { + if (this._isPaused) { this._queue.push(event); } + else { super.fire(event); } + } +} diff --git a/scripts/chat-simulation/fixtures/lifecycle.ts b/scripts/chat-simulation/fixtures/lifecycle.ts new file mode 100644 index 0000000000000..6f1bd1a16b3c8 --- /dev/null +++ b/scripts/chat-simulation/fixtures/lifecycle.ts @@ -0,0 +1,127 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. 
+ *--------------------------------------------------------------------------------------------*/ + +// perf-benchmark-marker + +/** + * Fixture for chat-simulation benchmarks. + * Simplified from src/vs/base/common/lifecycle.ts for stable perf testing. + */ + +export interface IDisposable { + dispose(): void; +} + +export function isDisposable(thing: T): thing is T & IDisposable { + return typeof (thing as IDisposable).dispose === 'function' + && (thing as IDisposable).dispose.length === 0; +} + +export function dispose(disposable: T): T; +export function dispose(disposable: T | undefined): T | undefined; +export function dispose(disposables: T[]): T[]; +export function dispose(disposables: readonly T[]): readonly T[]; +export function dispose(arg: T | T[] | undefined): any { + if (Array.isArray(arg)) { + const errors: any[] = []; + for (const d of arg) { + try { d.dispose(); } catch (e) { errors.push(e); } + } + if (errors.length > 0) { throw new Error(`Dispose errors: ${errors.length}`); } + return arg; + } else if (arg) { + arg.dispose(); + return arg; + } +} + +export class DisposableStore implements IDisposable { + private readonly _toDispose = new Set(); + private _isDisposed = false; + + dispose(): void { + if (this._isDisposed) { return; } + this._isDisposed = true; + this.clear(); + } + + clear(): void { + if (this._toDispose.size === 0) { return; } + const iter = this._toDispose.values(); + this._toDispose.clear(); + for (const item of iter) { + try { item.dispose(); } catch { } + } + } + + add(o: T): T { + if (this._isDisposed) { + console.warn('Adding to a disposed DisposableStore'); + return o; + } + this._toDispose.add(o); + return o; + } + + get size(): number { return this._toDispose.size; } +} + +export abstract class Disposable implements IDisposable { + private readonly _store = new DisposableStore(); + + dispose(): void { + this._store.dispose(); + } + + protected _register(o: T): T { + return this._store.add(o); + } +} + +export class 
MutableDisposable implements IDisposable { + private _value?: T; + private _isDisposed = false; + + get value(): T | undefined { return this._isDisposed ? undefined : this._value; } + + set value(value: T | undefined) { + if (this._isDisposed || value === this._value) { return; } + this._value?.dispose(); + this._value = value; + } + + dispose(): void { + this._isDisposed = true; + this._value?.dispose(); + this._value = undefined; + } +} + +export class DisposableMap implements IDisposable { + private readonly _map = new Map(); + private _isDisposed = false; + + set(key: K, value: V): void { + const existing = this._map.get(key); + if (existing !== value) { + existing?.dispose(); + this._map.set(key, value); + } + } + + get(key: K): V | undefined { return this._map.get(key); } + + delete(key: K): void { + this._map.get(key)?.dispose(); + this._map.delete(key); + } + + dispose(): void { + if (this._isDisposed) { return; } + this._isDisposed = true; + for (const [, v] of this._map) { v.dispose(); } + this._map.clear(); + } +} diff --git a/scripts/chat-simulation/fixtures/strings.ts b/scripts/chat-simulation/fixtures/strings.ts new file mode 100644 index 0000000000000..4c7ca7637e3bd --- /dev/null +++ b/scripts/chat-simulation/fixtures/strings.ts @@ -0,0 +1,75 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// perf-benchmark-marker + +/** + * Fixture for chat-simulation benchmarks. + * Simplified from src/vs/base/common/strings.ts for stable perf testing. + */ + +export function format(value: string, ...args: any[]): string { + return value.replace(/{(\d+)}/g, (match, index) => { + const i = parseInt(index, 10); + return i >= 0 && i < args.length ? 
`${args[i]}` : match; + }); +} + +export function escape(value: string): string { + return value.replace(/[<>&"']/g, ch => { + switch (ch) { + case '<': return '<'; + case '>': return '>'; + case '&': return '&'; + case '"': return '"'; + case '\'': return '''; + default: return ch; + } + }); +} + +export function trim(value: string, ch: string = ' '): string { + let start = 0; + let end = value.length; + while (start < end && value[start] === ch) { start++; } + while (end > start && value[end - 1] === ch) { end--; } + return value.substring(start, end); +} + +export function equalsIgnoreCase(a: string, b: string): boolean { + return a.length === b.length && a.toLowerCase() === b.toLowerCase(); +} + +export function startsWithIgnoreCase(str: string, candidate: string): boolean { + if (str.length < candidate.length) { return false; } + return str.substring(0, candidate.length).toLowerCase() === candidate.toLowerCase(); +} + +export function commonPrefixLength(a: string, b: string): number { + const len = Math.min(a.length, b.length); + for (let i = 0; i < len; i++) { + if (a.charCodeAt(i) !== b.charCodeAt(i)) { return i; } + } + return len; +} + +export function commonSuffixLength(a: string, b: string): number { + const len = Math.min(a.length, b.length); + for (let i = 0; i < len; i++) { + if (a.charCodeAt(a.length - 1 - i) !== b.charCodeAt(b.length - 1 - i)) { return i; } + } + return len; +} + +export function splitLines(str: string): string[] { + return str.split(/\r\n|\r|\n/); +} + +export function regExpLeadsToEndlessLoop(regexp: RegExp): boolean { + if (regexp.source === '^' || regexp.source === '^$' || regexp.source === '$') { + return false; + } + return !regexp.exec('')?.length; +} diff --git a/scripts/chat-simulation/fixtures/types.ts b/scripts/chat-simulation/fixtures/types.ts new file mode 100644 index 0000000000000..0779f182b26d3 --- /dev/null +++ b/scripts/chat-simulation/fixtures/types.ts @@ -0,0 +1,92 @@ 
+/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +// perf-benchmark-marker + +/** + * Fixture for chat-simulation benchmarks. + * Simplified from src/vs/base/common/types.ts for stable perf testing. + */ + +export function isString(thing: unknown): thing is string { + return typeof thing === 'string'; +} + +export function isNumber(thing: unknown): thing is number { + return typeof thing === 'number' && !isNaN(thing); +} + +export function isBoolean(thing: unknown): thing is boolean { + return thing === true || thing === false; +} + +export function isUndefined(thing: unknown): thing is undefined { + return typeof thing === 'undefined'; +} + +export function isDefined(thing: T | undefined | null): thing is T { + return !isUndefinedOrNull(thing); +} + +export function isUndefinedOrNull(thing: unknown): thing is undefined | null { + return isUndefined(thing) || thing === null; +} + +export function isFunction(thing: unknown): thing is Function { + return typeof thing === 'function'; +} + +export function isObject(thing: unknown): thing is object { + return typeof thing === 'object' + && thing !== null + && !Array.isArray(thing) + && !(thing instanceof RegExp) + && !(thing instanceof Date); +} + +export function isArray(thing: unknown): thing is unknown[] { + return Array.isArray(thing); +} + +export function assertType(condition: unknown, type?: string): asserts condition { + if (!condition) { + throw new Error(type ? 
`Unexpected type, expected '${type}'` : 'Unexpected type'); + } +} + +export function assertIsDefined(thing: T | undefined | null): T { + if (isUndefinedOrNull(thing)) { + throw new Error('Assertion failed: value is undefined or null'); + } + return thing; +} + +export function assertAllDefined(t1: T1 | undefined | null, t2: T2 | undefined | null): [T1, T2] { + return [assertIsDefined(t1), assertIsDefined(t2)]; +} + +export type TypeConstraint = string | Function; + +export function validateConstraints(args: unknown[], constraints: Array): void { + const len = Math.min(args.length, constraints.length); + for (let i = 0; i < len; i++) { + validateConstraint(args[i], constraints[i]); + } +} + +export function validateConstraint(arg: unknown, constraint: TypeConstraint | undefined): void { + if (isString(constraint)) { + if (typeof arg !== constraint) { + throw new Error(`argument does not match constraint: typeof ${constraint}`); + } + } else if (isFunction(constraint)) { + try { + if (arg instanceof constraint) { return; } + } catch { } + if (!isUndefinedOrNull(arg) && (arg as any).constructor === constraint) { return; } + if (constraint.length === 1 && constraint.call(undefined, arg) === true) { return; } + throw new Error('argument does not match one of these constraints: arg instanceof constraint, arg.constructor === constraint, nor constraint(arg) === true'); + } +} diff --git a/scripts/chat-simulation/fixtures/uri.ts b/scripts/chat-simulation/fixtures/uri.ts new file mode 100644 index 0000000000000..8a67bc8065eb6 --- /dev/null +++ b/scripts/chat-simulation/fixtures/uri.ts @@ -0,0 +1,85 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. 
+ *--------------------------------------------------------------------------------------------*/ + +// perf-benchmark-marker + +/** + * Fixture for chat-simulation benchmarks. + * Simplified from src/vs/base/common/uri.ts for stable perf testing. + */ + +const _empty = ''; +const _slash = '/'; + +export class URI { + readonly scheme: string; + readonly authority: string; + readonly path: string; + readonly query: string; + readonly fragment: string; + + private constructor(scheme: string, authority: string, path: string, query: string, fragment: string) { + this.scheme = scheme; + this.authority = authority || _empty; + this.path = path || _empty; + this.query = query || _empty; + this.fragment = fragment || _empty; + } + + static file(path: string): URI { + let authority = _empty; + if (path.length >= 2 && path.charCodeAt(0) === 47 /* / */ && path.charCodeAt(1) === 47 /* / */) { + const idx = path.indexOf(_slash, 2); + if (idx === -1) { + authority = path.substring(2); + path = _slash; + } else { + authority = path.substring(2, idx); + path = path.substring(idx) || _slash; + } + } + return new URI('file', authority, path, _empty, _empty); + } + + static parse(value: string): URI { + const match = /^([a-zA-Z][a-zA-Z0-9+.-]*):\/\/([^/?#]*)([^?#]*)(\?[^#]*)?(#.*)?$/.exec(value); + if (!match) { return new URI(_empty, _empty, _empty, _empty, _empty); } + return new URI(match[1], match[2], match[3], match[4]?.substring(1) || _empty, match[5]?.substring(1) || _empty); + } + + with(change: { scheme?: string; authority?: string; path?: string; query?: string; fragment?: string }): URI { + return new URI( + change.scheme ?? this.scheme, + change.authority ?? this.authority, + change.path ?? this.path, + change.query ?? this.query, + change.fragment ?? 
this.fragment, + ); + } + + toString(): string { + let result = ''; + if (this.scheme) { result += this.scheme + '://'; } + if (this.authority) { result += this.authority; } + if (this.path) { result += this.path; } + if (this.query) { result += '?' + this.query; } + if (this.fragment) { result += '#' + this.fragment; } + return result; + } + + get fsPath(): string { + return this.path; + } + + toJSON(): object { + return { + scheme: this.scheme, + authority: this.authority, + path: this.path, + query: this.query, + fragment: this.fragment, + }; + } +} diff --git a/scripts/chat-simulation/test-chat-mem-leaks.js b/scripts/chat-simulation/test-chat-mem-leaks.js index c11b416b90ca2..a00dc187acdbd 100644 --- a/scripts/chat-simulation/test-chat-mem-leaks.js +++ b/scripts/chat-simulation/test-chat-mem-leaks.js @@ -22,21 +22,25 @@ const fs = require('fs'); const path = require('path'); const { - DATA_DIR, + DATA_DIR, loadConfig, resolveBuild, buildEnv, buildArgs, prepareRunDir, linearRegressionSlope, launchVSCode, } = require('./common/utils'); +// -- Config (edit config.jsonc to change defaults) --------------------------- + +const CONFIG = loadConfig('memLeaks'); + // -- CLI args ---------------------------------------------------------------- function parseArgs() { const args = process.argv.slice(2); const opts = { - messages: 10, + messages: CONFIG.messages ?? 10, verbose: false, /** @type {string | undefined} */ build: undefined, - leakThresholdMB: 2, + leakThresholdMB: CONFIG.leakThresholdMB ?? 
2, }; for (let i = 0; i < args.length; i++) { switch (args[i]) { diff --git a/scripts/chat-simulation/test-chat-perf-regression.js b/scripts/chat-simulation/test-chat-perf-regression.js index 2dff251c20f8a..a438a9a80584f 100644 --- a/scripts/chat-simulation/test-chat-perf-regression.js +++ b/scripts/chat-simulation/test-chat-perf-regression.js @@ -24,21 +24,26 @@ const path = require('path'); const fs = require('fs'); const { - DATA_DIR, METRIC_DEFS, + DATA_DIR, METRIC_DEFS, loadConfig, resolveBuild, buildEnv, buildArgs, prepareRunDir, robustStats, welchTTest, summarize, markDuration, launchVSCode, } = require('./common/utils'); const { getUserTurns, getScenarioIds } = require('./common/mock-llm-server'); const { registerPerfScenarios } = require('./common/perf-scenarios'); +// -- Config (edit config.jsonc to change defaults) --------------------------- + +const CONFIG = loadConfig('perfRegression'); + // -- CLI args ---------------------------------------------------------------- function parseArgs() { const args = process.argv.slice(2); const opts = { - runs: 5, + runs: CONFIG.runsPerScenario ?? 5, verbose: false, ci: false, + noCache: false, /** @type {string[]} */ scenarios: [], /** @type {string | undefined} */ @@ -46,9 +51,9 @@ function parseArgs() { /** @type {string | undefined} */ baseline: undefined, /** @type {string | undefined} */ - baselineBuild: '1.115.0', + baselineBuild: CONFIG.baselineBuild ?? '1.115.0', saveBaseline: false, - threshold: 0.2, + threshold: CONFIG.regressionThreshold ?? 
0.2, /** @type {string | undefined} */ resume: undefined, }; @@ -64,7 +69,8 @@ function parseArgs() { case '--save-baseline': opts.saveBaseline = true; break; case '--threshold': opts.threshold = parseFloat(args[++i]); break; case '--resume': opts.resume = args[++i]; break; - case '--ci': opts.ci = true; break; + case '--no-cache': opts.noCache = true; break; + case '--ci': opts.ci = true; opts.noCache = true; break; case '--help': case '-h': console.log([ 'Chat performance benchmark', @@ -82,7 +88,8 @@ function parseArgs() { ' --resume Resume a previous run, adding more iterations to increase', ' confidence. Merges new runs with existing rawRuns data', ' --threshold Regression threshold fraction (default: 0.2 = 20%)', - ' --ci CI mode: write Markdown summary to ci-summary.md', + ' --no-cache Ignore cached baseline data, always run fresh', + ' --ci CI mode: write Markdown summary to ci-summary.md (implies --no-cache)', ' --verbose Print per-run details', '', 'Scenarios: ' + getScenarioIds().join(', '), @@ -808,7 +815,7 @@ async function main() { if (opts.baselineBuild) { const baselineJsonPath = path.join(runDir, `baseline-${opts.baselineBuild}.json`); const cachedPath = path.join(DATA_DIR, `baseline-${opts.baselineBuild}.json`); - const cachedBaseline = !opts.ci && fs.existsSync(cachedPath) + const cachedBaseline = !opts.noCache && fs.existsSync(cachedPath) ? 
JSON.parse(fs.readFileSync(cachedPath, 'utf-8')) : null; @@ -817,24 +824,39 @@ async function main() { const cachedScenarios = new Set(Object.keys(cachedBaseline.scenarios || {})); const missingScenarios = opts.scenarios.filter((/** @type {string} */ s) => !cachedScenarios.has(s)); - if (missingScenarios.length === 0) { + // Also check if cached scenarios have fewer runs than requested + const shortScenarios = opts.scenarios.filter((/** @type {string} */ s) => { + const cached = cachedBaseline.scenarios?.[s]; + return cached && (cached.rawRuns?.length || 0) < opts.runs; + }); + + if (missingScenarios.length === 0 && shortScenarios.length === 0) { console.log(`[chat-simulation] Using cached baseline for ${opts.baselineBuild}`); fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2)); opts.baseline = baselineJsonPath; } else { - console.log(`[chat-simulation] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`); - console.log(`[chat-simulation] Running baseline for missing scenarios...`); + const scenariosToRun = [...new Set([...missingScenarios, ...shortScenarios])]; + if (missingScenarios.length > 0) { + console.log(`[chat-simulation] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`); + } + if (shortScenarios.length > 0) { + console.log(`[chat-simulation] Cached baseline needs more runs for: ${shortScenarios.map((/** @type {string} */ s) => `${s} (${cachedBaseline.scenarios[s].rawRuns?.length || 0}/${opts.runs})`).join(', ')}`); + } + console.log(`[chat-simulation] Running baseline for ${scenariosToRun.length} scenario(s)...`); const baselineExePath = await resolveBuild(opts.baselineBuild); - for (const scenario of missingScenarios) { + for (const scenario of scenariosToRun) { + const existingRuns = cachedBaseline.scenarios?.[scenario]?.rawRuns || []; + const runsNeeded = opts.runs - existingRuns.length; /** @type {RunMetrics[]} */ - const results = []; - for (let i = 0; i < opts.runs; i++) { - try { 
results.push(await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline')); } + const newResults = []; + for (let i = 0; i < runsNeeded; i++) { + try { newResults.push(await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${existingRuns.length + i}`, runDir, 'baseline')); } catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); } } - if (results.length > 0) { - const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, rawRuns: results }); - for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } + const allRuns = [...existingRuns, ...newResults]; + if (allRuns.length > 0) { + const sd = /** @type {any} */ ({ runs: allRuns.length, timing: {}, memory: {}, rendering: {}, rawRuns: allRuns }); + for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(allRuns.map((/** @type {any} */ r) => r[metric])); } cachedBaseline.scenarios[scenario] = sd; } } @@ -1010,9 +1032,8 @@ async function printComparison(jsonReport, opts) { let flag = ''; if (change > opts.threshold) { if (!ttest) { - flag = ' ← REGRESSION (n too small for significance test)'; - scenarioRegression = true; - regressionFound = true; + flag = ' ← possible regression (n too small for significance test)'; + inconclusiveFound = true; } else if (ttest.significant) { flag = ` ← REGRESSION (p=${ttest.pValue}, ${ttest.confidence} confidence)`; scenarioRegression = true; @@ -1048,10 +1069,37 @@ async function printComparison(jsonReport, opts) { const resultsPath = Object.keys(jsonReport.scenarios).length > 0 ? (jsonReport._resultsPath || opts.resume || 'path/to/results.json') : 'path/to/results.json'; + // Estimate required runs from the observed effect size and variance + // using power analysis for Welch's t-test (alpha=0.05, 80% power). 
+ // n_per_group = 2 * ((z_alpha/2 + z_beta) / d)^2 where d = Cohen's d + let maxNeeded = 0; + for (const scenario of Object.keys(jsonReport.scenarios)) { + const current = jsonReport.scenarios[scenario]; + const base = baseline.scenarios?.[scenario]; + if (!base) { continue; } + for (const [metric, group] of [['timeToFirstToken', 'timing'], ['timeToComplete', 'timing'], ['layoutCount', 'rendering'], ['recalcStyleCount', 'rendering']]) { + const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); + const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); + if (curRaw.length < 2 || basRaw.length < 2) { continue; } + const meanA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / basRaw.length; + const meanB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / curRaw.length; + const varA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanA) ** 2, 0) / (basRaw.length - 1); + const varB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanB) ** 2, 0) / (curRaw.length - 1); + const pooledSD = Math.sqrt((varA + varB) / 2); + if (pooledSD === 0) { continue; } + const d = Math.abs(meanB - meanA) / pooledSD; + if (d === 0) { continue; } + // z_0.025 = 1.96, z_0.2 = 0.842 + const nPerGroup = Math.ceil(2 * ((1.96 + 0.842) / d) ** 2); + const currentN = Math.min(curRaw.length, basRaw.length); + maxNeeded = Math.max(maxNeeded, nPerGroup - currentN); + } + } + const suggestedRuns = Math.max(1, Math.min(maxNeeded, 20)); console.log(''); console.log('[chat-simulation] Some metrics exceeded the threshold but were not statistically significant.'); console.log('[chat-simulation] To increase confidence, add more runs with --resume:'); - console.log(`[chat-simulation] npm run perf:chat -- --resume ${resultsPath} --runs 3`); + 
console.log(`[chat-simulation] npm run perf:chat -- --resume ${resultsPath} --runs ${suggestedRuns}`); } } From 9cfbfd6d6a93b9a5b456e9329865af32ef296e49 Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Tue, 14 Apr 2026 15:08:05 -0700 Subject: [PATCH 06/13] more metrics --- .../chat-simulation/common/perf-scenarios.js | 119 ++++++++++++++++++ scripts/chat-simulation/common/utils.js | 6 + .../test-chat-perf-regression.js | 106 ++++++++++++++-- 3 files changed, 218 insertions(+), 13 deletions(-) diff --git a/scripts/chat-simulation/common/perf-scenarios.js b/scripts/chat-simulation/common/perf-scenarios.js index eaaf615a76f39..17752aba5772f 100644 --- a/scripts/chat-simulation/common/perf-scenarios.js +++ b/scripts/chat-simulation/common/perf-scenarios.js @@ -488,6 +488,125 @@ const MULTI_TURN_SCENARIOS = { }, ], }), + // Stress test: 20+ turn conversation to expose DOM growth, scroll + // performance, and memory accumulation over a long session. + 'long-conversation': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ((() => { + const topics = [ + { question: 'How does the Disposable pattern work?', heading: 'Disposable Pattern', content: 'The `Disposable` base class provides lifecycle management. Subclasses call `this._register()` to track child disposables that are cleaned up automatically when `dispose()` is called.' }, + { question: 'What about DisposableStore?', heading: 'DisposableStore', content: '`DisposableStore` aggregates multiple `IDisposable` instances and disposes them all at once. It tracks whether it has already been disposed and throws if you try to add after disposal.' }, + { question: 'How does the Event system work?', heading: 'Event System', content: 'The `Emitter` class implements the observer pattern. `Event.once()`, `Event.map()`, `Event.filter()`, and `Event.debounce()` provide functional combinators for composing event streams.' 
}, + { question: 'Explain dependency injection', heading: 'Dependency Injection', content: 'Services are injected through constructor parameters decorated with service identifiers. The `IInstantiationService` resolves dependencies recursively, creating singletons within each scope.' }, + { question: 'What is the contribution model?', heading: 'Contribution Model', content: 'Features register functionality through extension points like `Registry.as()`. Contributions are instantiated during specific lifecycle phases.' }, + { question: 'How does the editor handle text models?', heading: 'Text Models', content: 'The `TextModel` class manages document content with line-based storage. It supports undo/redo stacks, bracket matching, tokenization, and change tracking via edit operations.' }, + { question: 'Explain the extension host architecture', heading: 'Extension Host', content: 'Extensions run in a separate process (or worker) called the extension host. Communication happens via an RPC protocol over `IPC`. The main process proxies API calls back to the workbench.' }, + { question: 'How does file watching work?', heading: 'File Watching', content: 'The `IFileService` supports correlated and shared file watchers. Correlated watchers are preferred as they track specific resources. The underlying implementation uses `chokidar` or `parcel/watcher`.' }, + { question: 'What about the tree widget?', heading: 'Tree Widget', content: 'The `AsyncDataTree` and `ObjectTree` provide virtualized tree rendering. They support filtering, sorting, keyboard navigation, and accessibility. The `ITreeRenderer` interface handles element rendering.' }, + { question: 'How does the settings editor work?', heading: 'Settings Editor', content: 'Settings are declared in `package.json` contribution points. The settings editor reads the configuration registry, groups settings by category, and renders appropriate input controls for each type.' 
}, + ]; + + /** @type {import('./mock-llm-server').ScenarioTurn[]} */ + const turns = []; + + // Turn 1: Initial model response (no user turn needed before the first) + const firstTopic = topics[0]; + turns.push({ + kind: 'content', + chunks: new ScenarioBuilder() + .wait(20, `## ${firstTopic.heading}\n\n`) + .stream([ + `${firstTopic.content}\n\n`, + 'Here is a typical example:\n\n', + '```typescript\n', + 'class MyService extends Disposable {\n', + ' private readonly _onDidChange = this._register(new Emitter());\n', + ' readonly onDidChange: Event = this._onDidChange.event;\n\n', + ' constructor(@IFileService private readonly fileService: IFileService) {\n', + ' super();\n', + ' this._register(fileService.onDidFilesChange(e => this._handleChange(e)));\n', + ' }\n', + '}\n', + '```\n\n', + 'Would you like to know more about any specific aspect?\n', + ], 15) + .build(), + }); + + // Turns 2..N: alternating user follow-up + model response + for (let i = 1; i < topics.length; i++) { + const topic = topics[i]; + + // User follow-up + turns.push({ kind: 'user', message: topic.question }); + + // Model response — vary content type to stress different renderers + const b = new ScenarioBuilder(); + b.wait(20, `## ${topic.heading}\n\n`); + + // Main explanation + const sentences = topic.content.split('. '); + b.stream(sentences.map(s => s.endsWith('.') ? s + ' ' : s + '. 
'), 12); + b.emit('\n\n'); + + if (i % 3 === 0) { + // Every 3rd response: large code block + b.emit('```typescript\n'); + for (let j = 0; j < 8; j++) { + b.stream([ + `export class ${topic.heading.replace(/\s/g, '')}Part${j} extends Disposable {\n`, + ` private readonly _state = new Map();\n\n`, + ` process(input: string): string {\n`, + ` const cached = this._state.get(input);\n`, + ` if (cached) { return String(cached); }\n`, + ` const result = input.split('').reverse().join('');\n`, + ` this._state.set(input, result);\n`, + ` return result;\n`, + ` }\n`, + '}\n\n', + ], 5); + } + b.emit('```\n\n'); + } else if (i % 3 === 1) { + // Every 3rd+1 response: bullet list with bold + inline code + b.emit('Key points to remember:\n\n'); + for (let j = 0; j < 6; j++) { + b.stream([ + `${j + 1}. **Point ${j + 1}**: The \`${topic.heading.replace(/\s/g, '')}${j}\` `, + `component uses the standard pattern with \`_register()\` for lifecycle. `, + `It handles edge cases like ${['empty input', 'null references', 'concurrent access', 'circular deps', 'timeout expiry', 'disposal races'][j]}.\n`, + ], 10); + } + b.emit('\n'); + } else { + // Every 3rd+2 response: mixed prose + small code snippet + b.stream([ + 'This pattern is used extensively throughout the codebase. ', + 'The key insight is that resources are always tracked from creation, ', + 'ensuring no leaks even in error paths. ', + 'The ownership chain is explicit and follows the component hierarchy.\n\n', + ], 12); + b.emit('Quick example:\n\n```typescript\n'); + b.stream([ + `const store = new DisposableStore();\n`, + `store.add(event.on(() => { /* handler */ }));\n`, + `store.add(watcher.watch(uri));\n`, + `// Later: store.dispose(); // cleans up everything\n`, + ], 8); + b.emit('```\n\n'); + } + + b.stream([ + `That covers the essentials of **${topic.heading}**. 
`, + 'Let me know if you want to dive deeper into any of these concepts.\n', + ], 15); + + turns.push({ + kind: 'content', + chunks: b.build(), + }); + } + + return { type: 'multi-turn', turns }; + })()), }; // -- Registration helper ------------------------------------------------------ diff --git a/scripts/chat-simulation/common/utils.js b/scripts/chat-simulation/common/utils.js index 1b313aa86f68c..142f151cd969d 100644 --- a/scripts/chat-simulation/common/utils.js +++ b/scripts/chat-simulation/common/utils.js @@ -630,11 +630,17 @@ const METRIC_DEFS = [ ['instructionCollectionTime', 'timing', 'ms'], ['agentInvokeTime', 'timing', 'ms'], ['heapDelta', 'memory', 'MB'], + ['heapDeltaPostGC', 'memory', 'MB'], ['gcDurationMs', 'memory', 'ms'], ['layoutCount', 'rendering', ''], ['recalcStyleCount', 'rendering', ''], ['forcedReflowCount', 'rendering', ''], ['longTaskCount', 'rendering', ''], + ['longAnimationFrameCount', 'rendering', ''], + ['longAnimationFrameTotalMs', 'rendering', 'ms'], + ['frameCount', 'rendering', ''], + ['compositeLayers', 'rendering', ''], + ['paintCount', 'rendering', ''], ]; module.exports = { diff --git a/scripts/chat-simulation/test-chat-perf-regression.js b/scripts/chat-simulation/test-chat-perf-regression.js index a438a9a80584f..9453880f2fe5b 100644 --- a/scripts/chat-simulation/test-chat-perf-regression.js +++ b/scripts/chat-simulation/test-chat-perf-regression.js @@ -115,6 +115,7 @@ function parseArgs() { * heapUsedBefore: number, * heapUsedAfter: number, * heapDelta: number, + * heapDeltaPostGC: number, * majorGCs: number, * minorGCs: number, * gcDurationMs: number, @@ -122,6 +123,11 @@ function parseArgs() { * recalcStyleCount: number, * forcedReflowCount: number, * longTaskCount: number, + * longAnimationFrameCount: number, + * longAnimationFrameTotalMs: number, + * frameCount: number, + * compositeLayers: number, + * paintCount: number, * hasInternalMarks: boolean, * responseHasContent: boolean, * internalFirstToken: number, @@ -263,6 
+269,26 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru await cdp.send('Profiler.enable'); await cdp.send('Profiler.start'); + // Install a PerformanceObserver for Long Animation Frames (LoAF) + // to capture frame-level jank that longTaskCount alone misses. + await window.evaluate(() => { + // @ts-ignore + globalThis._chatLoAFEntries = []; + try { + // @ts-ignore + globalThis._chatLoAFObserver = new PerformanceObserver((list) => { + for (const entry of list.getEntries()) { + // @ts-ignore + globalThis._chatLoAFEntries.push({ duration: entry.duration, startTime: entry.startTime }); + } + }); + // @ts-ignore + globalThis._chatLoAFObserver.observe({ type: 'long-animation-frame', buffered: false }); + } catch { + // long-animation-frame not supported in this build — metrics will be 0 + } + }); + // Start polling for code/chat/* perf marks inside the renderer. // The marks are emitted during the request and cleared immediately // after RequestComplete in the same microtask. We poll rapidly from @@ -415,6 +441,21 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru console.log(` [debug] chatMarks (${chatMarks.length}): ${chatMarks.map((/** @type {any} */ m) => m.name.split('/').slice(-1)[0]).join(', ')}`); } + // Collect Long Animation Frame entries and tear down the observer + const loafData = await window.evaluate(() => { + // @ts-ignore + if (globalThis._chatLoAFObserver) { globalThis._chatLoAFObserver.disconnect(); } + // @ts-ignore + const entries = globalThis._chatLoAFEntries ?? 
[]; + // @ts-ignore + delete globalThis._chatLoAFEntries; + // @ts-ignore + delete globalThis._chatLoAFObserver; + const count = entries.length; + const totalMs = entries.reduce((/** @type {number} */ sum, /** @type {any} */ e) => sum + e.duration, 0); + return { count, totalMs }; + }); + const heapAfter = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); /** @type {Array} */ const traceEvents = []; @@ -446,14 +487,23 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru const instructionCollectionTime = markDuration(chatMarks, 'request/willCollectInstructions', 'request/didCollectInstructions'); const agentInvokeTime = markDuration(chatMarks, 'agent/willInvoke', 'agent/didInvoke'); - // Parse GC/long tasks + // Parse GC events from trace. + // Use the trace-event category and phase fields which are stable + // across V8 versions, rather than matching event name substrings. let majorGCs = 0, minorGCs = 0, gcDurationMs = 0; for (const event of traceEvents) { - if (event.cat === 'v8.gc' || event.name === 'V8.GCFinalizeMC' || event.name === 'V8.GCScavenger') { - if (event.name?.includes('MC') || event.name?.includes('Major') || event.name === 'MajorGC') { majorGCs++; } - else if (event.name?.includes('Scavenger') || event.name?.includes('Minor') || event.name === 'MinorGC') { minorGCs++; } - if (event.dur) { gcDurationMs += event.dur / 1000; } - } + const isGC = event.cat === 'v8.gc' + || event.cat === 'devtools.timeline,v8' + || (typeof event.cat === 'string' && event.cat.split(',').some((/** @type {string} */ c) => c.trim() === 'v8.gc')); + if (!isGC) { continue; } + // Only count complete ('X') or duration-begin ('B') events to + // avoid double-counting begin/end pairs. 
+ if (event.ph && event.ph !== 'X' && event.ph !== 'B') { continue; } + const name = event.name || ''; + if (/Major|MarkCompact|MSC|MC|IncrementalMarking|FinalizeMC/i.test(name)) { majorGCs++; } + else if (/Minor|Scaveng/i.test(name)) { minorGCs++; } + else { minorGCs++; } // default unknown GC events to minor + if (event.dur) { gcDurationMs += event.dur / 1000; } } let longTaskCount = 0; for (const event of traceEvents) { @@ -471,12 +521,30 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru heapUsedBefore: Math.round(heapBefore.usedSize / 1024 / 1024), heapUsedAfter: Math.round(heapAfter.usedSize / 1024 / 1024), heapDelta: Math.round((heapAfter.usedSize - heapBefore.usedSize) / 1024 / 1024), + heapDeltaPostGC: await (async () => { + // Force a full GC then measure heap to get deterministic retained-memory delta. + // --js-flags=--expose-gc is not required: CDP's Runtime.evaluate can call gc() + // when includeCommandLineAPI is true. + try { + await cdp.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true }); + await new Promise(r => setTimeout(r, 200)); + const heapPostGC = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); + return Math.round((heapPostGC.usedSize - heapBefore.usedSize) / 1024 / 1024); + } catch { + return -1; // gc() not available in this build + } + })(), majorGCs, minorGCs, gcDurationMs: Math.round(gcDurationMs * 100) / 100, layoutCount: getMetric(metricsAfter, 'LayoutCount') - getMetric(metricsBefore, 'LayoutCount'), recalcStyleCount: getMetric(metricsAfter, 'RecalcStyleCount') - getMetric(metricsBefore, 'RecalcStyleCount'), forcedReflowCount: getMetric(metricsAfter, 'ForcedStyleRecalcs') - getMetric(metricsBefore, 'ForcedStyleRecalcs'), longTaskCount, + longAnimationFrameCount: loafData.count, + longAnimationFrameTotalMs: Math.round(loafData.totalMs * 100) / 100, + frameCount: getMetric(metricsAfter, 'FrameCount') - getMetric(metricsBefore, 'FrameCount'), + 
compositeLayers: getMetric(metricsAfter, 'CompositeLayers') - getMetric(metricsBefore, 'CompositeLayers'), + paintCount: getMetric(metricsAfter, 'PaintCount') - getMetric(metricsBefore, 'PaintCount'), hasInternalMarks: chatMarks.length > 0, responseHasContent: responseInfo.hasContent, internalFirstToken, @@ -510,10 +578,16 @@ function generateCISummary(jsonReport, baseline, opts) { ['recalcStyleCount', 'rendering', ''], ['forcedReflowCount', 'rendering', ''], ['longTaskCount', 'rendering', ''], + ['longAnimationFrameCount', 'rendering', ''], + ['longAnimationFrameTotalMs', 'rendering', 'ms'], + ['frameCount', 'rendering', ''], + ['compositeLayers', 'rendering', ''], + ['paintCount', 'rendering', ''], ['heapDelta', 'memory', 'MB'], + ['heapDeltaPostGC', 'memory', 'MB'], ['gcDurationMs', 'memory', 'ms'], ]; - const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'layoutCount', 'recalcStyleCount', 'forcedReflowCount', 'longTaskCount']); + const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'layoutCount', 'recalcStyleCount', 'forcedReflowCount', 'longTaskCount', 'longAnimationFrameCount']); const lines = []; const scenarios = Object.keys(jsonReport.scenarios); @@ -624,12 +698,12 @@ function generateCISummary(jsonReport, baseline, opts) { const current = jsonReport.scenarios[scenario]; lines.push(`### ${scenario}`); lines.push(''); - lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | Heap Delta (MB) | Internal Marks |'); - lines.push('|----:|----------:|--------------:|--------:|--------------:|----------------:|:--------------:|'); + lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |'); + lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|'); const runs = current.rawRuns || []; for (let i = 0; i < runs.length; i++) { 
const r = runs[i]; - lines.push(`| ${i + 1} | ${r.timeToFirstToken} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`); + lines.push(`| ${i + 1} | ${r.timeToFirstToken} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs ?? '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`); } lines.push(''); } @@ -639,12 +713,12 @@ function generateCISummary(jsonReport, baseline, opts) { if (!base) { continue; } lines.push(`### ${scenario} (baseline)`); lines.push(''); - lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | Heap Delta (MB) | Internal Marks |'); - lines.push('|----:|----------:|--------------:|--------:|--------------:|----------------:|:--------------:|'); + lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |'); + lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|'); const runs = base.rawRuns || []; for (let i = 0; i < runs.length; i++) { const r = runs[i]; - lines.push(`| ${i + 1} | ${r.timeToFirstToken} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`); + lines.push(`| ${i + 1} | ${r.timeToFirstToken} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs ?? '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 
'yes' : 'no'} |`); } lines.push(''); } @@ -944,9 +1018,15 @@ async function main() { console.log(summarize(results.map(r => r.recalcStyleCount), ' Style recalcs ', '')); console.log(summarize(results.map(r => r.forcedReflowCount), ' Forced reflows ', '')); console.log(summarize(results.map(r => r.longTaskCount), ' Long tasks (>50ms) ', '')); + console.log(summarize(results.map(r => r.longAnimationFrameCount), ' Long anim. frames ', '')); + console.log(summarize(results.map(r => r.longAnimationFrameTotalMs), ' LoAF total duration ', 'ms')); + console.log(summarize(results.map(r => r.frameCount), ' Frames ', '')); + console.log(summarize(results.map(r => r.compositeLayers), ' Composite layers ', '')); + console.log(summarize(results.map(r => r.paintCount), ' Paints ', '')); console.log(''); console.log(' Memory:'); console.log(summarize(results.map(r => r.heapDelta), ' Heap delta ', 'MB')); + console.log(summarize(results.map(r => r.heapDeltaPostGC), ' Heap delta (post-GC) ', 'MB')); console.log(summarize(results.map(r => r.gcDurationMs), ' GC duration ', 'ms')); } From 4d8aad22dade607233a146ab38bb42a8dbcea7ab Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Tue, 14 Apr 2026 15:24:41 -0700 Subject: [PATCH 07/13] update --- .github/workflows/chat-perf.yml | 61 +++++++++++-------- .../chat-simulation/common/mock-llm-server.js | 6 +- scripts/chat-simulation/common/utils.js | 6 ++ .../chat-simulation/test-chat-mem-leaks.js | 24 +++++--- .../test-chat-perf-regression.js | 5 +- 5 files changed, 68 insertions(+), 34 deletions(-) diff --git a/.github/workflows/chat-perf.yml b/.github/workflows/chat-perf.yml index ae841e55f0f13..843a0a5317093 100644 --- a/.github/workflows/chat-perf.yml +++ b/.github/workflows/chat-perf.yml @@ -14,7 +14,7 @@ on: required: true type: string test_commit: - description: 'Test commit SHA or version (e.g. "main", "abc1234")' + description: 'Test commit SHA or version (e.g. 
"1.115.0", "abc1234")' required: true type: string runs: @@ -41,14 +41,15 @@ concurrency: cancel-in-progress: true env: - BASELINE_COMMIT: ${{ inputs.baseline_commit || '1.115.0' }} + # Only set when explicitly provided; otherwise scripts read config.jsonc + BASELINE_COMMIT: ${{ inputs.baseline_commit || '' }} TEST_COMMIT: ${{ inputs.test_commit || 'main' }} - PERF_RUNS: ${{ inputs.runs || 7 }} - PERF_THRESHOLD: ${{ inputs.threshold || 0.2 }} + PERF_RUNS: ${{ inputs.runs || '' }} + PERF_THRESHOLD: ${{ inputs.threshold || '' }} jobs: chat-perf: - name: Chat Perf – ${{ inputs.baseline_commit || '1.115.0' }} vs ${{ inputs.test_commit || 'main' }} + name: Chat Perf – ${{ inputs.baseline_commit || 'config default' }} vs ${{ inputs.test_commit || 'main' }} runs-on: ubuntu-latest timeout-minutes: 120 steps: @@ -101,12 +102,22 @@ jobs: done fi - xvfb-run node scripts/chat-perf/test-chat-perf-regression.js \ - --baseline-build "${{ env.BASELINE_COMMIT }}" \ - --build "${{ env.TEST_COMMIT }}" \ - --runs ${{ env.PERF_RUNS }} \ - --threshold ${{ env.PERF_THRESHOLD }} \ - --ci \ + PERF_ARGS="--ci" + if [[ -n "$BASELINE_COMMIT" ]]; then + PERF_ARGS="$PERF_ARGS --baseline-build $BASELINE_COMMIT" + fi + if [[ -n "$TEST_COMMIT" ]]; then + PERF_ARGS="$PERF_ARGS --build $TEST_COMMIT" + fi + if [[ -n "$PERF_RUNS" ]]; then + PERF_ARGS="$PERF_ARGS --runs $PERF_RUNS" + fi + if [[ -n "$PERF_THRESHOLD" ]]; then + PERF_ARGS="$PERF_ARGS --threshold $PERF_THRESHOLD" + fi + + xvfb-run node scripts/chat-simulation/test-chat-perf-regression.js \ + $PERF_ARGS \ $SCENARIO_ARGS \ 2>&1 | tee perf-output.log @@ -117,11 +128,13 @@ jobs: - name: Run memory leak check id: leak run: | - xvfb-run node scripts/chat-perf/test-chat-mem-leaks.js \ - --build "${{ env.TEST_COMMIT }}" \ - --messages 10 \ - --threshold 2 \ - --verbose \ + LEAK_ARGS="--verbose" + if [[ -n "$TEST_COMMIT" ]]; then + LEAK_ARGS="$LEAK_ARGS --build $TEST_COMMIT" + fi + + xvfb-run node scripts/chat-simulation/test-chat-mem-leaks.js \ + 
$LEAK_ARGS \ 2>&1 | tee leak-output.log exit ${PIPESTATUS[0]} @@ -130,18 +143,18 @@ jobs: - name: Write job summary if: always() run: | - if [[ -f .chat-perf-data/ci-summary.md ]]; then - cat .chat-perf-data/ci-summary.md >> "$GITHUB_STEP_SUMMARY" + if [[ -f .chat-simulation-data/ci-summary.md ]]; then + cat .chat-simulation-data/ci-summary.md >> "$GITHUB_STEP_SUMMARY" else echo "⚠️ No summary file generated. Check perf-output.log artifact." >> "$GITHUB_STEP_SUMMARY" fi - if [[ -f .chat-perf-data/chat-perf-leak-results.json ]]; then + if [[ -f .chat-simulation-data/chat-simulation-leak-results.json ]]; then echo "" >> "$GITHUB_STEP_SUMMARY" echo "## Memory Leak Check" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" echo '```json' >> "$GITHUB_STEP_SUMMARY" - cat .chat-perf-data/chat-perf-leak-results.json >> "$GITHUB_STEP_SUMMARY" + cat .chat-simulation-data/chat-simulation-leak-results.json >> "$GITHUB_STEP_SUMMARY" echo '```' >> "$GITHUB_STEP_SUMMARY" fi @@ -149,15 +162,15 @@ jobs: if: always() run: | # Find the most recent timestamped run directory - RUN_DIR=$(ls -td .chat-perf-data/20*/ 2>/dev/null | head -1) + RUN_DIR=$(ls -td .chat-simulation-data/20*/ 2>/dev/null | head -1) if [[ -n "$RUN_DIR" ]]; then # Zip everything: results JSON, CPU profiles, traces, heap snapshots - cd .chat-perf-data + cd .chat-simulation-data zip -r ../chat-perf-artifacts.zip \ "$(basename "$RUN_DIR")"/ \ ci-summary.md \ baseline-*.json \ - chat-perf-leak-results.json \ + chat-simulation-leak-results.json \ 2>/dev/null || true cd .. 
fi @@ -166,7 +179,7 @@ jobs: if: always() uses: actions/upload-artifact@v7 with: - name: chat-perf-${{ env.BASELINE_COMMIT }}-vs-${{ env.TEST_COMMIT }} + name: chat-perf-${{ env.BASELINE_COMMIT || 'default-baseline' }}-vs-${{ env.TEST_COMMIT }} path: | chat-perf-artifacts.zip perf-output.log diff --git a/scripts/chat-simulation/common/mock-llm-server.js b/scripts/chat-simulation/common/mock-llm-server.js index 4df12376e90c8..3a2c33c353133 100644 --- a/scripts/chat-simulation/common/mock-llm-server.js +++ b/scripts/chat-simulation/common/mock-llm-server.js @@ -876,10 +876,11 @@ function startServer(port = 0) { /** @type {Array<() => boolean>} */ let completionWaiters = []; - serverEvents.on('scenarioCompletion', () => { + const onCompletion = () => { completions++; completionWaiters = completionWaiters.filter(fn => !fn()); - }); + }; + serverEvents.on('scenarioCompletion', onCompletion); const server = http.createServer((req, res) => { reqCount++; @@ -894,6 +895,7 @@ function startServer(port = 0) { port: actualPort, url, close: () => /** @type {Promise} */(new Promise((resolve, reject) => { + serverEvents.removeListener('scenarioCompletion', onCompletion); server.close(err => err ? reject(err) : resolve(undefined)); })), /** Return total request count. */ diff --git a/scripts/chat-simulation/common/utils.js b/scripts/chat-simulation/common/utils.js index 142f151cd969d..db15e83921eed 100644 --- a/scripts/chat-simulation/common/utils.js +++ b/scripts/chat-simulation/common/utils.js @@ -7,6 +7,10 @@ /** * Shared utilities for chat performance benchmarks and leak checks. + * + * Platform: macOS and Linux only. Windows is not supported — several + * utilities (`sqlite3`, `sleep`, `pkill`) are Unix-specific. + * CI runs on ubuntu-latest. 
*/ const path = require('path'); @@ -133,6 +137,8 @@ async function resolveBuild(buildArg) { * Pre-seed the VS Code storage database to prevent the * BuiltinChatExtensionEnablementMigration from disabling the copilot * extension on fresh user data directories. + * + * Requires `sqlite3` on PATH (pre-installed on macOS and Ubuntu). * @param {string} userDataDir */ function preseedStorage(userDataDir) { diff --git a/scripts/chat-simulation/test-chat-mem-leaks.js b/scripts/chat-simulation/test-chat-mem-leaks.js index a00dc187acdbd..3a10f393b621d 100644 --- a/scripts/chat-simulation/test-chat-mem-leaks.js +++ b/scripts/chat-simulation/test-chat-mem-leaks.js @@ -74,11 +74,12 @@ function parseArgs() { */ async function runLeakCheck(electronPath, mockServer, messageCount, verbose) { const { userDataDir, extDir, logsDir } = prepareRunDir('leak-check', mockServer); + const isDevBuild = !electronPath.includes('.vscode-test'); const vscode = await launchVSCode( electronPath, - buildArgs(userDataDir, extDir, logsDir), - buildEnv(mockServer), + buildArgs(userDataDir, extDir, logsDir, { isDevBuild }), + buildEnv(mockServer, { isDevBuild }), { verbose }, ); const window = vscode.page; @@ -136,12 +137,21 @@ async function runLeakCheck(electronPath, mockServer, messageCount, verbose) { }, chatEditorSel); const msg = `[scenario:text-only] Leak check message ${i + 1}`; - await window.evaluate(({ selector, text }) => { - // @ts-ignore — globalThis.driver is injected by --enable-smoke-test-driver - if (!globalThis.driver) { throw new Error('no driver'); } + const hasDriver = await window.evaluate(() => // @ts-ignore - return globalThis.driver.typeInEditor(selector, text); - }, { selector: inputSel, text: msg }); + !!globalThis.driver?.typeInEditor + ).catch(() => false); + + if (hasDriver) { + await window.evaluate(({ selector, text }) => { + // @ts-ignore + return globalThis.driver.typeInEditor(selector, text); + }, { selector: inputSel, text: msg }); + } else { + await 
window.click(inputSel); + await new Promise(r => setTimeout(r, 200)); + await window.locator(inputSel).pressSequentially(msg, { delay: 0 }); + } const compBefore = mockServer.completionCount(); await window.keyboard.press('Enter'); diff --git a/scripts/chat-simulation/test-chat-perf-regression.js b/scripts/chat-simulation/test-chat-perf-regression.js index 9453880f2fe5b..94a7d512ed64c 100644 --- a/scripts/chat-simulation/test-chat-perf-regression.js +++ b/scripts/chat-simulation/test-chat-perf-regression.js @@ -460,8 +460,11 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru /** @type {Array} */ const traceEvents = []; cdp.on('Tracing.dataCollected', (/** @type {any} */ data) => { traceEvents.push(...data.value); }); + const tracingComplete = new Promise(resolve => { + cdp.once('Tracing.tracingComplete', () => resolve(undefined)); + }); await cdp.send('Tracing.end'); - await new Promise(r => setTimeout(r, 500)); + await tracingComplete; const metricsAfter = await cdp.send('Performance.getMetrics'); // Save performance trace (Chrome DevTools format) From 1bb24d5bf630adeaef9739b7df2eb6744cae129e Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Tue, 14 Apr 2026 15:39:41 -0700 Subject: [PATCH 08/13] clean --- .github/skills/chat-perf/SKILL.md | 47 +++++++++-------- .../test-chat-perf-regression.js | 50 +++++++++++++------ 2 files changed, 62 insertions(+), 35 deletions(-) diff --git a/.github/skills/chat-perf/SKILL.md b/.github/skills/chat-perf/SKILL.md index 9592b51649131..a3a7986ca7fde 100644 --- a/.github/skills/chat-perf/SKILL.md +++ b/.github/skills/chat-perf/SKILL.md @@ -1,6 +1,9 @@ -# Chat Performance Testing +--- +name: chat-perf +description: Run chat perf benchmarks and memory leak checks against the local dev build or any published VS Code version. Use when investigating chat rendering regressions, validating perf-sensitive changes to chat UI, or checking for memory leaks in the chat response pipeline. 
+--- -Run chat perf benchmarks and memory leak checks against the local dev build or any published VS Code version. Use when investigating chat rendering regressions, validating perf-sensitive changes to chat UI, or checking for memory leaks in the chat response pipeline. +# Chat Performance Testing ## When to use @@ -38,12 +41,16 @@ Launches VS Code via Playwright Electron, opens the chat panel, sends a message | Flag | Default | Description | |---|---|---| | `--runs ` | `5` | Runs per scenario. More = more stable. Use 5+ for CI. | -| `--scenario ` | all | Scenario to test (repeatable). See scenarios below. | -| `--build ` | local dev | Build to test. Accepts path or version (`1.110.0`, `insiders`). | -| `--baseline-build ` | `1.115.0` | Version to download and compare against. | +| `--scenario ` / `-s` | all | Scenario to test (repeatable). See `common/perf-scenarios.js`. | +| `--build ` / `-b` | local dev | Build to test. Accepts path or version (`1.110.0`, `insiders`, commit hash). | +| `--baseline ` | — | Compare against a previously saved baseline JSON file. | +| `--baseline-build ` | `1.115.0` | Version to download and benchmark as baseline. | | `--no-baseline` | — | Skip baseline comparison entirely. | +| `--save-baseline` | — | Save results as the new baseline (requires `--baseline `). | | `--resume ` | — | Resume a previous run, adding more iterations to increase confidence. | | `--threshold ` | `0.2` | Regression threshold (0.2 = flag if 20% slower). | +| `--no-cache` | — | Ignore cached baseline data, always run fresh. | +| `--ci` | — | CI mode: write Markdown summary to `ci-summary.md` (implies `--no-cache`). | | `--verbose` | — | Print per-run details including response content. 
| ### Comparing two remote builds @@ -87,18 +94,13 @@ Confidence levels reported: `high` (p < 0.01), `medium` (p < 0.05), `low` (p < 0 ### Scenarios -| ID | What it stresses | -|---|---| -| `text-only` | Baseline — plain text response | -| `large-codeblock` | Single TypeScript block with syntax highlighting | -| `many-codeblocks` | 10 fenced code blocks (~600 lines) | -| `many-small-chunks` | 200 small SSE chunks | -| `mixed-content` | Markdown with headers, code blocks, prose | -| `long-prose` | ~3000 words across 15 sections | -| `rich-markdown` | Nested lists, bold, italic, links, blockquotes | -| `giant-codeblock` | Single 200-line TypeScript block | -| `rapid-stream` | 1000 tiny SSE chunks | -| `file-links` | 32 file URI references with line anchors | +Scenarios are defined in `scripts/chat-simulation/common/perf-scenarios.js` and registered via `registerPerfScenarios()`. There are three categories: + +- **Content-only** — plain streaming responses (e.g. `text-only`, `large-codeblock`, `rapid-stream`) +- **Tool-call** — multi-turn scenarios with tool invocations (e.g. `tool-read-file`, `tool-edit-file`) +- **Multi-turn user** — multi-turn conversations with user follow-ups, thinking blocks (e.g. `thinking-response`, `multi-turn-user`, `long-conversation`) + +Run `npm run perf:chat -- --help` to see the full list of registered scenario IDs. ### Metrics collected @@ -121,8 +123,8 @@ Launches one VS Code session, sends N messages sequentially, forces GC between e | Flag | Default | Description | |---|---|---| -| `--messages ` | `10` | Number of messages to send. More = more accurate slope. | -| `--build ` | local dev | Build to test. | +| `--messages ` / `-n` | `10` | Number of messages to send. More = more accurate slope. | +| `--build ` / `-b` | local dev | Build to test. | | `--threshold ` | `2` | Max per-message heap growth in MB. | | `--verbose` | — | Print per-message heap/DOM counts. 
| @@ -144,7 +146,10 @@ Launches one VS Code session, sends N messages sequentially, forces GC between e scripts/chat-simulation/ ├── common/ │ ├── mock-llm-server.js # Mock CAPI server matching @vscode/copilot-api URL structure +│ ├── perf-scenarios.js # Built-in scenario definitions (content, tool-call, multi-turn) │ └── utils.js # Shared: paths, env setup, stats, launch helpers +├── config.jsonc # Default config (baseline version, runs, thresholds) +├── fixtures/ # TypeScript fixture files used by tool-call scenarios ├── test-chat-perf-regression.js └── test-chat-mem-leaks.js ``` @@ -163,6 +168,6 @@ The copilot extension connects to this server via `IS_SCENARIO_AUTOMATION=1` mod ### Adding a scenario -1. Add a new entry to the `SCENARIOS` object in `common/mock-llm-server.js` — an array of string chunks that will be streamed as SSE -2. Add the scenario ID to the `SCENARIOS` array in `common/utils.js` +1. Add a new entry to the appropriate object (`CONTENT_SCENARIOS`, `TOOL_CALL_SCENARIOS`, or `MULTI_TURN_SCENARIOS`) in `common/perf-scenarios.js` using the `ScenarioBuilder` API from `common/mock-llm-server.js` +2. The scenario is auto-registered by `registerPerfScenarios()` — no manual ID list to update 3. Run: `npm run perf:chat -- --scenario your-new-scenario --runs 1 --no-baseline --verbose` diff --git a/scripts/chat-simulation/test-chat-perf-regression.js b/scripts/chat-simulation/test-chat-perf-regression.js index 94a7d512ed64c..0a2cf6cbb6013 100644 --- a/scripts/chat-simulation/test-chat-perf-regression.js +++ b/scripts/chat-simulation/test-chat-perf-regression.js @@ -289,25 +289,43 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru } }); - // Start polling for code/chat/* perf marks inside the renderer. - // The marks are emitted during the request and cleared immediately - // after RequestComplete in the same microtask. We poll rapidly from - // the page context to capture them before they're cleared. 
+ // Use a PerformanceObserver to capture code/chat/* marks as they're + // emitted. This is event-driven (no polling) and captures marks + // even if they're cleared immediately after emission. await window.evaluate(() => { // @ts-ignore globalThis._chatPerfCapture = []; - // @ts-ignore - globalThis._chatPerfPollId = setInterval(() => { + try { + // @ts-ignore + globalThis._chatPerfObserver = new PerformanceObserver((list) => { + for (const entry of list.getEntries()) { + if (entry.name.startsWith('code/chat/')) { + const timeOrigin = performance.timeOrigin ?? 0; + // @ts-ignore + globalThis._chatPerfCapture.push({ + name: entry.name, + startTime: Math.round(timeOrigin + entry.startTime), + }); + } + } + }); // @ts-ignore - const marks = globalThis.MonacoPerformanceMarks?.getMarks() ?? []; - for (const m of marks) { + globalThis._chatPerfObserver.observe({ type: 'mark', buffered: false }); + } catch { + // PerformanceObserver not available — fall back to polling + // @ts-ignore + globalThis._chatPerfPollId = setInterval(() => { // @ts-ignore - if (m.name.startsWith('code/chat/') && !globalThis._chatPerfCapture.some(c => c.name === m.name)) { + const marks = globalThis.MonacoPerformanceMarks?.getMarks() ?? 
[]; + for (const m of marks) { // @ts-ignore - globalThis._chatPerfCapture.push({ name: m.name, startTime: m.startTime }); + if (m.name.startsWith('code/chat/') && !globalThis._chatPerfCapture.some(c => c.name === m.name)) { + // @ts-ignore + globalThis._chatPerfCapture.push({ name: m.name, startTime: m.startTime }); + } } - } - }, 16); // poll every frame (~60fps) + }, 16); + } }); // Submit @@ -425,15 +443,19 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru console.log(` [debug] Client-side timing: firstResponse=${firstResponseTime - submitTime}ms, complete=${responseCompleteTime - submitTime}ms`); } - // Collect perf marks from our polling capture and stop the poll + // Collect perf marks and tear down the observer/poll const chatMarks = await window.evaluate(() => { // @ts-ignore - clearInterval(globalThis._chatPerfPollId); + if (globalThis._chatPerfObserver) { globalThis._chatPerfObserver.disconnect(); } + // @ts-ignore + if (globalThis._chatPerfPollId) { clearInterval(globalThis._chatPerfPollId); } // @ts-ignore const marks = globalThis._chatPerfCapture ?? 
[]; // @ts-ignore delete globalThis._chatPerfCapture; // @ts-ignore + delete globalThis._chatPerfObserver; + // @ts-ignore delete globalThis._chatPerfPollId; return marks; }); From 3bd7ba3425b5d889d2a9b3fd171725477438d079 Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Tue, 14 Apr 2026 16:06:03 -0700 Subject: [PATCH 09/13] pipeline fix --- .github/workflows/chat-perf.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/chat-perf.yml b/.github/workflows/chat-perf.yml index 843a0a5317093..febbcf2b0a433 100644 --- a/.github/workflows/chat-perf.yml +++ b/.github/workflows/chat-perf.yml @@ -43,20 +43,18 @@ concurrency: env: # Only set when explicitly provided; otherwise scripts read config.jsonc BASELINE_COMMIT: ${{ inputs.baseline_commit || '' }} - TEST_COMMIT: ${{ inputs.test_commit || 'main' }} + TEST_COMMIT: ${{ inputs.test_commit || '' }} PERF_RUNS: ${{ inputs.runs || '' }} PERF_THRESHOLD: ${{ inputs.threshold || '' }} jobs: chat-perf: - name: Chat Perf – ${{ inputs.baseline_commit || 'config default' }} vs ${{ inputs.test_commit || 'main' }} + name: Chat Perf – ${{ inputs.baseline_commit || 'config default' }} vs ${{ inputs.test_commit || github.sha }} runs-on: ubuntu-latest timeout-minutes: 120 steps: - name: Checkout test commit uses: actions/checkout@v6 - with: - ref: ${{ env.TEST_COMMIT }} - name: Setup Node.js uses: actions/setup-node@v6 From 1649a5dec7507a6349fd4ce9c6421be1b3de8e35 Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Tue, 14 Apr 2026 17:22:27 -0700 Subject: [PATCH 10/13] fix --- .github/workflows/chat-perf.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/chat-perf.yml b/.github/workflows/chat-perf.yml index febbcf2b0a433..4f7ed59bf9342 100644 --- a/.github/workflows/chat-perf.yml +++ b/.github/workflows/chat-perf.yml @@ -86,6 +86,9 @@ jobs: - name: Transpile source run: npm run transpile-client + - name: Download Electron + run: node build/lib/preLaunch.ts + - name: Install 
Playwright Chromium run: npx playwright install chromium From df6478ccd722d67460eec7b5fdd63c4d6102582a Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Tue, 14 Apr 2026 18:14:22 -0700 Subject: [PATCH 11/13] updates --- .github/workflows/chat-perf.yml | 6 +- .../chat-simulation/common/mock-llm-server.js | 51 ++- .../chat-simulation/common/perf-scenarios.js | 107 +++++ scripts/chat-simulation/common/utils.js | 123 +++++- scripts/chat-simulation/config.jsonc | 11 +- .../chat-simulation/test-chat-mem-leaks.js | 385 +++++++++++++----- .../test-chat-perf-regression.js | 165 +++++++- 7 files changed, 709 insertions(+), 139 deletions(-) diff --git a/.github/workflows/chat-perf.yml b/.github/workflows/chat-perf.yml index 4f7ed59bf9342..307b0cfd23b65 100644 --- a/.github/workflows/chat-perf.yml +++ b/.github/workflows/chat-perf.yml @@ -5,8 +5,8 @@ on: paths: - '.github/workflows/chat-perf.yml' schedule: - # Every Friday at 12:00 AM PT (07:00 UTC) - - cron: '0 7 * * 5' + # Nightly at 12:00 AM PT (07:00 UTC) + - cron: '0 7 * * *' workflow_dispatch: inputs: baseline_commit: @@ -49,7 +49,7 @@ env: jobs: chat-perf: - name: Chat Perf – ${{ inputs.baseline_commit || 'config default' }} vs ${{ inputs.test_commit || github.sha }} + name: Chat Perf runs-on: ubuntu-latest timeout-minutes: 120 steps: diff --git a/scripts/chat-simulation/common/mock-llm-server.js b/scripts/chat-simulation/common/mock-llm-server.js index 3a2c33c353133..2f64c5deaeca7 100644 --- a/scripts/chat-simulation/common/mock-llm-server.js +++ b/scripts/chat-simulation/common/mock-llm-server.js @@ -445,7 +445,7 @@ function handleRequest(req, res) { tokenizer: 'o200k_base', limits: { max_prompt_tokens: 128000, - max_output_tokens: 16384, + max_output_tokens: 131072, max_context_window_tokens: 128000, }, supports: { @@ -472,7 +472,7 @@ function handleRequest(req, res) { tokenizer: 'o200k_base', limits: { max_prompt_tokens: 128000, - max_output_tokens: 16384, + max_output_tokens: 131072, max_context_window_tokens: 128000, 
}, supports: { @@ -508,7 +508,7 @@ function handleRequest(req, res) { type: 'chat', family: 'gpt-4o', tokenizer: 'o200k_base', - limits: { max_prompt_tokens: 128000, max_output_tokens: 16384, max_context_window_tokens: 128000 }, + limits: { max_prompt_tokens: 128000, max_output_tokens: 131072, max_context_window_tokens: 128000 }, supports: { streaming: true, tool_calls: true, parallel_tool_calls: true, vision: false }, }, }); @@ -599,20 +599,36 @@ const serverEvents = new EventEmitter(); const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms)); /** - * Count the number of model turns already completed in the conversation. - * A model turn is one of: - * - An assistant message with tool_calls (tool-calls turn) - * - An assistant message with content but no tool_calls (content/thinking turn) - * The first assistant message after each user message counts as a new model - * turn. User turns in the scenario are detected by counting user messages - * beyond the initial one. + * Count the number of model turns already completed for the CURRENT scenario. + * Only counts assistant messages that appear after the last user message + * containing a [scenario:X] tag. This prevents assistant messages from + * previous scenarios (in the same chat session) from inflating the count. + * * @param {any[]} messages * @returns {number} */ function countCompletedModelTurns(messages) { + // Find the index of the last user message with a scenario tag + let scenarioMsgIdx = -1; + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role !== 'user') { continue; } + const content = typeof msg.content === 'string' + ? msg.content + : Array.isArray(msg.content) + ? 
msg.content.map((/** @type {any} */ c) => c.text || '').join('') + : ''; + if (/\[scenario:[^\]]+\]/.test(content)) { + scenarioMsgIdx = i; + break; + } + } + + // Count assistant messages after the scenario tag message let turns = 0; - for (const msg of messages) { - if (msg.role === 'assistant') { + const startIdx = scenarioMsgIdx >= 0 ? scenarioMsgIdx + 1 : 0; + for (let i = startIdx; i < messages.length; i++) { + if (messages[i].role === 'assistant') { turns++; } } @@ -680,9 +696,14 @@ async function handleChatCompletions(body, res) { console.log(`[mock-llm] ${ts} → ${requestToolNames.length} tools available: ${requestToolNames.join(', ')}`); } - // Search all user messages for the scenario tag (not just the last one, - // since follow-up user messages in multi-turn scenarios won't have it). - for (const msg of messages) { + // Search user messages in reverse order (newest first) for the scenario + // tag. This ensures the most recent message's tag takes precedence when + // multiple messages with different tags exist in the same conversation + // (e.g. in the leak checker which sends many scenarios in one session). + // Follow-up user messages in multi-turn scenarios won't have a tag, so + // searching backwards still finds the correct tag from the initial message. + for (let mi = messages.length - 1; mi >= 0; mi--) { + const msg = messages[mi]; if (msg.role !== 'user') { continue; } const content = typeof msg.content === 'string' ? msg.content diff --git a/scripts/chat-simulation/common/perf-scenarios.js b/scripts/chat-simulation/common/perf-scenarios.js index 17752aba5772f..721a3fb5c6514 100644 --- a/scripts/chat-simulation/common/perf-scenarios.js +++ b/scripts/chat-simulation/common/perf-scenarios.js @@ -357,6 +357,113 @@ const TOOL_CALL_SCENARIOS = { ], }; })()), + + // Terminal tool: run commands, read output, run more commands. + // Simulates an agent installing dependencies, running tests, and + // diagnosing failures — a common agentic workflow. 
+ 'tool-terminal': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ({ + type: 'multi-turn', + turns: [ + // Round 1: run initial commands (install + build) + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /run.?in.?terminal|execute.?command|terminal/i, + arguments: { + command: 'echo "Installing dependencies..." && echo "added 1631 packages in 6m"', + explanation: 'Install project dependencies', + goal: 'Install dependencies', + mode: 'sync', + timeout: 30000, + }, + }, + ], + }, + // Round 2: run test command + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /run.?in.?terminal|execute.?command|terminal/i, + arguments: { + command: 'echo "Running unit tests..." && echo " 42 passing (3s)" && echo " 2 failing" && echo "" && echo " 1) ChatService should dispose listeners" && echo " AssertionError: expected 0 to equal 1" && echo " 2) ChatModel should clear on new session" && echo " TypeError: Cannot read property dispose of undefined"', + explanation: 'Run the unit test suite to check for failures', + goal: 'Run tests', + mode: 'sync', + timeout: 60000, + }, + }, + ], + }, + // Round 3: read the failing test file for context + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /read.?file/i, + arguments: { filePath: path.join(FIXTURES_DIR, 'lifecycle.ts'), startLine: 1, endLine: 50 }, + }, + ], + }, + // Round 4: fix the issue with an edit + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, + arguments: { + filePath: path.join(FIXTURES_DIR, 'lifecycle.ts'), + oldString: '// perf-benchmark-marker', + newString: '// perf-benchmark-marker (fixed)', + explanation: 'Fix the dispose call in the test', + }, + }, + ], + }, + // Round 5: re-run tests to confirm + { + kind: 'tool-calls', + toolCalls: [ + { + toolNamePattern: /run.?in.?terminal|execute.?command|terminal/i, + arguments: { + command: 'echo "Running unit tests..." 
&& echo " 44 passing (3s)" && echo " 0 failing"', + explanation: 'Re-run tests to verify the fix', + goal: 'Verify fix', + mode: 'sync', + timeout: 60000, + }, + }, + ], + }, + // Round 6: final summary + { + kind: 'content', + chunks: new ScenarioBuilder() + .wait(20, '## Test Failures Fixed\n\n') + .stream([ + 'I found and fixed 2 test failures:\n\n', + '### Root Cause\n', + 'The `ChatService` was not properly disposing event listeners when a session was cleared. ', + 'The `dispose()` method was missing a call to `this._store.dispose()`.\n\n', + '### Changes Made\n', + 'Updated `lifecycle.ts` to properly chain disposal:\n\n', + '```typescript\n', + 'override dispose(): void {\n', + ' this._store.dispose();\n', + ' super.dispose();\n', + '}\n', + '```\n\n', + '### Test Results\n', + '- **Before**: 42 passing, 2 failing\n', + '- **After**: 44 passing, 0 failing\n\n', + 'All tests pass now. The fix ensures listeners are cleaned up during session transitions.\n', + ], 15) + .build(), + }, + ], + }), }; // -- Multi-turn user conversation scenarios ----------------------------------- diff --git a/scripts/chat-simulation/common/utils.js b/scripts/chat-simulation/common/utils.js index db15e83921eed..429154dc04f6a 100644 --- a/scripts/chat-simulation/common/utils.js +++ b/scripts/chat-simulation/common/utils.js @@ -193,7 +193,7 @@ function buildEnv(mockServer, { isDevBuild = true } = {}) { * @param {string} logsDir * @returns {string[]} */ -function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true } = {}) { +function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true, extHostInspectPort = 0 } = {}) { const args = [ ROOT, '--skip-release-notes', @@ -213,6 +213,13 @@ function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true } = {}) { if (process.platform !== 'darwin') { args.push('--disable-gpu'); } + if (process.env.CI && process.platform === 'linux') { + args.push('--no-sandbox'); + } + // Enable extension host inspector for profiling/heap 
snapshots + if (extHostInspectPort > 0) { + args.push(`--inspect-extensions=${extHostInspectPort}`); + } return args; } @@ -228,6 +235,8 @@ function writeSettings(userDataDir, mockServer) { 'github.copilot.advanced.debug.overrideProxyUrl': mockServer.url, 'github.copilot.advanced.debug.overrideCapiUrl': mockServer.url, 'chat.allowAnonymousAccess': true, + // Start new chat sessions in agent mode so tools are available. + 'chat.newSession.defaultMode': 'agent', // Disable MCP servers — they start async and add unpredictable // delay that pollutes perf measurements. 'chat.mcp.discovery.enabled': false, @@ -275,6 +284,112 @@ function prepareRunDir(runId, mockServer) { // -- VS Code launch via CDP -------------------------------------------------- +// -- Extension host inspector ------------------------------------------------ + +/** @type {number} */ +let nextExtHostPort = 29222; + +/** @returns {number} */ +function getNextExtHostInspectPort() { + return nextExtHostPort++; +} + +/** + * Connect to the extension host's Node inspector via WebSocket. + * The extension host must be started with `--inspect-extensions=`. 
+ * + * @param {number} port + * @param {{ verbose?: boolean, timeoutMs?: number }} [opts] + * @returns {Promise<{ send: (method: string, params?: any) => Promise, on: (event: string, listener: (params: any) => void) => void, close: () => void, port: number }>} + */ +async function connectToExtHostInspector(port, opts = {}) { + const { verbose = false, timeoutMs = 30_000 } = opts; + + // Wait for the inspector endpoint to be available + const deadline = Date.now() + timeoutMs; + /** @type {any} */ + let wsUrl; + while (Date.now() < deadline) { + try { + const targets = await getJson(`http://127.0.0.1:${port}/json`); + if (targets.length > 0 && targets[0].webSocketDebuggerUrl) { + wsUrl = targets[0].webSocketDebuggerUrl; + break; + } + } catch { } + await new Promise(r => setTimeout(r, 500)); + } + if (!wsUrl) { + throw new Error(`Timed out waiting for extension host inspector on port ${port}`); + } + + if (verbose) { + console.log(` [ext-host] Connected to inspector: ${wsUrl}`); + } + + const WebSocket = require('ws'); + const ws = new WebSocket(wsUrl); + await new Promise((resolve, reject) => { + ws.once('open', resolve); + ws.once('error', reject); + }); + + let msgId = 1; + /** @type {Map void, reject: (e: Error) => void }>} */ + const pending = new Map(); + /** @type {Map void)[]>} */ + const eventListeners = new Map(); + + ws.on('message', (/** @type {Buffer} */ data) => { + const msg = JSON.parse(data.toString()); + if (msg.id !== undefined) { + const p = pending.get(msg.id); + if (p) { + pending.delete(msg.id); + if (msg.error) { p.reject(new Error(msg.error.message)); } + else { p.resolve(msg.result); } + } + } else if (msg.method) { + const listeners = eventListeners.get(msg.method) || []; + for (const listener of listeners) { listener(msg.params); } + } + }); + + return { + port, + /** + * @param {string} method + * @param {any} [params] + * @returns {Promise} + */ + send(method, params) { + return new Promise((resolve, reject) => { + const id = msgId++; 
+ pending.set(id, { resolve, reject }); + ws.send(JSON.stringify({ id, method, params })); + setTimeout(() => { + if (pending.has(id)) { + pending.delete(id); + reject(new Error(`Inspector call timed out: ${method}`)); + } + }, 30_000); + }); + }, + /** + * @param {string} event + * @param {(params: any) => void} listener + */ + on(event, listener) { + const list = eventListeners.get(event) || []; + list.push(listener); + eventListeners.set(event, list); + }, + close() { + ws.close(); + }, + }; +} + /** * Fetch JSON from a URL. Used to probe the CDP endpoint. * @param {string} url @@ -647,6 +762,10 @@ const METRIC_DEFS = [ ['frameCount', 'rendering', ''], ['compositeLayers', 'rendering', ''], ['paintCount', 'rendering', ''], + ['extHostHeapUsedBefore', 'extHost', 'MB'], + ['extHostHeapUsedAfter', 'extHost', 'MB'], + ['extHostHeapDelta', 'extHost', 'MB'], + ['extHostHeapDeltaPostGC', 'extHost', 'MB'], ]; module.exports = { @@ -670,4 +789,6 @@ module.exports = { summarize, markDuration, launchVSCode, + getNextExtHostInspectPort, + connectToExtHostInspector, }; diff --git a/scripts/chat-simulation/config.jsonc b/scripts/chat-simulation/config.jsonc index ec758bbef11df..3a4d8b29bda3d 100644 --- a/scripts/chat-simulation/config.jsonc +++ b/scripts/chat-simulation/config.jsonc @@ -10,10 +10,13 @@ "regressionThreshold": 0.2 }, "memLeaks": { - // Number of chat messages to send during the leak check - "messages": 10, + // Number of open→work→reset cycles + "iterations": 3, - // Max acceptable heap growth per message in MB - "leakThresholdMB": 2 + // Max acceptable total residual heap growth in MB. + // Each iteration cycles through ALL scenarios (text, code blocks, + // tool calls, thinking, terminal, multi-turn, etc.), so this needs + // to account for V8 internal caches that aren't immediately reclaimed. 
+ "leakThresholdMB": 10 } } diff --git a/scripts/chat-simulation/test-chat-mem-leaks.js b/scripts/chat-simulation/test-chat-mem-leaks.js index 3a10f393b621d..5abcec9c8f562 100644 --- a/scripts/chat-simulation/test-chat-mem-leaks.js +++ b/scripts/chat-simulation/test-chat-mem-leaks.js @@ -6,17 +6,27 @@ // @ts-check /** - * Chat memory leak checker. + * Chat memory leak checker — state-based approach. * - * Sends multiple messages in a single VS Code session and tracks renderer - * heap and DOM node count after each message with forced GC. Uses linear - * regression to detect monotonic growth that indicates a memory leak. + * The idea: if you return to the same state you started from, memory should + * return to roughly the same level. Any residual growth is a potential leak. + * + * Each iteration: + * 1. Open a fresh chat (baseline state) + * 2. Measure heap + DOM nodes + * 3. Cycle through ALL registered perf scenarios (text, code blocks, + * tool calls, thinking, multi-turn, etc.) + * 4. Open a new chat (return to baseline state — clears previous session) + * 5. Measure heap + DOM nodes again + * 6. The delta is the "leaked" memory for that iteration + * + * Multiple iterations let us detect consistent leaks vs. one-time caching. 
* * Usage: - * npm run perf:chat-leak # 10 messages, 2MB/msg threshold - * npm run perf:chat-leak -- --messages 20 # more messages for accuracy - * npm run perf:chat-leak -- --threshold 1 # stricter (1MB/msg) - * npm run perf:chat-leak -- --build 1.115.0 # test a specific build + * npm run perf:chat-leak # defaults from config + * npm run perf:chat-leak -- --iterations 5 # more iterations + * npm run perf:chat-leak -- --threshold 5 # 5MB total threshold + * npm run perf:chat-leak -- --build 1.115.0 # test a specific build */ const fs = require('fs'); @@ -24,8 +34,14 @@ const path = require('path'); const { DATA_DIR, loadConfig, resolveBuild, buildEnv, buildArgs, prepareRunDir, - linearRegressionSlope, launchVSCode, + launchVSCode, } = require('./common/utils'); +const { + CONTENT_SCENARIOS, TOOL_CALL_SCENARIOS, MULTI_TURN_SCENARIOS, +} = require('./common/perf-scenarios'); +const { + getUserTurns, getModelTurnCount, +} = require('./common/mock-llm-server'); // -- Config (edit config.jsonc to change defaults) --------------------------- @@ -36,27 +52,30 @@ const CONFIG = loadConfig('memLeaks'); function parseArgs() { const args = process.argv.slice(2); const opts = { - messages: CONFIG.messages ?? 10, + iterations: CONFIG.iterations ?? 3, + messages: CONFIG.messages ?? 5, verbose: false, /** @type {string | undefined} */ build: undefined, - leakThresholdMB: CONFIG.leakThresholdMB ?? 2, + leakThresholdMB: CONFIG.leakThresholdMB ?? 
5, }; for (let i = 0; i < args.length; i++) { switch (args[i]) { + case '--iterations': opts.iterations = parseInt(args[++i], 10); break; case '--messages': case '-n': opts.messages = parseInt(args[++i], 10); break; case '--verbose': opts.verbose = true; break; case '--build': case '-b': opts.build = args[++i]; break; case '--threshold': opts.leakThresholdMB = parseFloat(args[++i]); break; case '--help': case '-h': console.log([ - 'Chat memory leak checker', + 'Chat memory leak checker (state-based)', '', 'Options:', - ' --messages Number of messages to send (default: 10)', + ' --iterations Number of open→work→reset cycles (default: 3)', + ' --messages Messages to send per iteration (default: 5)', ' --build Path to VS Code build or version to download', - ' --threshold Max per-message heap growth in MB (default: 2)', - ' --verbose Print per-message details', + ' --threshold Max total residual heap growth in MB (default: 5)', + ' --verbose Print per-step details', ].join('\n')); process.exit(0); } @@ -64,15 +83,169 @@ function parseArgs() { return opts; } +// -- Scenario list ----------------------------------------------------------- + +/** + * Build a flat list of scenario IDs to cycle through during leak testing. + * Includes all scenario types: content-only, tool-call, and multi-turn. + * + * Content scenarios exercise varied rendering (code blocks, markdown, etc.). + * Tool-call scenarios exercise the agent loop (model → tool → model → ...). + * Multi-turn scenarios exercise user follow-ups and thinking blocks. 
+ */ +function getScenarioIds() { + return [ + ...Object.keys(CONTENT_SCENARIOS), + ...Object.keys(TOOL_CALL_SCENARIOS), + ...Object.keys(MULTI_TURN_SCENARIOS), + ]; +} + +// -- Helpers ----------------------------------------------------------------- + +const CHAT_VIEW = 'div[id="workbench.panel.chat"]'; +const CHAT_EDITOR_SEL = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`; + +/** + * Measure heap (MB) and DOM node count after forced GC. + * @param {any} cdp + * @param {import('playwright').Page} page + */ +async function measure(cdp, page) { + await cdp.send('HeapProfiler.collectGarbage'); + await new Promise(r => setTimeout(r, 500)); + await cdp.send('HeapProfiler.collectGarbage'); + await new Promise(r => setTimeout(r, 300)); + const heapInfo = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); + const heapMB = Math.round(heapInfo.usedSize / 1024 / 1024 * 100) / 100; + const domNodes = await page.evaluate(() => document.querySelectorAll('*').length); + return { heapMB, domNodes }; +} + +/** + * Open a new chat session via the command palette. + * @param {import('playwright').Page} page + */ +async function openNewChat(page) { + // Use keyboard shortcut to open a new chat (clears previous session) + const newChatShortcut = process.platform === 'darwin' ? 'Meta+KeyL' : 'Control+KeyL'; + await page.keyboard.press(newChatShortcut); + await new Promise(r => setTimeout(r, 1000)); + + // Verify the chat view is visible and ready + await page.waitForSelector(CHAT_VIEW, { timeout: 15_000 }); + await page.waitForFunction( + (sel) => Array.from(document.querySelectorAll(sel)).some(el => el.getBoundingClientRect().width > 0), + CHAT_EDITOR_SEL, { timeout: 15_000 }, + ); + await new Promise(r => setTimeout(r, 500)); +} + +/** + * Send a single message and wait for the response to complete. 
+ * For multi-turn scenarios where the model makes multiple tool-call rounds + * before producing content, `modelTurns` controls how many completions to + * wait for. + * @param {import('playwright').Page} page + * @param {{ completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise }} mockServer + * @param {string} text + * @param {number} [modelTurns=1] - number of model completions to wait for + */ +async function sendMessage(page, mockServer, text, modelTurns = 1) { + await page.click(CHAT_EDITOR_SEL); + await new Promise(r => setTimeout(r, 200)); + + const inputSel = await page.evaluate((editorSel) => { + const ed = document.querySelector(editorSel); + if (!ed) { throw new Error('no editor'); } + return ed.querySelector('.native-edit-context') ? editorSel + ' .native-edit-context' : editorSel + ' textarea'; + }, CHAT_EDITOR_SEL); + + const hasDriver = await page.evaluate(() => + // @ts-ignore + !!globalThis.driver?.typeInEditor + ).catch(() => false); + + if (hasDriver) { + await page.evaluate(({ selector, t }) => { + // @ts-ignore + return globalThis.driver.typeInEditor(selector, t); + }, { selector: inputSel, t: text }); + } else { + await page.click(inputSel); + await new Promise(r => setTimeout(r, 200)); + await page.locator(inputSel).pressSequentially(text, { delay: 0 }); + } + + const compBefore = mockServer.completionCount(); + await page.keyboard.press('Enter'); + try { await mockServer.waitForCompletion(compBefore + modelTurns, 60_000); } catch { } + + const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`; + await page.waitForFunction( + (sel) => { + const responses = document.querySelectorAll(sel); + if (responses.length === 0) { return false; } + return !responses[responses.length - 1].classList.contains('chat-response-loading'); + }, + responseSelector, { timeout: 30_000 }, + ); + await new Promise(r => setTimeout(r, 500)); +} + +/** + * Run a full scenario: send the initial message, then 
handle any user + * follow-up turns for multi-turn scenarios. + * + * - Content-only scenarios: single message, 1 model turn. + * - Tool-call scenarios (no user turns): single message, N model turns + * (the extension automatically relays tool results back to the model). + * - Multi-turn with user turns: send initial message, wait for response, + * then for each user turn send the follow-up message and wait again. + * + * @param {import('playwright').Page} page + * @param {{ completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise }} mockServer + * @param {string} scenarioId + * @param {string} label - prefix for the message (e.g. "Warmup" or "Iteration 2") + */ +async function runScenario(page, mockServer, scenarioId, label) { + const userTurns = getUserTurns(scenarioId); + const totalModelTurns = getModelTurnCount(scenarioId); + + if (userTurns.length === 0) { + // Content-only or tool-call scenario: one message, wait for all model turns + await sendMessage(page, mockServer, `[scenario:${scenarioId}] ${label}`, totalModelTurns); + } else { + // Multi-turn with user follow-ups: send initial message and wait for + // the model turns before the first user turn, then alternate. + let modelTurnsSoFar = 0; + const firstUserAfter = userTurns[0].afterModelTurn; + const turnsBeforeFirstUser = firstUserAfter - modelTurnsSoFar; + await sendMessage(page, mockServer, `[scenario:${scenarioId}] ${label}`, turnsBeforeFirstUser); + modelTurnsSoFar = firstUserAfter; + + for (let u = 0; u < userTurns.length; u++) { + const nextModelStop = u + 1 < userTurns.length + ? 
userTurns[u + 1].afterModelTurn + : totalModelTurns; + const turnsUntilNext = nextModelStop - modelTurnsSoFar; + + // Send the user follow-up message + await sendMessage(page, mockServer, userTurns[u].message, turnsUntilNext); + modelTurnsSoFar = nextModelStop; + } + } +} + // -- Leak check -------------------------------------------------------------- /** * @param {string} electronPath * @param {{ url: string, requestCount: () => number, waitForRequests: (n: number, ms: number) => Promise, completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise }} mockServer - * @param {number} messageCount - * @param {boolean} verbose + * @param {{ iterations: number, verbose: boolean }} opts */ -async function runLeakCheck(electronPath, mockServer, messageCount, verbose) { +async function runLeakCheck(electronPath, mockServer, opts) { + const { iterations, verbose } = opts; const { userDataDir, extDir, logsDir } = prepareRunDir('leak-check', mockServer); const isDevBuild = !electronPath.includes('.vscode-test'); @@ -82,24 +255,21 @@ async function runLeakCheck(electronPath, mockServer, messageCount, verbose) { buildEnv(mockServer, { isDevBuild }), { verbose }, ); - const window = vscode.page; + const page = vscode.page; try { - await window.waitForSelector('.monaco-workbench', { timeout: 60_000 }); + await page.waitForSelector('.monaco-workbench', { timeout: 60_000 }); - const cdp = await window.context().newCDPSession(window); + const cdp = await page.context().newCDPSession(page); await cdp.send('HeapProfiler.enable'); - // Open chat + // Open chat panel const chatShortcut = process.platform === 'darwin' ? 
'Control+Meta+KeyI' : 'Control+Alt+KeyI'; - await window.keyboard.press(chatShortcut); - - const CHAT_VIEW = 'div[id="workbench.panel.chat"]'; - const chatEditorSel = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`; - await window.waitForSelector(CHAT_VIEW, { timeout: 15_000 }); - await window.waitForFunction( + await page.keyboard.press(chatShortcut); + await page.waitForSelector(CHAT_VIEW, { timeout: 15_000 }); + await page.waitForFunction( (sel) => Array.from(document.querySelectorAll(sel)).some(el => el.getBoundingClientRect().width > 0), - chatEditorSel, { timeout: 15_000 }, + CHAT_EDITOR_SEL, { timeout: 15_000 }, ); // Wait for extension activation @@ -107,85 +277,81 @@ async function runLeakCheck(electronPath, mockServer, messageCount, verbose) { try { await mockServer.waitForRequests(reqsBefore + 4, 30_000); } catch { } await new Promise(r => setTimeout(r, 3000)); - /** @type {number[]} */ - const heapSamples = []; - /** @type {number[]} */ - const domNodeSamples = []; - - for (let i = 0; i < messageCount; i++) { - // Force GC and measure - await cdp.send('HeapProfiler.collectGarbage'); - await new Promise(r => setTimeout(r, 200)); - const heapInfo = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); - const heapMB = Math.round(heapInfo.usedSize / 1024 / 1024 * 100) / 100; - const domNodes = await window.evaluate(() => document.querySelectorAll('*').length); - heapSamples.push(heapMB); - domNodeSamples.push(domNodes); + // --- Warmup iteration (not measured) --- + // Cycle through all scenarios once to settle one-time caches and lazy init + const scenarioIds = getScenarioIds(); + if (verbose) { + console.log(` [leak] Warmup: cycling through ${scenarioIds.length} scenarios to settle caches...`); + } + for (let m = 0; m < scenarioIds.length; m++) { + if (verbose) { + console.log(` [leak] warmup: ${scenarioIds[m]}`); + } + await runScenario(page, mockServer, scenarioIds[m], 'Warmup'); + } + await openNewChat(page); + await new 
Promise(r => setTimeout(r, 1000)); + + // --- Baseline measurement (fresh chat, post-warmup) --- + const baseline = await measure(cdp, page); + if (verbose) { + console.log(` [leak] Baseline (post-warmup): heap=${baseline.heapMB}MB, domNodes=${baseline.domNodes}`); + } + + /** @type {{ beforeHeapMB: number, afterHeapMB: number, deltaHeapMB: number, beforeDomNodes: number, afterDomNodes: number, deltaDomNodes: number }[]} */ + const iterationResults = []; + + for (let iter = 0; iter < iterations; iter++) { + // Measure at start of iteration (should be in "clean" state) + const before = await measure(cdp, page); if (verbose) { - console.log(` [leak] Message ${i + 1}/${messageCount}: heap=${heapMB}MB, domNodes=${domNodes}`); + console.log(` [leak] Iteration ${iter + 1}/${iterations}: start heap=${before.heapMB}MB, domNodes=${before.domNodes}`); } - // Focus and type - await window.click(chatEditorSel); - await new Promise(r => setTimeout(r, 200)); - - const inputSel = await window.evaluate((editorSel) => { - const ed = document.querySelector(editorSel); - if (!ed) { throw new Error('no editor'); } - return ed.querySelector('.native-edit-context') ? 
editorSel + ' .native-edit-context' : editorSel + ' textarea'; - }, chatEditorSel); - - const msg = `[scenario:text-only] Leak check message ${i + 1}`; - const hasDriver = await window.evaluate(() => - // @ts-ignore - !!globalThis.driver?.typeInEditor - ).catch(() => false); - - if (hasDriver) { - await window.evaluate(({ selector, text }) => { - // @ts-ignore - return globalThis.driver.typeInEditor(selector, text); - }, { selector: inputSel, text: msg }); - } else { - await window.click(inputSel); - await new Promise(r => setTimeout(r, 200)); - await window.locator(inputSel).pressSequentially(msg, { delay: 0 }); + // Do work: cycle through all scenarios + for (let m = 0; m < scenarioIds.length; m++) { + const sid = scenarioIds[m]; + await runScenario(page, mockServer, sid, `Iteration ${iter + 1}`); + if (verbose) { + console.log(` [leak] Sent ${sid} (${m + 1}/${scenarioIds.length})`); + } } - const compBefore = mockServer.completionCount(); - await window.keyboard.press('Enter'); - try { await mockServer.waitForCompletion(compBefore + 1, 30_000); } catch { } - - // Wait for response - const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`; - await window.waitForFunction( - (sel) => { - const responses = document.querySelectorAll(sel); - if (responses.length === 0) { return false; } - return !responses[responses.length - 1].classList.contains('chat-response-loading'); - }, - responseSelector, { timeout: 30_000 }, - ); - await new Promise(r => setTimeout(r, 500)); - } + // Return to clean state: open a new empty chat + await openNewChat(page); + await new Promise(r => setTimeout(r, 1000)); - // Final measurement - await cdp.send('HeapProfiler.collectGarbage'); - await new Promise(r => setTimeout(r, 200)); - const finalHeap = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); - heapSamples.push(Math.round(finalHeap.usedSize / 1024 / 1024 * 100) / 100); - domNodeSamples.push(await window.evaluate(() => 
document.querySelectorAll('*').length)); + // Measure after returning to clean state + const after = await measure(cdp, page); + const deltaHeapMB = Math.round((after.heapMB - before.heapMB) * 100) / 100; + const deltaDomNodes = after.domNodes - before.domNodes; - if (verbose) { - console.log(` [leak] Final: heap=${heapSamples[heapSamples.length - 1]}MB, domNodes=${domNodeSamples[domNodeSamples.length - 1]}`); + iterationResults.push({ + beforeHeapMB: before.heapMB, + afterHeapMB: after.heapMB, + deltaHeapMB, + beforeDomNodes: before.domNodes, + afterDomNodes: after.domNodes, + deltaDomNodes, + }); + + if (verbose) { + console.log(` [leak] Iteration ${iter + 1}/${iterations}: end heap=${after.heapMB}MB (delta=${deltaHeapMB}MB), domNodes=${after.domNodes} (delta=${deltaDomNodes})`); + } } + // Final measurement + const final = await measure(cdp, page); + const totalResidualMB = Math.round((final.heapMB - baseline.heapMB) * 100) / 100; + const totalResidualNodes = final.domNodes - baseline.domNodes; + return { - heapSamples, - domNodeSamples, - leakPerMessageMB: Math.round(linearRegressionSlope(heapSamples) * 100) / 100, - leakPerMessageNodes: Math.round(linearRegressionSlope(domNodeSamples)), + baseline, + final: { heapMB: final.heapMB, domNodes: final.domNodes }, + totalResidualMB, + totalResidualNodes, + iterations: iterationResults, }; } finally { await vscode.close(); @@ -208,21 +374,24 @@ async function main() { registerPerfScenarios(); const mockServer = await startServer(0); - console.log(`[chat-simulation] Leak check: ${opts.messages} messages, threshold ${opts.leakThresholdMB}MB/msg`); + console.log(`[chat-simulation] Leak check: ${opts.iterations} iterations × ${getScenarioIds().length} scenarios, threshold ${opts.leakThresholdMB}MB total`); console.log(`[chat-simulation] Build: ${electronPath}`); console.log(''); - const result = await runLeakCheck(electronPath, mockServer, opts.messages, opts.verbose); + const result = await runLeakCheck(electronPath, 
mockServer, opts); console.log('[chat-simulation] =================== Leak Check Results ==================='); console.log(''); - console.log(` Heap samples (MB): ${result.heapSamples.join(' → ')}`); - console.log(` DOM node samples: ${result.domNodeSamples.join(' → ')}`); + console.log(` Baseline (post-warmup): heap=${result.baseline.heapMB}MB, domNodes=${result.baseline.domNodes}`); + console.log(` Final: heap=${result.final.heapMB}MB, domNodes=${result.final.domNodes}`); + console.log(''); + for (let i = 0; i < result.iterations.length; i++) { + const it = result.iterations[i]; + console.log(` Iteration ${i + 1}: ${it.beforeHeapMB}MB → ${it.afterHeapMB}MB (residual: ${it.deltaHeapMB > 0 ? '+' : ''}${it.deltaHeapMB}MB, DOM: ${it.deltaDomNodes > 0 ? '+' : ''}${it.deltaDomNodes} nodes)`); + } console.log(''); - const totalHeapDelta = Math.round((result.heapSamples[result.heapSamples.length - 1] - result.heapSamples[0]) * 100) / 100; - console.log(` Heap growth: ${result.heapSamples[0]}MB → ${result.heapSamples[result.heapSamples.length - 1]}MB (delta${totalHeapDelta}MB total)`); - console.log(` Per-message heap growth: ${result.leakPerMessageMB}MB/msg`); - console.log(` Per-message DOM growth: ${result.leakPerMessageNodes} nodes/msg`); + console.log(` Total residual heap growth: ${result.totalResidualMB > 0 ? '+' : ''}${result.totalResidualMB}MB`); + console.log(` Total residual DOM growth: ${result.totalResidualNodes > 0 ? 
'+' : ''}${result.totalResidualNodes} nodes`); console.log(''); // Write JSON @@ -230,12 +399,12 @@ async function main() { fs.writeFileSync(jsonPath, JSON.stringify({ timestamp: new Date().toISOString(), ...result }, null, 2)); console.log(`[chat-simulation] Results written to ${jsonPath}`); - const leaked = result.leakPerMessageMB > opts.leakThresholdMB; + const leaked = result.totalResidualMB > opts.leakThresholdMB; console.log(''); if (leaked) { - console.log(`[chat-simulation] LEAK DETECTED — ${result.leakPerMessageMB}MB/msg exceeds ${opts.leakThresholdMB}MB/msg threshold`); + console.log(`[chat-simulation] LEAK DETECTED — ${result.totalResidualMB}MB residual exceeds ${opts.leakThresholdMB}MB threshold`); } else { - console.log(`[chat-simulation] No leak detected (${result.leakPerMessageMB}MB/msg < ${opts.leakThresholdMB}MB/msg)`); + console.log(`[chat-simulation] No leak detected (${result.totalResidualMB}MB residual < ${opts.leakThresholdMB}MB threshold)`); } await mockServer.close(); diff --git a/scripts/chat-simulation/test-chat-perf-regression.js b/scripts/chat-simulation/test-chat-perf-regression.js index 0a2cf6cbb6013..7cf88368711d4 100644 --- a/scripts/chat-simulation/test-chat-perf-regression.js +++ b/scripts/chat-simulation/test-chat-perf-regression.js @@ -27,6 +27,7 @@ const { DATA_DIR, METRIC_DEFS, loadConfig, resolveBuild, buildEnv, buildArgs, prepareRunDir, robustStats, welchTTest, summarize, markDuration, launchVSCode, + getNextExtHostInspectPort, connectToExtHostInspector, } = require('./common/utils'); const { getUserTurns, getScenarioIds } = require('./common/mock-llm-server'); const { registerPerfScenarios } = require('./common/perf-scenarios'); @@ -134,6 +135,12 @@ function parseArgs() { * profilePath: string, * tracePath: string, * snapshotPath: string, + * extHostHeapUsedBefore: number, + * extHostHeapUsedAfter: number, + * extHostHeapDelta: number, + * extHostHeapDeltaPostGC: number, + * extHostProfilePath: string, + * 
extHostSnapshotPath: string, * }} RunMetrics */ @@ -165,15 +172,22 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru const runDiagDir = path.join(runDir, `${role}-${buildLabel}`, runIndex.replace(/^baseline-/, '')); fs.mkdirSync(runDiagDir, { recursive: true }); + const extHostInspectPort = getNextExtHostInspectPort(); const vscode = await launchVSCode( electronPath, - buildArgs(userDataDir, extDir, logsDir, { isDevBuild }), + buildArgs(userDataDir, extDir, logsDir, { isDevBuild, extHostInspectPort }), buildEnv(mockServer, { isDevBuild }), { verbose }, ); activeVSCode = vscode; const window = vscode.page; + // Declared outside try so the finally block can clean up + /** @type {{ send: (method: string, params?: any) => Promise, on: (event: string, listener: (params: any) => void) => void, close: () => void } | null} */ + let extHostInspector = null; + /** @type {{ usedSize: number, totalSize: number } | null} */ + let extHostHeapBefore = null; + try { await window.waitForSelector('.monaco-workbench', { timeout: 60_000 }); @@ -223,6 +237,22 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru console.log(` [debug] Extension active (${mockServer.requestCount() - reqsBefore} new requests)`); } + // Connect to extension host inspector for profiling/heap data + try { + extHostInspector = await connectToExtHostInspector(extHostInspectPort, { verbose, timeoutMs: 15_000 }); + await extHostInspector.send('HeapProfiler.enable'); + await extHostInspector.send('Profiler.enable'); + await extHostInspector.send('Profiler.start'); + extHostHeapBefore = await extHostInspector.send('Runtime.getHeapUsage'); + if (verbose) { + console.log(` [ext-host] Heap before: ${Math.round(extHostHeapBefore.usedSize / 1024 / 1024)}MB`); + } + } catch (err) { + if (verbose) { + console.log(` [ext-host] Could not connect to inspector: ${err}`); + } + } + // Wait for model resolution await new Promise(r => setTimeout(r, 3000)); await 
dismissDialog(); @@ -503,6 +533,62 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru await cdp.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false }); fs.writeFileSync(snapshotPath, snapshotChunks.join('')); + // -- Extension host metrics ------------------------------------------ + let extHostHeapUsedBefore = -1; + let extHostHeapUsedAfter = -1; + let extHostHeapDelta = -1; + let extHostHeapDeltaPostGC = -1; + let extHostProfilePath = ''; + let extHostSnapshotPath = ''; + if (extHostInspector && extHostHeapBefore) { + try { + extHostHeapUsedBefore = Math.round(extHostHeapBefore.usedSize / 1024 / 1024); + + // Stop CPU profiler and save + const extProfile = await extHostInspector.send('Profiler.stop'); + extHostProfilePath = path.join(runDiagDir, 'exthost-profile.cpuprofile'); + fs.writeFileSync(extHostProfilePath, JSON.stringify(extProfile.profile)); + if (verbose) { + console.log(` [ext-host] CPU profile saved to ${extHostProfilePath}`); + } + + // Heap usage after interaction + const extHostHeapAfter = await extHostInspector.send('Runtime.getHeapUsage'); + extHostHeapUsedAfter = Math.round(extHostHeapAfter.usedSize / 1024 / 1024); + extHostHeapDelta = extHostHeapUsedAfter - extHostHeapUsedBefore; + + // Force GC and measure retained heap + try { + await extHostInspector.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true }); + await new Promise(r => setTimeout(r, 200)); + const extHostHeapPostGC = await extHostInspector.send('Runtime.getHeapUsage'); + extHostHeapDeltaPostGC = Math.round(extHostHeapPostGC.usedSize / 1024 / 1024) - extHostHeapUsedBefore; + } catch { + extHostHeapDeltaPostGC = -1; + } + + // Take ext host heap snapshot + extHostSnapshotPath = path.join(runDiagDir, 'exthost-heap.heapsnapshot'); + const extSnapshotChunks = /** @type {string[]} */ ([]); + extHostInspector.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => { + 
extSnapshotChunks.push(params.chunk); + }); + await extHostInspector.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false }); + fs.writeFileSync(extHostSnapshotPath, extSnapshotChunks.join('')); + + if (verbose) { + console.log(` [ext-host] Heap: before=${extHostHeapUsedBefore}MB, after=${extHostHeapUsedAfter}MB, delta=${extHostHeapDelta}MB, deltaPostGC=${extHostHeapDeltaPostGC}MB`); + console.log(` [ext-host] Snapshot saved to ${extHostSnapshotPath}`); + } + } catch (err) { + if (verbose) { + console.log(` [ext-host] Error collecting metrics: ${err}`); + } + } finally { + extHostInspector.close(); + } + } + // Parse timing — prefer internal code/chat/* marks (precise, in-process) // with client-side Date.now() as fallback for older builds without marks. const timeToUIUpdated = markDuration(chatMarks, 'request/start', 'request/uiUpdated'); @@ -576,8 +662,17 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru profilePath, tracePath, snapshotPath, + extHostHeapUsedBefore, + extHostHeapUsedAfter, + extHostHeapDelta, + extHostHeapDeltaPostGC, + extHostProfilePath, + extHostSnapshotPath, }; } finally { + if (extHostInspector) { + try { extHostInspector.close(); } catch { } + } activeVSCode = null; await vscode.close(); } @@ -585,6 +680,42 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru // -- CI summary generation --------------------------------------------------- +const GITHUB_REPO = 'https://github.com/microsoft/vscode'; + +/** + * Format a build identifier as a Markdown link when possible. + * - Commit SHAs link to the commit page. + * - Semver versions link to the release tag page. + * - Everything else (e.g. "baseline", "dev (local)") is returned as inline code. 
+ * @param {string} label + * @returns {string} + */ +function formatBuildLink(label) { + if (/^[0-9a-f]{7,40}$/.test(label)) { + const short = label.substring(0, 7); + return `[\`${short}\`](${GITHUB_REPO}/commit/${label})`; + } + if (/^\d+\.\d+\.\d+/.test(label)) { + return `[\`${label}\`](${GITHUB_REPO}/releases/tag/${label})`; + } + return `\`${label}\``; +} + +/** + * Build a GitHub compare link between two build identifiers, if both are + * commit-like or version-like references. Returns empty string otherwise. + * @param {string} base + * @param {string} test + * @returns {string} + */ +function formatCompareLink(base, test) { + const isRef = (/** @type {string} */ v) => /^[0-9a-f]{7,40}$/.test(v) || /^\d+\.\d+\.\d+/.test(v); + if (!isRef(base) || !isRef(test)) { + return ''; + } + return `[compare](${GITHUB_REPO}/compare/${base}...${test})`; +} + /** * Generate a detailed Markdown summary table for CI. * Printed to stdout and written to ci-summary.md. @@ -596,6 +727,9 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru function generateCISummary(jsonReport, baseline, opts) { const baseLabel = opts.baselineBuild || 'baseline'; const testLabel = opts.build || 'dev (local)'; + const baseLink = formatBuildLink(baseLabel); + const testLink = formatBuildLink(testLabel); + const compareLink = formatCompareLink(baseLabel, testLabel); const allMetrics = [ ['timeToFirstToken', 'timing', 'ms'], ['timeToComplete', 'timing', 'ms'], @@ -611,6 +745,8 @@ function generateCISummary(jsonReport, baseline, opts) { ['heapDelta', 'memory', 'MB'], ['heapDeltaPostGC', 'memory', 'MB'], ['gcDurationMs', 'memory', 'ms'], + ['extHostHeapDelta', 'extHost', 'MB'], + ['extHostHeapDeltaPostGC', 'extHost', 'MB'], ]; const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'layoutCount', 'recalcStyleCount', 'forcedReflowCount', 'longTaskCount', 'longAnimationFrameCount']); @@ -621,8 +757,11 @@ function generateCISummary(jsonReport, 
baseline, opts) { lines.push(''); lines.push(`| | |`); lines.push(`|---|---|`); - lines.push(`| **Baseline** | \`${baseLabel}\` |`); - lines.push(`| **Test** | \`${testLabel}\` |`); + lines.push(`| **Baseline** | ${baseLink} |`); + lines.push(`| **Test** | ${testLink} |`); + if (compareLink) { + lines.push(`| **Diff** | ${compareLink} |`); + } lines.push(`| **Runs per scenario** | ${opts.runs} |`); lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`); lines.push(`| **Scenarios** | ${scenarios.length} |`); @@ -856,12 +995,12 @@ async function main() { } // Recompute stats with merged data - const sd = /** @type {any} */ ({ runs: prevTestRuns.length, timing: {}, memory: {}, rendering: {}, rawRuns: prevTestRuns }); + const sd = /** @type {any} */ ({ runs: prevTestRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevTestRuns }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(prevTestRuns.map((/** @type {any} */ r) => r[metric])); } prevResults.scenarios[scenario] = sd; if (prevBaseline?.scenarios?.[scenario]) { - const bsd = /** @type {any} */ ({ runs: prevBaseRuns.length, timing: {}, memory: {}, rendering: {}, rawRuns: prevBaseRuns }); + const bsd = /** @type {any} */ ({ runs: prevBaseRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevBaseRuns }); for (const [metric, group] of METRIC_DEFS) { bsd[group][metric] = robustStats(prevBaseRuns.map((/** @type {any} */ r) => r[metric])); } prevBaseline.scenarios[scenario] = bsd; } @@ -954,7 +1093,7 @@ async function main() { } const allRuns = [...existingRuns, ...newResults]; if (allRuns.length > 0) { - const sd = /** @type {any} */ ({ runs: allRuns.length, timing: {}, memory: {}, rendering: {}, rawRuns: allRuns }); + const sd = /** @type {any} */ ({ runs: allRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: allRuns }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = 
robustStats(allRuns.map((/** @type {any} */ r) => r[metric])); } cachedBaseline.scenarios[scenario] = sd; } @@ -986,7 +1125,7 @@ async function main() { scenarios: /** @type {Record} */ ({}), }; for (const [scenario, results] of Object.entries(baselineResults)) { - const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, rawRuns: results }); + const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } baselineReport.scenarios[scenario] = sd; } @@ -1053,13 +1192,21 @@ async function main() { console.log(summarize(results.map(r => r.heapDelta), ' Heap delta ', 'MB')); console.log(summarize(results.map(r => r.heapDeltaPostGC), ' Heap delta (post-GC) ', 'MB')); console.log(summarize(results.map(r => r.gcDurationMs), ' GC duration ', 'ms')); + if (results.some(r => r.extHostHeapDelta >= 0)) { + console.log(''); + console.log(' Extension Host:'); + console.log(summarize(results.map(r => r.extHostHeapUsedBefore), ' Heap before ', 'MB')); + console.log(summarize(results.map(r => r.extHostHeapUsedAfter), ' Heap after ', 'MB')); + console.log(summarize(results.map(r => r.extHostHeapDelta), ' Heap delta ', 'MB')); + console.log(summarize(results.map(r => r.extHostHeapDeltaPostGC), ' Heap delta (post-GC) ', 'MB')); + } } // -- JSON output ----------------------------------------------------- const jsonPath = path.join(runDir, 'results.json'); const jsonReport = /** @type {{ timestamp: string, platform: NodeJS.Platform, runsPerScenario: number, scenarios: Record, _resultsPath?: string }} */ ({ timestamp: new Date().toISOString(), platform: process.platform, runsPerScenario: opts.runs, scenarios: /** @type {Record} */ ({}) }); for (const [scenario, results] of Object.entries(allResults)) { - const sd = /** @type {any} */ ({ runs: results.length, 
timing: {}, memory: {}, rendering: {}, rawRuns: results }); + const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } jsonReport.scenarios[scenario] = sd; } @@ -1111,6 +1258,8 @@ async function printComparison(jsonReport, opts) { const infoMetrics = [ ['heapDelta', 'memory', 'MB'], ['gcDurationMs', 'memory', 'ms'], + ['extHostHeapDelta', 'extHost', 'MB'], + ['extHostHeapDeltaPostGC', 'extHost', 'MB'], ]; for (const scenario of Object.keys(jsonReport.scenarios)) { From e756e47102698686b1e4ce39a14c2d454a79d08c Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Wed, 15 Apr 2026 00:12:38 -0700 Subject: [PATCH 12/13] PR --- .github/workflows/chat-perf.yml | 4 ++++ scripts/chat-simulation/common/perf-scenarios.js | 15 ++++++--------- scripts/chat-simulation/common/utils.js | 2 -- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/.github/workflows/chat-perf.yml b/.github/workflows/chat-perf.yml index 307b0cfd23b65..2721af3c042d2 100644 --- a/.github/workflows/chat-perf.yml +++ b/.github/workflows/chat-perf.yml @@ -86,6 +86,10 @@ jobs: - name: Transpile source run: npm run transpile-client + - name: Build copilot extension + run: npm run compile + working-directory: extensions/copilot + - name: Download Electron run: node build/lib/preLaunch.ts diff --git a/scripts/chat-simulation/common/perf-scenarios.js b/scripts/chat-simulation/common/perf-scenarios.js index 721a3fb5c6514..390c675063828 100644 --- a/scripts/chat-simulation/common/perf-scenarios.js +++ b/scripts/chat-simulation/common/perf-scenarios.js @@ -316,21 +316,19 @@ const TOOL_CALL_SCENARIOS = { kind: 'tool-calls', toolCalls: [ { - toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, + toolNamePattern: /insert.?edit|replace.?string|apply.?patch/i, arguments: { filePath: path.join(FIXTURES_DIR, 
'lifecycle.ts'), - oldString: '// perf-benchmark-marker', - newString: '// perf-benchmark-marker (updated)', explanation: 'Update the benchmark marker comment in lifecycle.ts', + code: '// perf-benchmark-marker (updated)', }, }, { - toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, + toolNamePattern: /insert.?edit|replace.?string|apply.?patch/i, arguments: { filePath: path.join(FIXTURES_DIR, 'event.ts'), - oldString: '// perf-benchmark-marker', - newString: '// perf-benchmark-marker (updated)', explanation: 'Update the benchmark marker comment in event.ts', + code: '// perf-benchmark-marker (updated)', }, }, ], @@ -411,12 +409,11 @@ const TOOL_CALL_SCENARIOS = { kind: 'tool-calls', toolCalls: [ { - toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i, + toolNamePattern: /insert.?edit|replace.?string|apply.?patch/i, arguments: { filePath: path.join(FIXTURES_DIR, 'lifecycle.ts'), - oldString: '// perf-benchmark-marker', - newString: '// perf-benchmark-marker (fixed)', explanation: 'Fix the dispose call in the test', + code: '// perf-benchmark-marker (fixed)', }, }, ], diff --git a/scripts/chat-simulation/common/utils.js b/scripts/chat-simulation/common/utils.js index 429154dc04f6a..d2e3a26d15d96 100644 --- a/scripts/chat-simulation/common/utils.js +++ b/scripts/chat-simulation/common/utils.js @@ -235,8 +235,6 @@ function writeSettings(userDataDir, mockServer) { 'github.copilot.advanced.debug.overrideProxyUrl': mockServer.url, 'github.copilot.advanced.debug.overrideCapiUrl': mockServer.url, 'chat.allowAnonymousAccess': true, - // Start new chat sessions in agent mode so tools are available. - 'chat.newSession.defaultMode': 'agent', // Disable MCP servers — they start async and add unpredictable // delay that pollutes perf measurements. 
'chat.mcp.discovery.enabled': false, From 3f6aac34c0bde9ef8f1abb2de4cfe5109b1edda4 Mon Sep 17 00:00:00 2001 From: Paul Wang Date: Wed, 15 Apr 2026 09:00:26 -0700 Subject: [PATCH 13/13] update --- .github/workflows/chat-perf.yml | 12 ++++++--- .../test-chat-perf-regression.js | 25 +++++++++++-------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/.github/workflows/chat-perf.yml b/.github/workflows/chat-perf.yml index 2721af3c042d2..b92b5117c49b4 100644 --- a/.github/workflows/chat-perf.yml +++ b/.github/workflows/chat-perf.yml @@ -32,6 +32,11 @@ on: required: false type: number default: 0.2 + skip_leak_check: + description: 'Skip the memory leak check step' + required: false + type: boolean + default: true permissions: contents: read @@ -132,6 +137,7 @@ jobs: - name: Run memory leak check id: leak + if: inputs.skip_leak_check != true run: | LEAK_ARGS="--verbose" if [[ -n "$TEST_COMMIT" ]]; then @@ -154,7 +160,7 @@ jobs: echo "⚠️ No summary file generated. Check perf-output.log artifact." >> "$GITHUB_STEP_SUMMARY" fi - if [[ -f .chat-simulation-data/chat-simulation-leak-results.json ]]; then + if [[ "${{ inputs.skip_leak_check }}" != "true" && -f .chat-simulation-data/chat-simulation-leak-results.json ]]; then echo "" >> "$GITHUB_STEP_SUMMARY" echo "## Memory Leak Check" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" @@ -192,12 +198,12 @@ jobs: retention-days: 30 - name: Fail on regression - if: steps.perf.outcome == 'failure' || steps.leak.outcome == 'failure' + if: steps.perf.outcome == 'failure' || (inputs.skip_leak_check != true && steps.leak.outcome == 'failure') run: | if [[ "${{ steps.perf.outcome }}" == "failure" ]]; then echo "::error::Chat performance regression detected. See job summary for details." fi - if [[ "${{ steps.leak.outcome }}" == "failure" ]]; then + if [[ "${{ inputs.skip_leak_check }}" != "true" && "${{ steps.leak.outcome }}" == "failure" ]]; then echo "::error::Chat memory leak detected. 
See leak-output.log for details." fi exit 1 diff --git a/scripts/chat-simulation/test-chat-perf-regression.js b/scripts/chat-simulation/test-chat-perf-regression.js index 7cf88368711d4..af2b968cba8ea 100644 --- a/scripts/chat-simulation/test-chat-perf-regression.js +++ b/scripts/chat-simulation/test-chat-perf-regression.js @@ -244,7 +244,7 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru await extHostInspector.send('Profiler.enable'); await extHostInspector.send('Profiler.start'); extHostHeapBefore = await extHostInspector.send('Runtime.getHeapUsage'); - if (verbose) { + if (verbose && extHostHeapBefore) { console.log(` [ext-host] Heap before: ${Math.round(extHostHeapBefore.usedSize / 1024 / 1024)}MB`); } } catch (err) { @@ -417,11 +417,6 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru await window.locator(actualInputSelector).pressSequentially(userTurn.message, { delay: 0 }); } - // Note current response count before submitting - const responseCountBefore = await window.evaluate((sel) => { - return document.querySelectorAll(sel).length; - }, responseSelector); - // Submit follow-up const utCompBefore = mockServer.completionCount(); await window.keyboard.press('Enter'); @@ -429,15 +424,25 @@ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, ru // Wait for mock server to serve the response for this turn try { await mockServer.waitForCompletion(utCompBefore + 1, 60_000); } catch { } - // Wait for a new response element to appear and settle + // Wait for the new response to finish rendering. + // The chat list is virtualized — old response elements are + // recycled out of the DOM as new ones appear, so we cannot + // rely on counting DOM elements. Instead, scroll to the + // bottom and wait for no response to be in loading state. 
await dismissDialog(); + await window.evaluate((chatViewSel) => { + const input = document.querySelector(chatViewSel + ' .interactive-input-part'); + if (input) { input.scrollIntoView({ block: 'end' }); } + }, CHAT_VIEW); + await new Promise(r => setTimeout(r, 200)); + await window.waitForFunction( - ({ sel, prevCount }) => { + (sel) => { const responses = document.querySelectorAll(sel); - if (responses.length <= prevCount) { return false; } + if (responses.length === 0) { return false; } return !responses[responses.length - 1].classList.contains('chat-response-loading'); }, - { sel: responseSelector, prevCount: responseCountBefore }, + responseSelector, { timeout: 30_000 }, ); responseCompleteTime = Date.now();