From 5bb73b5e5cb9f82b2ba4a17bc9c09d2485babeac Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 10:55:42 +0200 Subject: [PATCH 01/42] evals: add types.ts with Dataset, Example, EvalResult and related types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the canonical TypeScript type definitions for the eval pipeline: - `ToolCall` / `Trajectory` — MCP host loop output primitives - `ExpectedBehavior` — optional `tools`, `criteria`, `skill` fields (evaluators return `'N/A'` when a field they need is absent) - `Example` / `Dataset` — test-case and collection shapes - `EvaluatorResult` / `EvalResult` — per-evaluator and per-example results - `Evaluator` — async-compatible function contract all evaluator modules satisfy Also adds `evals/**/*` to tsconfig.json includes so tsc covers eval files. Co-Authored-By: Claude Sonnet 4.6 --- evals/types.ts | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ tsconfig.json | 2 +- 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 evals/types.ts diff --git a/evals/types.ts b/evals/types.ts new file mode 100644 index 0000000..4722075 --- /dev/null +++ b/evals/types.ts @@ -0,0 +1,75 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** A single tool invocation captured during an MCP host loop run. */ +export interface ToolCall { + tool: string; + args: Record; + result?: unknown; +} + +/** Ordered sequence of tool calls produced by one eval run. */ +export type Trajectory = ToolCall[]; + +/** + * What a passing run should look like. 
+ * `tools`, `criteria`, and `skill` are all optional — evaluators that depend on them + * return `'N/A'` when the field is absent, so a dataset can omit whichever + * dimension is irrelevant for a given example. + */ +export interface ExpectedBehavior { + /** Ordered list of tool names the host should call. Used by trajectory / tool-selection evaluators. */ + tools?: string[]; + /** Natural-language assertions checked by the criteria (LLM-as-judge) evaluator. */ + criteria?: string[]; + /** Skill ID that should be activated. Used by the skill-activation evaluator. */ + skill?: string; +} + +/** One test case inside a dataset. */ +export interface Example { + /** Stable identifier — used as a key in result tables and CI summaries. */ + id: string; + /** The user message sent to the LLM host at the start of the simulation. */ + input: string; + expected: ExpectedBehavior; +} + +/** A named collection of examples that can be loaded by the runner. */ +export interface Dataset { + name: string; + examples: Example[]; +} + +/** + * Output of a single evaluator for one example. + * `score` is a value in [0, 1] when the evaluator ran, or `'N/A'` when the + * evaluator skipped (e.g. `expected.tools` was absent for trajectory evaluator). + */ +export interface EvaluatorResult { + score: number | 'N/A'; + /** Human-readable explanation of the score; expected (but not type-enforced) when score is numeric. */ + reason?: string; +} + +/** Aggregate result for one example after all evaluators have run. */ +export interface EvalResult { + exampleId: string; + input: string; + trajectory: Trajectory; + /** Keys are evaluator names (e.g. `'skill-activation'`, `'trajectory'`). */ + evaluators: Record; +} + +/** + * Contract every evaluator module must satisfy. + * May be sync or async; async accommodates LLM-as-judge evaluators that call an LLM provider. 
+ */ +export type Evaluator = ( + trajectory: Trajectory, + expected: ExpectedBehavior +) => EvaluatorResult | Promise; diff --git a/tsconfig.json b/tsconfig.json index 23b7968..5dc2901 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -14,6 +14,6 @@ "lib": ["ES2022", "DOM", "DOM.Iterable"], "types": ["vitest/globals", "@testing-library/jest-dom"] }, - "include": ["src/**/*", "main.ts", "vite.config.ts", "vitest.config.ts", "scripts/**/*"], + "include": ["src/**/*", "evals/**/*", "main.ts", "vite.config.ts", "vitest.config.ts", "scripts/**/*"], "exclude": ["node_modules", "dist"] } From 06d830cee537349bf9be1b5f24188a83b9480668 Mon Sep 17 00:00:00 2001 From: patryks-treadmill Date: Fri, 15 May 2026 10:55:59 +0200 Subject: [PATCH 02/42] ao(create-evals-types-ts-with-typescript-definitions--0): Create `evals/types.ts` with TypeScript definitions for `Dataset`, `Exam Auto-committed by patryks-treadmill orchestrator. plan=automatic-migration-mcp-app job=64319163-2da8-44b5-b087-3dee6e9e4c14 attempt=1 --- package-lock.json | 8 -------- 1 file changed, 8 deletions(-) diff --git a/package-lock.json b/package-lock.json index 08e9dde..d34696e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -5750,14 +5750,6 @@ "node": ">= 18" } }, - "node_modules/monaco-promql": { - "version": "1.8.0", - "resolved": "https://registry.npmjs.org/monaco-promql/-/monaco-promql-1.8.0.tgz", - "integrity": "sha512-XdgRojBzEe/rKtrJaHbSfoMFOMD5TXymDHIitTngmBT6XEjtAirnA7Rb2YJAO1SZrJfgvAo4LFCzJ71fH7+WOw==", - "license": "MIT", - "optional": true, - "peer": true - }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", From b3ad86e8c0ee4455c46237788006f9ec0b6a3ed6 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 11:02:10 +0200 Subject: [PATCH 03/42] evals: add runner.ts orchestrator, runMcpHostLoop stub, and eval vitest config runner.ts exports `runDataset(dataset, evaluators, options?)` which: - Wraps all examples in 
`describe.skipIf(!process.env.RUN_LLM_EVALS)` so regular `npm test` never makes LLM calls or requires API keys - Creates one `it` per example: runs runMcpHostLoop, scores via evaluators, asserts numeric scores >= passingScore (default 0.5) - Emits a Markdown table summary via afterAll for CI job summaries runMcpHostLoop.ts is a typed stub (throws); full InMemoryTransport implementation comes in the next commit. evals/vitest.config.ts runs in node environment with 120 s timeout, scoped to evals/**/*.{test,spec,eval}.ts and *.dataset.ts patterns. Also: - Adds `test:evals` script to package.json (cross-env RUN_LLM_EVALS=1) - Adds evals/**/*.ts to eslint.config.js file patterns so eval files are linted and license-header-checked Co-Authored-By: Claude Sonnet 4.6 --- eslint.config.js | 1 + evals/runMcpHostLoop.ts | 19 +++++++ evals/runner.ts | 112 ++++++++++++++++++++++++++++++++++++++++ evals/vitest.config.ts | 24 +++++++++ package.json | 1 + 5 files changed, 157 insertions(+) create mode 100644 evals/runMcpHostLoop.ts create mode 100644 evals/runner.ts create mode 100644 evals/vitest.config.ts diff --git a/eslint.config.js b/eslint.config.js index 382ca72..cde436c 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -16,6 +16,7 @@ export default tseslint.config( files: [ 'src/**/*.ts', 'src/**/*.tsx', + 'evals/**/*.ts', '*.ts', 'scripts/**/*.js', '*.mjs', diff --git a/evals/runMcpHostLoop.ts b/evals/runMcpHostLoop.ts new file mode 100644 index 0000000..be80a6d --- /dev/null +++ b/evals/runMcpHostLoop.ts @@ -0,0 +1,19 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Trajectory } from "./types.js"; + +/** + * Simulates one MCP host loop turn using the SDK's InMemoryTransport and + * returns the ordered sequence of tool calls the LLM made. 
+ * + * Full implementation lands in the next commit; this stub satisfies the + * import so runner.ts type-checks now. + */ +export async function runMcpHostLoop(_input: string): Promise { + throw new Error("runMcpHostLoop is not yet implemented"); +} diff --git a/evals/runner.ts b/evals/runner.ts new file mode 100644 index 0000000..50035ca --- /dev/null +++ b/evals/runner.ts @@ -0,0 +1,112 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { describe, it, expect, afterAll } from "vitest"; +import type { Dataset, EvalResult, EvaluatorResult, Evaluator } from "./types.js"; +import { runMcpHostLoop } from "./runMcpHostLoop.js"; + +export interface RunnerOptions { + /** Minimum numeric score [0–1] for a test to pass. Defaults to 0.5. */ + passingScore?: number; +} + +/** + * Registers a Vitest suite for every example in `dataset`. + * + * The entire suite is skipped unless `RUN_LLM_EVALS=1` is set in the + * environment, so regular `npm test` incurs zero LLM cost. + * + * Each example becomes one `it` that: + * 1. Runs the in-process MCP host loop to collect a trajectory. + * 2. Passes the trajectory to every evaluator. + * 3. Asserts that numeric scores meet `passingScore`. + * + * After all examples complete, a Markdown summary is written to stdout so + * the GitHub Actions job summary (>> $GITHUB_STEP_SUMMARY) can capture it. 
+ */ +export function runDataset( + dataset: Dataset, + evaluators: Record, + options: RunnerOptions = {} +): void { + const { passingScore = 0.5 } = options; + + describe.skipIf(!process.env.RUN_LLM_EVALS)(dataset.name, () => { + const results: EvalResult[] = []; + + for (const example of dataset.examples) { + it(example.id, async () => { + const trajectory = await runMcpHostLoop(example.input); + + const evalResults: Record = {}; + for (const [name, evaluator] of Object.entries(evaluators)) { + evalResults[name] = await evaluator(trajectory, example.expected); + } + + const result: EvalResult = { + exampleId: example.id, + input: example.input, + trajectory, + evaluators: evalResults, + }; + results.push(result); + + for (const [name, evalResult] of Object.entries(evalResults)) { + if (evalResult.score !== "N/A") { + expect( + evalResult.score, + `[${name}] score ${evalResult.score.toFixed(2)} < ${passingScore}` + + (evalResult.reason ? `: ${evalResult.reason}` : "") + ).toBeGreaterThanOrEqual(passingScore); + } + } + }); + } + + afterAll(() => { + process.stdout.write(buildMarkdownSummary(dataset.name, results) + "\n"); + }); + }); +} + +function buildMarkdownSummary(datasetName: string, results: EvalResult[]): string { + if (results.length === 0) { + return `## Eval results: ${datasetName}\n\n_No examples ran._\n`; + } + + const evaluatorNames = Array.from( + new Set(results.flatMap((r) => Object.keys(r.evaluators))) + ); + + const headers = ["id", "input", ...evaluatorNames]; + const separator = headers.map(() => "---"); + + const rows = results.map((r) => { + const scoreCells = evaluatorNames.map((name) => { + const e = r.evaluators[name]; + if (!e) return "—"; + if (e.score === "N/A") return "N/A"; + return `${(e.score * 100).toFixed(0)}%`; + }); + return [r.exampleId, truncate(r.input, 60), ...scoreCells]; + }); + + const lines = [ + `## Eval results: ${datasetName}`, + "", + `| ${headers.join(" | ")} |`, + `| ${separator.join(" | ")} |`, + 
...rows.map((row) => `| ${row.join(" | ")} |`), + "", + ]; + + return lines.join("\n"); +} + +function truncate(s: string, maxLen: number): string { + return s.length <= maxLen ? s : `${s.slice(0, maxLen - 1)}…`; +} diff --git a/evals/vitest.config.ts b/evals/vitest.config.ts new file mode 100644 index 0000000..7d2f4d5 --- /dev/null +++ b/evals/vitest.config.ts @@ -0,0 +1,24 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { defineConfig } from "vitest/config"; + +/** + * Vitest config for LLM eval suites. Intentionally separate from the main + * vitest.config.ts so `npm test` never picks up eval files (and thus never + * makes LLM calls or requires API keys in a regular dev/CI run). + * + * Run via: npm run test:evals + */ +export default defineConfig({ + test: { + environment: "node", + globals: true, + include: ["evals/**/*.{test,spec,eval}.ts", "evals/datasets/**/*.dataset.ts"], + testTimeout: 120_000, + }, +}); diff --git a/package.json b/package.json index 983e3ca..043ee51 100644 --- a/package.json +++ b/package.json @@ -47,6 +47,7 @@ "test": "vitest", "test:run": "vitest run", "test:coverage": "vitest run --coverage", + "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run --config evals/vitest.config.ts", "prepublishOnly": "npm run build", "prepare": "husky", "version": "node -e \"const m=JSON.parse(require('fs').readFileSync('manifest.json','utf8'));m.version=require('./package.json').version;require('fs').writeFileSync('manifest.json',JSON.stringify(m,null,2)+'\\n')\" && git add manifest.json" From 21b3030ea8ada62dc5efc067e617293c41ba5112 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 11:21:13 +0200 Subject: [PATCH 04/42] evals: implement runMcpHostLoop with InMemoryTransport and LLM provider types MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runMcpHostLoop wires an MCP Client to the server via InMemoryTransport (in-process, no network), lists available tools, and drives a loop of up to MAX_TURNS=8 turns: LLM → tool calls → client.callTool() → result fed back → repeat Options allow callers to inject a pre-built McpServer (for mocked-service datasets) or a custom LlmProvider (for deterministic tests). Both default to the real implementations when omitted. evals/llm/types.ts introduces the LlmProvider interface and LlmMessage discriminated union (OpenAI-style, compatible with LiteLLM proxies). evals/llm/index.ts exposes createDefaultLlmProvider(), which auto-selects by env var (ANTHROPIC_API_KEY first, then OPENAI_API_KEY); the concrete adapters (anthropic.ts / openai.ts) land in the next commit — this stub surfaces a clear error until they do. Co-Authored-By: Claude Sonnet 4.6 --- evals/llm/index.ts | 38 ++++++++++++ evals/llm/types.ts | 54 +++++++++++++++++ evals/runMcpHostLoop.ts | 130 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 215 insertions(+), 7 deletions(-) create mode 100644 evals/llm/index.ts create mode 100644 evals/llm/types.ts diff --git a/evals/llm/index.ts b/evals/llm/index.ts new file mode 100644 index 0000000..d3c254b --- /dev/null +++ b/evals/llm/index.ts @@ -0,0 +1,38 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { LlmProvider } from "./types.js"; + +/** + * Returns the default LLM provider by inspecting environment variables. 
+ * + * Priority: ANTHROPIC_API_KEY → Anthropic (claude-haiku-4-5) + * OPENAI_API_KEY → OpenAI / LiteLLM proxy (gpt-4o-mini) + * + * The concrete adapters (evals/llm/anthropic.ts, evals/llm/openai.ts) are + * implemented in the next commit; this stub ensures runMcpHostLoop.ts + * type-checks now and surfaces a clear error at runtime when evals are run + * before the adapters land. + */ +export function createDefaultLlmProvider(): LlmProvider { + if (process.env.ANTHROPIC_API_KEY) { + throw new Error( + "Anthropic LLM adapter not yet implemented (evals/llm/anthropic.ts). " + + "It will land in the next commit." + ); + } + if (process.env.OPENAI_API_KEY) { + throw new Error( + "OpenAI LLM adapter not yet implemented (evals/llm/openai.ts). " + + "It will land in the next commit." + ); + } + throw new Error( + "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " + + "before running evals (npm run test:evals)." + ); +} diff --git a/evals/llm/types.ts b/evals/llm/types.ts new file mode 100644 index 0000000..b5fef9b --- /dev/null +++ b/evals/llm/types.ts @@ -0,0 +1,54 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** A single tool the LLM may call, described in JSON Schema. */ +export interface LlmToolDefinition { + name: string; + description: string; + /** JSON Schema object describing the tool's input parameters. */ + parameters: Record; +} + +/** One tool invocation requested by the LLM in an assistant turn. */ +export interface LlmToolCallRequest { + id: string; + type: "function"; + function: { + name: string; + /** JSON-encoded argument object. */ + arguments: string; + }; +} + +/** + * Discriminated union covering every role that can appear in a chat thread. 
+ * Shaped after the OpenAI chat messages API so a single interface works for + * both the OpenAI and Anthropic adapters (and any LiteLLM proxy in between). + */ +export type LlmMessage = + | { role: "user"; content: string } + | { + role: "assistant"; + content: string | null; + tool_calls?: LlmToolCallRequest[]; + } + | { role: "tool"; content: string; tool_call_id: string }; + +/** Narrowed assistant message — what LlmProvider.chat() must return. */ +export type AssistantMessage = Extract; + +/** + * Minimal provider contract every LLM adapter must satisfy. + * The interface is intentionally thin: give it a message history + tool + * catalogue, get back the next assistant turn (possibly with tool calls). + */ +export interface LlmProvider { + chat( + messages: LlmMessage[], + tools: LlmToolDefinition[] + ): Promise; +} diff --git a/evals/runMcpHostLoop.ts b/evals/runMcpHostLoop.ts index be80a6d..d6a732c 100644 --- a/evals/runMcpHostLoop.ts +++ b/evals/runMcpHostLoop.ts @@ -5,15 +5,131 @@ * 2.0. */ -import type { Trajectory } from "./types.js"; +import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js"; +import { Client } from "@modelcontextprotocol/sdk/client/index.js"; +import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { createServer } from "../src/server.js"; +import type { Trajectory, ToolCall } from "./types.js"; +import type { LlmProvider, LlmMessage } from "./llm/types.js"; +import { createDefaultLlmProvider } from "./llm/index.js"; + +/** Maximum LLM → tool-call turns before halting to prevent runaway evals. */ +const MAX_TURNS = 8; + +export interface HostLoopOptions { + /** + * Pre-built MCP server to test against. + * + * Pass a server constructed with mocked services for dataset-level evals + * that don't need a live cluster. Omit to use `createServer()`, which reads + * CLUSTERS_JSON / CLUSTERS_FILE and requires a real Elastic cluster. 
+ * + * Each call to `runMcpHostLoop` should receive a **fresh** server instance; + * reusing a connected server across calls is not supported. + */ + server?: McpServer; + /** + * LLM provider used to simulate the MCP host making tool-call decisions. + * Defaults to auto-selecting from ANTHROPIC_API_KEY / OPENAI_API_KEY. + */ + llm?: LlmProvider; + /** + * Maximum number of LLM→tool-call turns per run. + * Defaults to MAX_TURNS (8). + */ + maxTurns?: number; +} /** - * Simulates one MCP host loop turn using the SDK's InMemoryTransport and - * returns the ordered sequence of tool calls the LLM made. + * Simulates one MCP host loop run entirely in-process. + * + * Architecture: + * LLM ↔ Client ↔─InMemoryTransport─↔ McpServer ↔ (ES / Kibana clients) * - * Full implementation lands in the next commit; this stub satisfies the - * import so runner.ts type-checks now. + * The function: + * 1. Wires a fresh Client to the server via InMemoryTransport. + * 2. Lists available MCP tools and hands them to the LLM as tool definitions. + * 3. Loops up to `maxTurns` times: + * a. Asks the LLM for the next assistant turn. + * b. If the LLM emits tool calls, executes each via client.callTool(). + * c. Records every call in the trajectory. + * d. Feeds results back into the message history. + * e. Breaks when the LLM emits no tool calls (task complete). + * 4. Closes the client and returns the trajectory. */ -export async function runMcpHostLoop(_input: string): Promise { - throw new Error("runMcpHostLoop is not yet implemented"); +export async function runMcpHostLoop( + input: string, + { server, llm, maxTurns = MAX_TURNS }: HostLoopOptions = {} +): Promise { + const resolvedServer = server ?? createServer(); + const resolvedLlm = llm ?? 
createDefaultLlmProvider(); + + const [clientTransport, serverTransport] = InMemoryTransport.createLinkedPair(); + await resolvedServer.connect(serverTransport); + + const client = new Client({ name: "eval-host", version: "1.0.0" }); + await client.connect(clientTransport); + + try { + const { tools: mcpTools } = await client.listTools(); + const toolDefs = mcpTools.map((t) => ({ + name: t.name, + description: t.description ?? "", + parameters: t.inputSchema as Record, + })); + + const messages: LlmMessage[] = [{ role: "user", content: input }]; + const trajectory: Trajectory = []; + + for (let turn = 0; turn < maxTurns; turn++) { + const response = await resolvedLlm.chat(messages, toolDefs); + messages.push(response); + + if (!response.tool_calls || response.tool_calls.length === 0) { + // LLM chose not to call a tool — simulation complete. + break; + } + + for (const toolCall of response.tool_calls) { + const toolName = toolCall.function.name; + let toolArgs: Record; + try { + toolArgs = JSON.parse(toolCall.function.arguments) as Record< + string, + unknown + >; + } catch { + // Malformed JSON from the LLM; record the call with empty args + // so the trajectory evaluator can detect the failure. + toolArgs = {}; + } + + const result = await client.callTool({ + name: toolName, + arguments: toolArgs, + }); + + const record: ToolCall = { + tool: toolName, + args: toolArgs, + result: result.content, + }; + trajectory.push(record); + + // Feed the tool result back so the LLM can reason about it. + messages.push({ + role: "tool", + content: JSON.stringify(result.content), + tool_call_id: toolCall.id, + }); + } + } + + return trajectory; + } finally { + // Closing the client also closes clientTransport, which triggers + // serverTransport.onclose() — the InMemoryTransport linked pair + // tears down cleanly without needing an explicit server.close(). 
+ await client.close(); + } } From 066f7cfdc27c1d8c3669ab772b44ffbde195bcd9 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 11:28:17 +0200 Subject: [PATCH 05/42] evals: add OpenAiProvider with LiteLLM proxy support and wire default provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenAiProvider (evals/llm/openai.ts): - Implements LlmProvider.chat() via the openai SDK (gpt-4o-mini default) - Accepts baseURL to point at a LiteLLM proxy for any compatible provider - Maps LlmMessage ↔ ChatCompletionMessageParam in both directions; narrows ChatCompletionMessageToolCall to FunctionToolCall before accessing .function - Strips tools argument when the list is empty (avoids API errors) evals/llm/index.ts: - createDefaultLlmProvider() now returns a real OpenAiProvider when OPENAI_API_KEY is set; picks up LITELLM_BASE_URL automatically - Preserves the ANTHROPIC_API_KEY branch with a clear "coming soon" error until evals/llm/anthropic.ts lands Adds openai@^6.37.0 as a devDependency (npm install --save-dev openai). Co-Authored-By: Claude Sonnet 4.6 --- evals/llm/index.ts | 25 +++++---- evals/llm/openai.ts | 128 ++++++++++++++++++++++++++++++++++++++++++++ package-lock.json | 23 ++++++++ package.json | 1 + 4 files changed, 164 insertions(+), 13 deletions(-) create mode 100644 evals/llm/openai.ts diff --git a/evals/llm/index.ts b/evals/llm/index.ts index d3c254b..d58aeda 100644 --- a/evals/llm/index.ts +++ b/evals/llm/index.ts @@ -6,33 +6,32 @@ */ import type { LlmProvider } from "./types.js"; +import { OpenAiProvider } from "./openai.js"; /** * Returns the default LLM provider by inspecting environment variables. * - * Priority: ANTHROPIC_API_KEY → Anthropic (claude-haiku-4-5) - * OPENAI_API_KEY → OpenAI / LiteLLM proxy (gpt-4o-mini) + * Priority order: + * 1. ANTHROPIC_API_KEY → Anthropic adapter (claude-haiku-4-5) — coming soon + * 2. 
OPENAI_API_KEY → OpenAI / LiteLLM proxy (gpt-4o-mini) * - * The concrete adapters (evals/llm/anthropic.ts, evals/llm/openai.ts) are - * implemented in the next commit; this stub ensures runMcpHostLoop.ts - * type-checks now and surfaces a clear error at runtime when evals are run - * before the adapters land. + * Set LITELLM_BASE_URL alongside OPENAI_API_KEY to route through a LiteLLM + * proxy, e.g. to use Claude via the OpenAI-compatible endpoint. */ export function createDefaultLlmProvider(): LlmProvider { if (process.env.ANTHROPIC_API_KEY) { throw new Error( "Anthropic LLM adapter not yet implemented (evals/llm/anthropic.ts). " + - "It will land in the next commit." + "Use OPENAI_API_KEY instead, or wait for the Anthropic adapter." ); } if (process.env.OPENAI_API_KEY) { - throw new Error( - "OpenAI LLM adapter not yet implemented (evals/llm/openai.ts). " + - "It will land in the next commit." - ); + return new OpenAiProvider({ + baseURL: process.env.LITELLM_BASE_URL, + }); } throw new Error( - "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " + - "before running evals (npm run test:evals)." + "No LLM provider configured. Set OPENAI_API_KEY (or ANTHROPIC_API_KEY " + + "once the Anthropic adapter lands) before running evals (npm run test:evals)." ); } diff --git a/evals/llm/openai.ts b/evals/llm/openai.ts new file mode 100644 index 0000000..a6dd59b --- /dev/null +++ b/evals/llm/openai.ts @@ -0,0 +1,128 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import OpenAI from "openai"; +import type { + AssistantMessage, + LlmMessage, + LlmProvider, + LlmToolDefinition, +} from "./types.js"; + +const DEFAULT_MODEL = "gpt-4o-mini"; + +export interface OpenAiProviderOptions { + /** Chat model to use. Defaults to gpt-4o-mini. 
*/ + model?: string; + /** + * Override the API base URL. Point this at a LiteLLM proxy to route calls + * through any provider the proxy supports without changing client code. + */ + baseURL?: string; + /** + * API key. Defaults to the OPENAI_API_KEY environment variable, which is + * the standard OpenAI SDK default. + */ + apiKey?: string; +} + +export class OpenAiProvider implements LlmProvider { + private readonly client: OpenAI; + private readonly model: string; + + constructor({ + model = DEFAULT_MODEL, + baseURL, + apiKey, + }: OpenAiProviderOptions = {}) { + this.model = model; + this.client = new OpenAI({ + ...(apiKey !== undefined ? { apiKey } : {}), + ...(baseURL !== undefined ? { baseURL } : {}), + }); + } + + async chat( + messages: LlmMessage[], + tools: LlmToolDefinition[] + ): Promise { + const response = await this.client.chat.completions.create({ + model: this.model, + messages: messages.map(toOaiMessage), + ...(tools.length > 0 ? { tools: tools.map(toOaiTool) } : {}), + }); + + const choice = response.choices[0]; + if (!choice) { + throw new Error("OpenAI returned no choices"); + } + + const msg = choice.message; + return { + role: "assistant", + content: msg.content ?? null, + ...(msg.tool_calls + ? { + tool_calls: msg.tool_calls + .filter( + (tc): tc is OpenAI.ChatCompletionMessageFunctionToolCall => + tc.type === "function" + ) + .map((tc) => ({ + id: tc.id, + type: "function" as const, + function: { + name: tc.function.name, + arguments: tc.function.arguments, + }, + })), + } + : {}), + }; + } +} + +function toOaiMessage(msg: LlmMessage): OpenAI.ChatCompletionMessageParam { + switch (msg.role) { + case "user": + return { role: "user", content: msg.content }; + case "assistant": + return { + role: "assistant", + content: msg.content, + ...(msg.tool_calls + ? 
{ + tool_calls: msg.tool_calls.map((tc) => ({ + id: tc.id, + type: "function" as const, + function: { + name: tc.function.name, + arguments: tc.function.arguments, + }, + })), + } + : {}), + }; + case "tool": + return { + role: "tool", + content: msg.content, + tool_call_id: msg.tool_call_id, + }; + } +} + +function toOaiTool(tool: LlmToolDefinition): OpenAI.ChatCompletionTool { + return { + type: "function", + function: { + name: tool.name, + description: tool.description, + parameters: tool.parameters, + }, + }; +} diff --git a/package-lock.json b/package-lock.json index d34696e..2207e24 100644 --- a/package-lock.json +++ b/package-lock.json @@ -54,6 +54,7 @@ "husky": "^9.1.7", "jsdom": "^29.1.1", "lint-staged": "^16.4.0", + "openai": "^6.37.0", "tailwindcss": "^4.2.2", "tsx": "^4.21.0", "typescript": "^6.0.2", @@ -5860,6 +5861,28 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/openai": { + "version": "6.37.0", + "resolved": "https://registry.npmjs.org/openai/-/openai-6.37.0.tgz", + "integrity": "sha512-0H5dEGFmmLv6KSd0W1w2nyL8WsLkX6yoLeQpU+dZAOuGcany5qkYQMmj35ZrKgb6yiyYqpUzFOpR8mZQkgqeEQ==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.25 || ^4.0" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, "node_modules/optionator": { "version": "0.9.4", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", diff --git a/package.json b/package.json index 043ee51..e328e65 100644 --- a/package.json +++ b/package.json @@ -103,6 +103,7 @@ "husky": "^9.1.7", "jsdom": "^29.1.1", "lint-staged": "^16.4.0", + "openai": "^6.37.0", "tailwindcss": "^4.2.2", "tsx": "^4.21.0", "typescript": "^6.0.2", From 9f47372445c80abf2295e64ac332f13c3453e30e Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 11:33:59 +0200 Subject: [PATCH 06/42] evals: add AnthropicProvider and wire 
it as the default when ANTHROPIC_API_KEY is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AnthropicProvider (evals/llm/anthropic.ts): - Implements LlmProvider.chat() via @anthropic-ai/sdk (claude-haiku-4-5-20251001) - toAnthropicMessages() handles the structural gap between OpenAI-style messages and Anthropic's API: no `tool` role exists; tool results go as `user` messages with `tool_result` content blocks; consecutive tool results are merged into a single user turn to avoid adjacent-user-turn API errors - Tool input is round-tripped JSON.parse (from LlmToolCallRequest.arguments) → object for the request, then JSON.stringify back for the response to maintain the OpenAI-compatible LlmToolCallRequest shape - input_schema is cast from LlmToolDefinition.parameters (already JSON Schema) evals/llm/index.ts: - createDefaultLlmProvider() now returns AnthropicProvider when ANTHROPIC_API_KEY is set (priority 1), falls back to OpenAiProvider for OPENAI_API_KEY (priority 2) Adds @anthropic-ai/sdk@^0.96.0 as a devDependency. Co-Authored-By: Claude Sonnet 4.6 --- evals/llm/anthropic.ts | 150 +++++++++++++++++++++++++++++++++++++++++ evals/llm/index.ts | 16 ++--- package-lock.json | 69 +++++++++++++++++++ package.json | 1 + 4 files changed, 227 insertions(+), 9 deletions(-) create mode 100644 evals/llm/anthropic.ts diff --git a/evals/llm/anthropic.ts b/evals/llm/anthropic.ts new file mode 100644 index 0000000..70e9adc --- /dev/null +++ b/evals/llm/anthropic.ts @@ -0,0 +1,150 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import Anthropic from "@anthropic-ai/sdk"; +import type { + AssistantMessage, + LlmMessage, + LlmProvider, + LlmToolDefinition, +} from "./types.js"; + +const DEFAULT_MODEL = "claude-haiku-4-5-20251001"; + +/** Max tokens to request from the Anthropic API per turn. */ +const MAX_TOKENS = 4096; + +export interface AnthropicProviderOptions { + /** Chat model to use. Defaults to claude-haiku-4-5-20251001. */ + model?: string; + /** + * API key. Defaults to the ANTHROPIC_API_KEY environment variable, which is + * the standard Anthropic SDK default. + */ + apiKey?: string; +} + +export class AnthropicProvider implements LlmProvider { + private readonly client: Anthropic; + private readonly model: string; + + constructor({ + model = DEFAULT_MODEL, + apiKey, + }: AnthropicProviderOptions = {}) { + this.model = model; + this.client = new Anthropic({ + ...(apiKey !== undefined ? { apiKey } : {}), + }); + } + + async chat( + messages: LlmMessage[], + tools: LlmToolDefinition[] + ): Promise { + const response = await this.client.messages.create({ + model: this.model, + max_tokens: MAX_TOKENS, + messages: toAnthropicMessages(messages), + ...(tools.length > 0 ? { tools: tools.map(toAnthropicTool) } : {}), + }); + + const textBlocks = response.content.filter( + (c): c is Anthropic.TextBlock => c.type === "text" + ); + const toolUseBlocks = response.content.filter( + (c): c is Anthropic.ToolUseBlock => c.type === "tool_use" + ); + + return { + role: "assistant", + content: textBlocks.map((b) => b.text).join("") || null, + ...(toolUseBlocks.length > 0 + ? { + tool_calls: toolUseBlocks.map((tu) => ({ + id: tu.id, + type: "function" as const, + function: { + name: tu.name, + // Anthropic returns a parsed object; re-encode to match the + // OpenAI-style LlmToolCallRequest.function.arguments shape. + arguments: JSON.stringify(tu.input), + }, + })), + } + : {}), + }; + } +} + +/** + * Converts OpenAI-style LlmMessage[] to Anthropic MessageParam[]. 
/**
 * Decodes the JSON-encoded arguments of an OpenAI-style tool call into the
 * object form used for a `tool_use` block's `input`.
 *
 * Best-effort by design: malformed JSON, and JSON that decodes to anything
 * other than a plain object (`null`, an array, a string, a number), yields an
 * empty object instead of throwing.
 */
function parseToolInput(rawArguments: string): Record<string, unknown> {
  try {
    const parsed: unknown = JSON.parse(rawArguments);
    return typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)
      ? (parsed as Record<string, unknown>)
      : {};
  } catch {
    return {};
  }
}

/**
 * Converts OpenAI-style LlmMessage[] to Anthropic MessageParam[].
 *
 * Structural differences from OpenAI:
 *  - Anthropic has no `tool` role. Tool results go as `user` messages with
 *    `tool_result` content blocks.
 *  - Consecutive tool-result messages are merged into a single user message,
 *    so one batch of tool calls is answered by exactly one user turn.
 *  - Assistant content is an array of text / tool_use blocks. An assistant
 *    message that would produce an empty array (no text, no tool calls) is
 *    dropped rather than sent with empty content.
 *
 * @param messages - Conversation in OpenAI-style form.
 * @returns The equivalent Anthropic message list, in the same order.
 */
function toAnthropicMessages(
  messages: LlmMessage[]
): Anthropic.MessageParam[] {
  const result: Anthropic.MessageParam[] = [];

  for (const msg of messages) {
    if (msg.role === "user") {
      result.push({ role: "user", content: msg.content });
    } else if (msg.role === "assistant") {
      const content: Anthropic.ContentBlockParam[] = [];
      if (msg.content) {
        content.push({ type: "text", text: msg.content });
      }
      for (const tc of msg.tool_calls ?? []) {
        content.push({
          type: "tool_use",
          id: tc.id,
          name: tc.function.name,
          input: parseToolInput(tc.function.arguments),
        });
      }
      if (content.length > 0) {
        result.push({ role: "assistant", content });
      }
    } else {
      // msg.role === "tool"
      const block: Anthropic.ToolResultBlockParam = {
        type: "tool_result",
        tool_use_id: msg.tool_call_id,
        content: msg.content,
      };

      // Append to the preceding user message when it already holds
      // tool_result blocks; otherwise open a new user turn for this result.
      const prev = result[result.length - 1];
      if (
        prev?.role === "user" &&
        Array.isArray(prev.content) &&
        prev.content[0]?.type === "tool_result"
      ) {
        prev.content.push(block);
      } else {
        result.push({ role: "user", content: [block] });
      }
    }
  }

  return result;
}

/**
 * Maps a provider-neutral tool definition onto Anthropic's tool shape.
 * `parameters` is passed through as `input_schema` without modification.
 */
function toAnthropicTool(tool: LlmToolDefinition): Anthropic.Tool {
  return {
    name: tool.name,
    description: tool.description,
    input_schema: tool.parameters as Anthropic.Tool.InputSchema,
  };
}
Set OPENAI_API_KEY (or ANTHROPIC_API_KEY " + - "once the Anthropic adapter lands) before running evals (npm run test:evals)." + "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " + + "before running evals (npm run test:evals)." ); } diff --git a/package-lock.json b/package-lock.json index 2207e24..156fc31 100644 --- a/package-lock.json +++ b/package-lock.json @@ -33,6 +33,7 @@ "elastic-security-mcp-app": "dist/main.js" }, "devDependencies": { + "@anthropic-ai/sdk": "^0.96.0", "@tailwindcss/vite": "^4.2.2", "@testing-library/jest-dom": "^6.9.1", "@testing-library/react": "^16.3.2", @@ -74,6 +75,28 @@ "dev": true, "license": "MIT" }, + "node_modules/@anthropic-ai/sdk": { + "version": "0.96.0", + "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.96.0.tgz", + "integrity": "sha512-KlCsODtTyb17bLUVCSDC2HtSvAbJf60sEiPEax9dInF+aDF92vS4TZJ5XD7YCQXNb1/5icYaw8Y7wMjPlIV9Zg==", + "dev": true, + "license": "MIT", + "dependencies": { + "json-schema-to-ts": "^3.1.1", + "standardwebhooks": "^1.0.0" + }, + "bin": { + "anthropic-ai-sdk": "bin/cli" + }, + "peerDependencies": { + "zod": "^3.25.0 || ^4.0.0" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, "node_modules/@asamuzakjp/css-color": { "version": "5.1.11", "resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-5.1.11.tgz", @@ -1860,6 +1883,13 @@ ], "peer": true }, + "node_modules/@stablelib/base64": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@stablelib/base64/-/base64-1.0.1.tgz", + "integrity": "sha512-1bnPQqSxSuc3Ii6MhBysoWCg58j97aUjuCSZrGSmDxNqtytIi0k8utUenAwTZN4V5mXXYGsVUI9zeBqy+jBOSQ==", + "dev": true, + "license": "MIT" + }, "node_modules/@standard-schema/spec": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz", @@ -4240,6 +4270,13 @@ "dev": true, "license": "MIT" }, + "node_modules/fast-sha256": { + "version": "1.3.0", + "resolved": 
"https://registry.npmjs.org/fast-sha256/-/fast-sha256-1.3.0.tgz", + "integrity": "sha512-n11RGP/lrWEFI/bWdygLxhI+pVeo1ZYIVwvvPkW7azl/rOy+F3HYRZ2K5zeE9mmkhQppyv9sQFx0JM9UabnpPQ==", + "dev": true, + "license": "Unlicense" + }, "node_modules/fast-uri": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz", @@ -4945,6 +4982,20 @@ "dev": true, "license": "MIT" }, + "node_modules/json-schema-to-ts": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/json-schema-to-ts/-/json-schema-to-ts-3.1.1.tgz", + "integrity": "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/runtime": "^7.18.3", + "ts-algebra": "^2.0.0" + }, + "engines": { + "node": ">=16" + } + }, "node_modules/json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", @@ -6647,6 +6698,17 @@ "dev": true, "license": "MIT" }, + "node_modules/standardwebhooks": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/standardwebhooks/-/standardwebhooks-1.0.0.tgz", + "integrity": "sha512-BbHGOQK9olHPMvQNHWul6MYlrRTAOKn03rOe4A8O3CLWhNf4YHBqq2HJKKC+sfqpxiBY52pNeesD6jIiLDz8jg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@stablelib/base64": "^1.0.0", + "fast-sha256": "^1.3.0" + } + }, "node_modules/state-local": { "version": "1.0.7", "resolved": "https://registry.npmjs.org/state-local/-/state-local-1.0.7.tgz", @@ -6886,6 +6948,13 @@ "tree-kill": "cli.js" } }, + "node_modules/ts-algebra": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz", + "integrity": "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==", + "dev": true, + "license": "MIT" + }, "node_modules/ts-api-utils": { "version": "2.5.0", "resolved": 
"https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.5.0.tgz", diff --git a/package.json b/package.json index e328e65..83ef515 100644 --- a/package.json +++ b/package.json @@ -82,6 +82,7 @@ "react-dom": "^19.2.4" }, "devDependencies": { + "@anthropic-ai/sdk": "^0.96.0", "@tailwindcss/vite": "^4.2.2", "@testing-library/jest-dom": "^6.9.1", "@testing-library/react": "^16.3.2", From ab6ac677b250128f3bef38046668cafcf3e36c8b Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 11:36:50 +0200 Subject: [PATCH 07/42] evals: add --reporter=verbose to test:evals script Makes per-example test names visible in CI output and in the GitHub Actions job summary, which is where the Markdown eval table lands. Co-Authored-By: Claude Sonnet 4.6 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 83ef515..2308b39 100644 --- a/package.json +++ b/package.json @@ -47,7 +47,7 @@ "test": "vitest", "test:run": "vitest run", "test:coverage": "vitest run --coverage", - "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run --config evals/vitest.config.ts", + "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run --config evals/vitest.config.ts --reporter=verbose", "prepublishOnly": "npm run build", "prepare": "husky", "version": "node -e \"const m=JSON.parse(require('fs').readFileSync('manifest.json','utf8'));m.version=require('./package.json').version;require('fs').writeFileSync('manifest.json',JSON.stringify(m,null,2)+'\\n')\" && git add manifest.json" From 9c7c1ddab13cf4b99a52f7b927fec43ec95e4593 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 11:37:44 +0200 Subject: [PATCH 08/42] evals: add skill-activation evaluator (binary score) Returns 1 if the trajectory contains at least one call to the skill's entry-point tool (expected.skill), 0 if not, or 'N/A' when expected.skill is absent so datasets that don't test skill routing can omit the field. 
/**
 * Binary evaluator: was the skill's entry-point tool called at least once?
 *
 * `expected.skill` holds the name of the tool to look for. The trajectory is
 * reduced to its tool names and checked for that name.
 *
 * @returns `{ score: "N/A" }` when `expected.skill` is missing or empty;
 *   otherwise score 1 (tool seen) or 0 (tool never seen). The failure reason
 *   lists every tool name in the trajectory, or `empty` when there were none.
 */
export const skillActivation: Evaluator = (
  trajectory: Trajectory,
  expected: ExpectedBehavior
): EvaluatorResult => {
  const { skill } = expected;
  if (!skill) {
    return { score: "N/A" };
  }

  const toolNames = trajectory.map((call) => call.tool);
  if (toolNames.includes(skill)) {
    return { score: 1, reason: `Tool "${skill}" was called` };
  }

  const listing = toolNames.join(", ") || "empty";
  return {
    score: 0,
    reason: `Tool "${skill}" was never called (trajectory: [${listing}])`,
  };
};
/**
 * Binary evaluator for distractor examples: did the run stay away from the
 * skill's entry-point tool?
 *
 * Mirror image of the skill-activation check. `expected.skill` names the tool
 * that must NOT appear in the trajectory.
 *
 * Score semantics:
 *   1: the tool is absent from the trajectory (not distracted)
 *   0: the tool appears at least once (false positive)
 *
 * @returns `{ score: "N/A" }` when `expected.skill` is missing or empty,
 *   otherwise the binary score with an explanatory reason.
 */
export const negativeActivation: Evaluator = (
  trajectory: Trajectory,
  expected: ExpectedBehavior
): EvaluatorResult => {
  const { skill } = expected;
  if (!skill) {
    return { score: "N/A" };
  }

  for (const call of trajectory) {
    if (call.tool === skill) {
      return {
        score: 0,
        reason: `Tool "${skill}" was called but should not have been (false positive)`,
      };
    }
  }

  return {
    score: 1,
    reason: `Tool "${skill}" was correctly absent from the trajectory`,
  };
};
The reason string includes missed and extra tool names to make CI failures immediately actionable without re-running the eval. CI gate intent: ≥0.8 (80%) on positive examples. Co-Authored-By: Claude Sonnet 4.6 --- evals/evaluators/tool-selection.ts | 60 ++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 evals/evaluators/tool-selection.ts diff --git a/evals/evaluators/tool-selection.ts b/evals/evaluators/tool-selection.ts new file mode 100644 index 0000000..71cf7b1 --- /dev/null +++ b/evals/evaluators/tool-selection.ts @@ -0,0 +1,60 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js"; + +/** + * Set-based tool-selection evaluator: how well did the LLM pick the right tools? + * + * Computes precision, recall, and their harmonic mean (F1) against the + * set of tool names in `expected.tools`. Deduplicates both sides — order + * and repetition are tested by the trajectory evaluator instead. + * + * precision = |called ∩ expected| / |called| (no spurious calls) + * recall = |called ∩ expected| / |expected| (no missed calls) + * score = F1 = 2·P·R / (P+R) ∈ [0, 1] + * + * Returns `'N/A'` when `expected.tools` is absent so datasets that only + * care about skill routing don't need to declare tool lists. + * + * CI gate: datasets should require ≥0.8 (80%) on positive examples. + * The failure reason lists missed and extra tools to make debugging fast. 
/**
 * Set-based tool-selection evaluator.
 *
 * Both the called tools and `expected.tools` are deduplicated into sets, then
 * scored with precision, recall and their harmonic mean:
 *
 *   precision = |called ∩ expected| / |called|     (0 when nothing was called)
 *   recall    = |called ∩ expected| / |expected|   (0 when nothing was expected)
 *   score     = F1 = 2·P·R / (P + R)               (0 when P + R is 0)
 *
 * Order and repetition are ignored here.
 *
 * @returns `{ score: "N/A" }` when `expected.tools` is absent; score 1 when
 *   both sets are empty; otherwise F1 with a reason that names the missed and
 *   the extra tools.
 */
export const toolSelection: Evaluator = (
  trajectory: Trajectory,
  expected: ExpectedBehavior
): EvaluatorResult => {
  const { tools } = expected;
  if (!tools) {
    return { score: "N/A" };
  }

  const wanted = new Set(tools);
  const used = new Set(trajectory.map((call) => call.tool));

  if (wanted.size + used.size === 0) {
    return { score: 1, reason: "No tools expected and none called" };
  }

  // One pass over the called tools yields both the hit count and the extras.
  let hits = 0;
  const extra: string[] = [];
  for (const name of used) {
    if (wanted.has(name)) {
      hits += 1;
    } else {
      extra.push(name);
    }
  }
  const missed = [...wanted].filter((name) => !used.has(name));

  const precision = used.size === 0 ? 0 : hits / used.size;
  const recall = wanted.size === 0 ? 0 : hits / wanted.size;
  const sum = precision + recall;
  const f1 = sum > 0 ? (2 * precision * recall) / sum : 0;

  let reason = `F1=${f1.toFixed(2)} (precision=${precision.toFixed(2)}, recall=${recall.toFixed(2)})`;
  if (missed.length > 0) {
    reason += ` | missed: [${missed.join(", ")}]`;
  }
  if (extra.length > 0) {
    reason += ` | extra: [${extra.join(", ")}]`;
  }

  return { score: f1, reason };
};
LCS is O(m·n) time via a flat DP array to avoid nested-array allocation. Co-Authored-By: Claude Sonnet 4.6 --- evals/evaluators/trajectory.ts | 79 ++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 evals/evaluators/trajectory.ts diff --git a/evals/evaluators/trajectory.ts b/evals/evaluators/trajectory.ts new file mode 100644 index 0000000..4e71ec8 --- /dev/null +++ b/evals/evaluators/trajectory.ts @@ -0,0 +1,79 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js"; + +/** + * Sequence-aware evaluator: how closely did the LLM follow the expected tool order? + * + * Computes the Longest Common Subsequence (LCS) of the actual tool-call + * sequence against `expected.tools`, then normalises by the longer of the + * two sequences: + * + * score = lcs(actual, expected) / max(|actual|, |expected|) ∈ [0, 1] + * + * Dividing by the max penalises both missing tools (low recall) and extra + * spurious tools (low precision) without needing separate P/R components — + * those are tool-selection's job. + * + * Returns `'N/A'` when `expected.tools` is absent so datasets that don't + * specify an ordered tool sequence don't fail on this evaluator. This guard + * is load-bearing: running LCS against an undefined expectation would produce + * meaningless 0-scores that mask real regressions in other evaluators. 
/**
 * Sequence-aware evaluator: how closely does the actual tool-call order match
 * `expected.tools`?
 *
 *   score = lcs(actual, expected) / max(|actual|, |expected|)   ∈ [0, 1]
 *
 * Normalising by the longer sequence lowers the score both for expected tools
 * that were never called and for extra calls that were not expected.
 *
 * @returns `{ score: "N/A" }` when `expected.tools` is absent; score 1 when
 *   both sequences are empty; otherwise the ratio above. The reason shows the
 *   arithmetic and, for scores below 1, both sequences.
 */
export const trajectoryScore: Evaluator = (
  trajectory: Trajectory,
  expected: ExpectedBehavior
): EvaluatorResult => {
  const wanted = expected.tools;
  if (!wanted) {
    return { score: "N/A" };
  }

  const observed = trajectory.map((call) => call.tool);
  const longest = Math.max(observed.length, wanted.length);
  if (longest === 0) {
    return { score: 1, reason: "Both actual and expected sequences are empty" };
  }

  const common = lcs(observed, wanted);
  const score = common / longest;

  const summary = `LCS=${common} / max(|actual|=${observed.length}, |expected|=${wanted.length})=${longest} → score=${score.toFixed(2)}`;
  const detail =
    score < 1
      ? ` | actual=[${observed.join(", ")}] expected=[${wanted.join(", ")}]`
      : "";

  return { score, reason: summary + detail };
};

/**
 * Length of the Longest Common Subsequence of two string sequences.
 *
 * Row-by-row dynamic programme: `previous` holds the finished row for the
 * prefix of `a` processed so far, `current` is the row being built. Elements
 * are compared with `===`.
 */
function lcs(a: string[], b: string[]): number {
  let previous: number[] = new Array<number>(b.length + 1).fill(0);

  for (const left of a) {
    const current: number[] = [0];
    b.forEach((right, j) => {
      current.push(
        left === right
          ? previous[j] + 1
          : Math.max(previous[j + 1], current[j])
      );
    });
    previous = current;
  }

  return previous[b.length];
}
The factory pattern closes over the LLM provider so datasets can inject different judges (e.g. a stronger model for criteria, haiku for routing). Parsing: primary path extracts the first JSON object from the response and clamps score to [0, 1]. Falls back to a bare-number regex for models that ignore the JSON instruction, and finally returns score=0 with the raw text if neither succeeds. The judge prompt serialises only {tool, args} per call — omitting result avoids token bloat from large tool outputs while still giving the judge enough signal to evaluate routing decisions. Co-Authored-By: Claude Sonnet 4.6 --- evals/evaluators/criteria.ts | 142 +++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 evals/evaluators/criteria.ts diff --git a/evals/evaluators/criteria.ts b/evals/evaluators/criteria.ts new file mode 100644 index 0000000..1994eac --- /dev/null +++ b/evals/evaluators/criteria.ts @@ -0,0 +1,142 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js"; +import type { LlmProvider } from "../llm/types.js"; + +/** + * LLM-as-judge evaluator: asks an LLM to score the trajectory against + * the natural-language assertions in `expected.criteria`. + * + * Returns `'N/A'` when `expected.criteria` is absent or empty so datasets + * that rely only on structural evaluators don't incur extra LLM calls. 
/**
 * LLM-as-judge evaluator: asks an LLM to score the trajectory against the
 * natural-language assertions in `expected.criteria`.
 *
 * Usage:
 *   import { createCriteriaEvaluator } from "./criteria.js";
 *   import { createDefaultLlmProvider } from "../llm/index.js";
 *
 *   runDataset(dataset, {
 *     criteria: createCriteriaEvaluator(createDefaultLlmProvider()),
 *   });
 *
 * The provider is closed over because an `Evaluator` only receives the
 * trajectory and the expectation.
 *
 * @param llm - Judge model. It is called once per example, with a single user
 *   message and no tools.
 * @returns An evaluator that resolves to `{ score: "N/A" }` without calling
 *   the LLM when `expected.criteria` is absent or empty, and otherwise to the
 *   parsed judge verdict.
 */
export function createCriteriaEvaluator(llm: LlmProvider): Evaluator {
  return async (
    trajectory: Trajectory,
    expected: ExpectedBehavior
  ): Promise<EvaluatorResult> => {
    if (!expected.criteria || expected.criteria.length === 0) {
      return { score: "N/A" };
    }

    const prompt = buildJudgePrompt(trajectory, expected.criteria);
    const response = await llm.chat([{ role: "user", content: prompt }], []);

    return parseJudgeResponse(response.content ?? "");
  };
}

/**
 * Builds the rubric prompt sent to the judge LLM.
 *
 * The trajectory is serialised as a pretty-printed JSON array of
 * `{tool, args}` pairs (`result` is left out), the criteria become a numbered
 * list, and the judge is asked for one JSON object with `score` and
 * `reasoning`.
 */
function buildJudgePrompt(trajectory: Trajectory, criteria: string[]): string {
  const trajectoryStr = JSON.stringify(
    trajectory.map(({ tool, args }) => ({ tool, args })),
    null,
    2
  );

  const criteriaList = criteria
    .map((c, i) => `${i + 1}. ${c}`)
    .join("\n");

  return `You are an impartial evaluator assessing the quality of an AI assistant's tool-calling behaviour.

## Trajectory (tools the assistant called, in order)

\`\`\`json
${trajectoryStr}
\`\`\`

## Evaluation criteria

${criteriaList}

## Task

Score how well the trajectory satisfies ALL of the criteria above on a scale from 0.0 to 1.0:
- 1.0  All criteria fully satisfied
- 0.75 Most criteria satisfied with minor gaps
- 0.5  About half the criteria satisfied
- 0.25 Most criteria unmet with only minor satisfaction
- 0.0  No criteria satisfied at all

Respond with a single JSON object — no markdown fences, no extra text:
{"score": <number between 0.0 and 1.0>, "reasoning": "<one or two sentences>"}`;
}

/**
 * A number in [0, 1] that is not a fragment of a longer token: it may not be
 * preceded by a word character or a dot, nor followed by a word character or
 * by a dot plus digit. This keeps `1.5`, `3.0`, `10` and `v1` from being read
 * as 1 or 0.
 */
const BARE_SCORE_RE = /(?<![\w.])(1(?:\.0+)?|0(?:\.\d+)?)(?!\w|\.\d)/;

/** Same number, but only when it follows the word "score" within 20 non-digit characters. */
const LABELLED_SCORE_RE =
  /\bscore\b\D{0,20}?(?<![\w.])(1(?:\.0+)?|0(?:\.\d+)?)(?!\w|\.\d)/i;

/**
 * Yields every brace-balanced `{...}` substring of `text`, in order of its
 * opening brace. Braces inside double-quoted strings (with backslash escapes)
 * are ignored while balancing. An opening brace that is never closed yields
 * nothing.
 */
function* jsonObjectCandidates(text: string): Generator<string> {
  for (
    let start = text.indexOf("{");
    start !== -1;
    start = text.indexOf("{", start + 1)
  ) {
    let depth = 0;
    let inString = false;
    let escaped = false;

    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (escaped) {
          escaped = false;
        } else if (ch === "\\") {
          escaped = true;
        } else if (ch === '"') {
          inString = false;
        }
      } else if (ch === '"') {
        inString = true;
      } else if (ch === "{") {
        depth += 1;
      } else if (ch === "}") {
        depth -= 1;
        if (depth === 0) {
          yield text.slice(start, i + 1);
          break;
        }
      }
    }
  }
}

/**
 * Parses one candidate substring as JSON and returns its `score` /
 * `reasoning` fields, or `undefined` when it is not valid JSON or has no
 * numeric `score`.
 */
function readVerdict(
  candidate: string
): { score: number; reasoning: unknown } | undefined {
  let parsed: unknown;
  try {
    parsed = JSON.parse(candidate);
  } catch {
    return undefined;
  }
  if (typeof parsed !== "object" || parsed === null) {
    return undefined;
  }
  const { score, reasoning } = parsed as {
    score?: unknown;
    reasoning?: unknown;
  };
  return typeof score === "number" ? { score, reasoning } : undefined;
}

/**
 * Parses the judge LLM's response into an EvaluatorResult.
 *
 * 1. The first brace-balanced substring that parses as JSON and carries a
 *    numeric `score` wins; the score is clamped to [0, 1]. Surrounding prose,
 *    markdown fences and unrelated braces do not interfere.
 * 2. Otherwise a number in [0, 1] is taken from the prose, preferring one
 *    that follows the word "score" over the first bare number.
 * 3. Otherwise the result is score 0 with the start of the raw text as reason.
 */
function parseJudgeResponse(text: string): EvaluatorResult {
  const trimmed = text.trim();

  for (const candidate of jsonObjectCandidates(trimmed)) {
    const verdict = readVerdict(candidate);
    if (verdict) {
      return {
        score: Math.min(1, Math.max(0, verdict.score)),
        reason:
          typeof verdict.reasoning === "string"
            ? verdict.reasoning
            : `raw judge response: ${trimmed}`,
      };
    }
  }

  const numMatch = LABELLED_SCORE_RE.exec(trimmed) ?? BARE_SCORE_RE.exec(trimmed);
  if (numMatch) {
    return {
      score: parseFloat(numMatch[1]),
      reason: `score parsed from prose; raw response: ${trimmed.slice(0, 200)}`,
    };
  }

  return {
    score: 0,
    reason: `judge response could not be parsed; raw response: ${trimmed.slice(0, 200)}`,
  };
}
/**
 * The model-facing entry-point tool registered by the
 * detection-rule-management skill (src/tools/detection-rules.ts).
 */
const SKILL_TOOL = "manage-rules";

/** Builds a positive example: the run is expected to call the skill tool. */
const positive = (id: string, input: string): Example => ({
  id,
  input,
  expected: { skill: SKILL_TOOL, tools: [SKILL_TOOL] },
});

/**
 * Builds a distractor example. `skill` is still set so the
 * negative-activation evaluator knows which tool to check for absence.
 */
const distractor = (id: string, input: string): Example => ({
  id,
  input,
  expected: { skill: SKILL_TOOL },
});

// ---------------------------------------------------------------------------
// Positive examples — the LLM should call manage-rules
// ---------------------------------------------------------------------------

const positiveExamples: Example[] = [
  positive(
    "drm-pos-01",
    "Show me my noisy rules — which detection rules are generating the most alerts?"
  ),
  positive("drm-pos-02", "List all my currently enabled detection rules"),
  positive(
    "drm-pos-03",
    "Find high severity detection rules related to PowerShell execution"
  ),
  positive(
    "drm-pos-04",
    "What detection rules do I have covering initial access tactics?"
  ),
];

// ---------------------------------------------------------------------------
// Distractor examples — the LLM should NOT call manage-rules
// ---------------------------------------------------------------------------

const distractorExamples: Example[] = [
  distractor(
    "drm-neg-01",
    "Create a new case for a ransomware incident I'm currently investigating"
  ),
  distractor("drm-neg-02", "Show me all critical alerts that fired in the last hour"),
  distractor(
    "drm-neg-03",
    "Run an ES|QL query to find failed SSH login attempts on my Linux hosts"
  ),
  distractor(
    "drm-neg-04",
    "A process on host web-01 just spawned cmd.exe — help me investigate"
  ),
];

// ---------------------------------------------------------------------------
// Full dataset (positives followed by distractors) for reference /
// cross-dataset tooling
// ---------------------------------------------------------------------------

export const detectionRuleManagementDataset: Dataset = {
  name: "detection-rule-management",
  examples: positiveExamples.concat(distractorExamples),
};

// ---------------------------------------------------------------------------
// Eval suites
// Positives and distractors go through separate runDataset calls because they
// use different evaluators and different passing thresholds.
// ---------------------------------------------------------------------------

const positiveSuite: Dataset = {
  name: "detection-rule-management: positives",
  examples: positiveExamples,
};
runDataset(
  positiveSuite,
  {
    "skill-activation": skillActivation,
    "tool-selection": toolSelection,
  },
  { passingScore: 0.8 }
);

const distractorSuite: Dataset = {
  name: "detection-rule-management: distractors",
  examples: distractorExamples,
};
runDataset(
  distractorSuite,
  {
    "negative-activation": negativeActivation,
  },
  { passingScore: 1.0 } // 100% — any false positive is a regression
);
Co-Authored-By: Claude Sonnet 4.6 --- .../detection-rule-management.dataset.ts | 30 +---------- evals/detection-rule-management.eval.test.ts | 54 +++++++++++++++++++ 2 files changed, 56 insertions(+), 28 deletions(-) create mode 100644 evals/detection-rule-management.eval.test.ts diff --git a/evals/datasets/detection-rule-management.dataset.ts b/evals/datasets/detection-rule-management.dataset.ts index 09c6563..a1e2a2c 100644 --- a/evals/datasets/detection-rule-management.dataset.ts +++ b/evals/datasets/detection-rule-management.dataset.ts @@ -6,10 +6,6 @@ */ import type { Dataset, Example } from "../types.js"; -import { runDataset } from "../runner.js"; -import { skillActivation } from "../evaluators/skill-activation.js"; -import { negativeActivation } from "../evaluators/negative-activation.js"; -import { toolSelection } from "../evaluators/tool-selection.js"; /** * The model-facing entry-point tool registered by the @@ -21,7 +17,7 @@ const SKILL_TOOL = "manage-rules"; // Positive examples — the LLM should call manage-rules // --------------------------------------------------------------------------- -const positiveExamples: Example[] = [ +export const positiveExamples: Example[] = [ { id: "drm-pos-01", input: "Show me my noisy rules — which detection rules are generating the most alerts?", @@ -60,7 +56,7 @@ const positiveExamples: Example[] = [ // Distractor examples — the LLM should NOT call manage-rules // --------------------------------------------------------------------------- -const distractorExamples: Example[] = [ +export const distractorExamples: Example[] = [ { id: "drm-neg-01", input: "Create a new case for a ransomware incident I'm currently investigating", @@ -101,25 +97,3 @@ export const detectionRuleManagementDataset: Dataset = { examples: [...positiveExamples, ...distractorExamples], }; -// --------------------------------------------------------------------------- -// Vitest eval suites -// Each runDataset call registers a describe block gated 
on RUN_LLM_EVALS. -// Positives and distractors use different evaluators and passing thresholds. -// --------------------------------------------------------------------------- - -runDataset( - { name: "detection-rule-management: positives", examples: positiveExamples }, - { - "skill-activation": skillActivation, - "tool-selection": toolSelection, - }, - { passingScore: 0.8 } -); - -runDataset( - { name: "detection-rule-management: distractors", examples: distractorExamples }, - { - "negative-activation": negativeActivation, - }, - { passingScore: 1.0 } // 100% — any false positive is a regression -); diff --git a/evals/detection-rule-management.eval.test.ts b/evals/detection-rule-management.eval.test.ts new file mode 100644 index 0000000..ec9cab3 --- /dev/null +++ b/evals/detection-rule-management.eval.test.ts @@ -0,0 +1,54 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** + * End-to-end eval spec for the detection-rule-management skill. + * + * Proves the eval harness (runner → runMcpHostLoop → evaluators) works + * against a real registered skill using the proof dataset. Run via: + * + * npm run test:evals + * + * This suite is skipped in regular `npm test` because runDataset wraps + * everything in `describe.skipIf(!process.env.RUN_LLM_EVALS)`. 
+ * + * Gate summary: + * positives — skill-activation + tool-selection ≥ 80% + * distractors — negative-activation = 100% (any false positive is a regression) + */ + +import { runDataset } from "./runner.js"; +import { + positiveExamples, + distractorExamples, +} from "./datasets/detection-rule-management.dataset.js"; +import { skillActivation } from "./evaluators/skill-activation.js"; +import { negativeActivation } from "./evaluators/negative-activation.js"; +import { toolSelection } from "./evaluators/tool-selection.js"; + +runDataset( + { + name: "detection-rule-management: positives", + examples: positiveExamples, + }, + { + "skill-activation": skillActivation, + "tool-selection": toolSelection, + }, + { passingScore: 0.8 } +); + +runDataset( + { + name: "detection-rule-management: distractors", + examples: distractorExamples, + }, + { + "negative-activation": negativeActivation, + }, + { passingScore: 1.0 } // 100% — any false positive is a regression +); From 77844857b9b62e452690740a39bc4befd9f4b8b3 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 11:57:55 +0200 Subject: [PATCH 15/42] ci: add evals.yml GitHub Actions workflow Triggers: - workflow_dispatch manual run from Actions UI - schedule (0 2 * * *) nightly at 02:00 UTC - pull_request_target only when 'evals' label is added; gated by label write permission so only maintainers can trigger Concurrency group 'evals-' cancels in-progress runs on new pushes, preventing redundant jobs from burning LLM quota. The 'Run evals' step sets RUN_LLM_EVALS=1 and passes four secrets: EVAL_ANTHROPIC_API_KEY Claude Haiku (priority) EVAL_OPENAI_API_KEY GPT-4o-mini fallback EVAL_LITELLM_BASE_URL optional LiteLLM proxy base URL EVAL_CLUSTERS_JSON Elastic cluster credentials for the MCP server Output is captured with tee so it appears in the job log AND in eval-output.txt. 
A separate 'Post eval results' step (if: always()) appends '## Eval results' plus the full output to $GITHUB_STEP_SUMMARY so the rendered Markdown tables from the runner appear in the Actions job summary. For pull_request_target the checkout uses the PR head SHA so evals run against the proposed changes rather than the base branch. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/evals.yml | 87 +++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 .github/workflows/evals.yml diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml new file mode 100644 index 0000000..c4b951a --- /dev/null +++ b/.github/workflows/evals.yml @@ -0,0 +1,87 @@ +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0; you may not use this file except in compliance with the Elastic License +# 2.0. + +name: Evals + +on: + # Manually trigger a run from the Actions UI (useful for ad-hoc evaluation). + workflow_dispatch: + + # Nightly run at 02:00 UTC to catch regressions before the work day starts. + schedule: + - cron: "0 2 * * *" + + # Run when a PR is labeled with `evals`. Labels require write permission, so + # this implicitly limits triggering to maintainers — acceptable because + # pull_request_target runs with base-repo secrets. + pull_request_target: + types: [labeled] + +# Cancel any in-progress run for the same ref so a fast push doesn't queue up +# redundant eval jobs that waste LLM quota. +concurrency: + group: evals-${{ github.ref }} + cancel-in-progress: true + +jobs: + evals: + name: LLM Eval Suite + runs-on: ubuntu-latest + + # For pull_request_target, gate strictly on the evals label so the job + # doesn't fire for every other label event. 
+ if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'schedule' || + (github.event_name == 'pull_request_target' && github.event.label.name == 'evals') + + steps: + - uses: actions/checkout@v4 + with: + # For pull_request_target, check out the PR head so the eval runs + # against the proposed changes, not the base branch. + ref: >- + ${{ + github.event_name == 'pull_request_target' + && github.event.pull_request.head.sha + || github.sha + }} + + - uses: actions/setup-node@v4 + with: + node-version: 22 + cache: npm + + - name: Install dependencies + run: npm ci + + - name: Run evals + env: + RUN_LLM_EVALS: "1" + # Set ANTHROPIC_API_KEY to use Claude Haiku (preferred); fall back to + # OPENAI_API_KEY for GPT-4o-mini. Set EVAL_LITELLM_BASE_URL to route + # through a LiteLLM proxy instead of the direct OpenAI endpoint. + ANTHROPIC_API_KEY: ${{ secrets.EVAL_ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.EVAL_OPENAI_API_KEY }} + LITELLM_BASE_URL: ${{ secrets.EVAL_LITELLM_BASE_URL }} + # JSON array describing the Elastic cluster the MCP server targets. 
+ # Shape: [{"name":"primary","elasticsearchUrl":"...","kibanaUrl":"...","elasticsearchApiKey":"..."}] + CLUSTERS_JSON: ${{ secrets.EVAL_CLUSTERS_JSON }} + run: | + set -o pipefail + npm run test:evals 2>&1 | tee eval-output.txt + + - name: Post eval results to job summary + if: always() + run: | + if [ -f eval-output.txt ]; then + echo "## Eval results" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + cat eval-output.txt >> "$GITHUB_STEP_SUMMARY" + else + echo "## Eval results" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "_No eval output captured._" >> "$GITHUB_STEP_SUMMARY" + fi From ac864b8ad20b5c958979e0a9506f1097c3857390 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 12:02:27 +0200 Subject: [PATCH 16/42] =?UTF-8?q?docs:=20add=20evals.md=20=E2=80=94=20harn?= =?UTF-8?q?ess=20design,=20dataset=20shape,=20evaluator=20catalog,=20CI=20?= =?UTF-8?q?gating?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Covers: - Architecture diagram showing runner → runMcpHostLoop → evaluators pipeline - Key design choices table (in-process transport, skip-if guard, N/A semantics) - Dataset shape reference with all three optional expected fields documented - Positive vs distractor example pattern with runDataset code snippets - Evaluator catalog: type, score range, N/A condition, and recommended gate for all five evaluators (skill-activation, negative-activation, tool-selection, trajectory, criteria) - Step-by-step how-to-add-dataset guide with copy-paste templates - CI gating: workflow triggers, required secrets table, passing threshold table Co-Authored-By: Claude Sonnet 4.6 --- docs/evals.md | 260 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 docs/evals.md diff --git a/docs/evals.md b/docs/evals.md new file mode 100644 index 0000000..486af33 --- /dev/null +++ b/docs/evals.md @@ -0,0 +1,260 @@ +# Eval Harness + 
+LLM-powered evals for the Elastic Security MCP app's skill layer. The harness +tests whether the LLM host activates the right skill, calls the right tools in +the right order, and does not fire on unrelated queries. + +Regular `npm test` never touches this harness — it only runs when +`RUN_LLM_EVALS=1` is set, so CI stays fast and free of LLM costs. + +--- + +## Architecture + +``` +Dataset (examples) + │ + ▼ +runner.ts ─ describe.skipIf(!RUN_LLM_EVALS)(dataset.name, () => { + │ for each example: + │ trajectory = await runMcpHostLoop(input) + │ scores = await evaluators[*](trajectory, expected) + │ assert score >= passingScore + │ afterAll: print Markdown table to stdout + │ }) + │ + ├── runMcpHostLoop(input) + │ InMemoryTransport ─ Client ─ McpServer + │ LLM provider (Anthropic / OpenAI / LiteLLM) + │ loop ≤ MAX_TURNS=8: LLM → tool calls → results → repeat + │ returns Trajectory (ordered ToolCall[]) + │ + └── Evaluators + skill-activation binary: was skill tool called? + negative-activation binary: was skill tool correctly absent? 
+ tool-selection F1 precision/recall against expected.tools + trajectory LCS similarity of actual vs expected sequence + criteria LLM-as-judge against natural-language assertions +``` + +### Key design choices + +| Decision | Rationale | +|---|---| +| In-process via `InMemoryTransport` | No network, no server process — evals run anywhere | +| `describe.skipIf(!RUN_LLM_EVALS)` | Zero LLM cost in regular `npm test` | +| `Evaluator` is a plain function | Easy to compose; factory pattern for stateful evaluators (criteria) | +| `'N/A'` return instead of 0 | Datasets omit irrelevant evaluator dimensions without masking real regressions | +| LCS for trajectory | Order matters; set-based coverage is tool-selection's job | + +--- + +## Dataset shape + +A dataset is a `Dataset` object exported from a `*.dataset.ts` file: + +```typescript +import type { Dataset } from "../types.js"; + +export const myDataset: Dataset = { + name: "my-skill", + examples: [ + { + id: "ms-pos-01", // stable, unique — appears in CI summaries + input: "user message to the LLM", // the query sent to runMcpHostLoop + expected: { + skill: "entry-point-tool-name", // tool the skill SKILL.md instructs the LLM to call + tools: ["entry-point-tool-name"], // ordered list for trajectory/tool-selection + criteria: [ // natural-language assertions for LLM-as-judge + "The model called the correct entry-point tool", + ], + }, + }, + ], +}; +``` + +All three `expected` fields are **optional**: + +| Field | Evaluators that use it | Omit when… | +|---|---|---| +| `skill` | `skill-activation`, `negative-activation` | Dataset doesn't test skill routing | +| `tools` | `tool-selection`, `trajectory` | No ordered tool expectation | +| `criteria` | `criteria` | No LLM-as-judge needed (saves cost) | + +Omitting a field causes the evaluator to return `'N/A'` for that example rather than a false 0. + +### Positive vs distractor examples + +A **positive** example is a query that *should* activate the skill. 
+A **distractor** example is an unrelated query that *should not*. + +Use separate `runDataset` calls with different evaluators for each group: + +```typescript +// Positive: skill should fire +runDataset( + { name: "my-skill: positives", examples: positiveExamples }, + { "skill-activation": skillActivation, "tool-selection": toolSelection }, + { passingScore: 0.8 } +); + +// Distractor: skill must NOT fire (gate is 100%) +runDataset( + { name: "my-skill: distractors", examples: distractorExamples }, + { "negative-activation": negativeActivation }, + { passingScore: 1.0 } +); +``` + +--- + +## Evaluator catalog + +### `skill-activation` + +**Type**: binary · **Score**: `1` if `expected.skill` found in trajectory, `0` otherwise +**Returns `'N/A'`**: when `expected.skill` is absent +**Gate**: ≥ 0.8 on positive examples (use `passingScore: 0.8`) + +Tests whether the LLM called the skill's model-facing entry-point tool at +least once. + +### `negative-activation` + +**Type**: binary · **Score**: `1` if `expected.skill` is *absent* from trajectory, `0` if present +**Returns `'N/A'`**: when `expected.skill` is absent +**Gate**: 1.0 on distractor examples (use `passingScore: 1.0`) + +Tests that the skill does not over-trigger on unrelated queries. Any false +positive here means the skill's SKILL.md is too broad. + +### `tool-selection` + +**Type**: F1 · **Score**: harmonic mean of precision and recall against `expected.tools` (set-based) +**Returns `'N/A'`**: when `expected.tools` is absent +**Gate**: ≥ 0.8 on positive examples + +Tests *which* tools were called, ignoring order. Missed tools lower recall; +spurious tools lower precision. Failure reason includes `missed: [...]` and +`extra: [...]`. + +### `trajectory` + +**Type**: LCS similarity · **Score**: `lcs(actual, expected) / max(|actual|, |expected|)` +**Returns `'N/A'`**: when `expected.tools` is absent +**Gate**: ≥ 0.7 on positive examples (sequence matching is looser than set matching) + +Tests *order*. 
Dividing by `max` penalises both missing and extra steps. +Use alongside `tool-selection` for full coverage. + +### `criteria` + +**Type**: LLM-as-judge · **Score**: `0.0–1.0` parsed from a rubric prompt response +**Returns `'N/A'`**: when `expected.criteria` is absent +**Gate**: ≥ 0.7 + +Calls the judge LLM with the trajectory `{tool, args}` pairs and the +criteria list. Asks for `{"score": <0–1>, "reasoning": "..."}`. Falls back +to regex number extraction if JSON parse fails. Use for semantic assertions +that structural evaluators can't express. + +**Cost**: one extra LLM call per example. Omit `expected.criteria` to skip. + +--- + +## How to add a dataset + +1. **Create the data file** `evals/datasets/<skill-name>.dataset.ts`: + + ```typescript + import type { Dataset, Example } from "../types.js"; + + const SKILL_TOOL = "my-tool"; // the model-facing entry-point tool + + export const positiveExamples: Example[] = [ + { id: "ms-pos-01", input: "...", expected: { skill: SKILL_TOOL, tools: [SKILL_TOOL] } }, + // add ≥ 4 examples + ]; + + export const distractorExamples: Example[] = [ + { id: "ms-neg-01", input: "...", expected: { skill: SKILL_TOOL } }, + // add ≥ 4 examples + ]; + + export const myDataset: Dataset = { + name: "<skill-name>", + examples: [...positiveExamples, ...distractorExamples], + }; + ``` + +2.
**Create the eval spec** `evals/<skill-name>.eval.test.ts`: + + ```typescript + import { runDataset } from "./runner.js"; + import { positiveExamples, distractorExamples } from "./datasets/<skill-name>.dataset.js"; + import { skillActivation } from "./evaluators/skill-activation.js"; + import { negativeActivation } from "./evaluators/negative-activation.js"; + import { toolSelection } from "./evaluators/tool-selection.js"; + + runDataset( + { name: "<skill-name>: positives", examples: positiveExamples }, + { "skill-activation": skillActivation, "tool-selection": toolSelection }, + { passingScore: 0.8 } + ); + + runDataset( + { name: "<skill-name>: distractors", examples: distractorExamples }, + { "negative-activation": negativeActivation }, + { passingScore: 1.0 } + ); + ``` + +3. **Run locally**: + + ```bash + # Anthropic (preferred) + ANTHROPIC_API_KEY=sk-ant-... CLUSTERS_JSON='[{...}]' npm run test:evals + + # OpenAI / LiteLLM proxy + OPENAI_API_KEY=sk-... LITELLM_BASE_URL=https://... CLUSTERS_JSON='[{...}]' npm run test:evals + ``` + +4. **Trigger in CI**: open a PR and add the `evals` label (requires write access). + +--- + +## CI gating + +### Workflow: `.github/workflows/evals.yml` + +| Trigger | When | +|---|---| +| `workflow_dispatch` | Manual run from Actions UI | +| `schedule` | Nightly at 02:00 UTC | +| `pull_request_target` | When `evals` label is added to a PR | + +The concurrency group `evals-<ref>` cancels superseded runs to avoid wasting +LLM quota on stale pushes.
+ +### Required secrets + +| Secret | Purpose | +|---|---| +| `EVAL_ANTHROPIC_API_KEY` | Anthropic API key (priority provider) | +| `EVAL_OPENAI_API_KEY` | OpenAI / LiteLLM API key (fallback) | +| `EVAL_LITELLM_BASE_URL` | Optional LiteLLM proxy base URL | +| `EVAL_CLUSTERS_JSON` | Elastic cluster credentials for the MCP server | + +### Passing thresholds (recommended defaults) + +| Evaluator | Positives | Distractors | +|---|---|---| +| `skill-activation` | ≥ 0.8 | — | +| `negative-activation` | — | = 1.0 | +| `tool-selection` | ≥ 0.8 | — | +| `trajectory` | ≥ 0.7 | — | +| `criteria` | ≥ 0.7 | — | + +Results are posted as a Markdown table to the GitHub Actions job summary +(`$GITHUB_STEP_SUMMARY`) after every run. From e9a23fa184702423613da968d674ad707fed9459 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 12:08:54 +0200 Subject: [PATCH 17/42] feat: add MigrationsService wrapping 14 /internal/siem_migrations/* Kibana routes Service injects KibanaClient directly (no separate *Client indirection since these are internal-only Kibana routes with no public API equivalent). The KibanaClient already supplies x-elastic-internal-origin: Kibana; each method adds elastic-api-version: 2023-10-31 via MIGRATION_HEADERS per-request. 
14 methods, one per route: createMigration POST /internal/siem_migrations/rules listMigrations GET /internal/siem_migrations/rules getMigration GET /internal/siem_migrations/rules/:id deleteMigration DELETE /internal/siem_migrations/rules/:id uploadRules POST /internal/siem_migrations/rules/:id/rules getTranslatedRules GET /internal/siem_migrations/rules/:id/rules getTranslatedRule GET /internal/siem_migrations/rules/:id/rules/:ruleId updateTranslatedRule PUT /internal/siem_migrations/rules/:id/rules/:ruleId startTranslation POST /internal/siem_migrations/rules/:id/start stopTranslation POST /internal/siem_migrations/rules/:id/stop getResources GET /internal/siem_migrations/resources/:id upsertResources POST /internal/siem_migrations/resources/:id installRules POST /internal/siem_migrations/rules/:id/install getStats GET /internal/siem_migrations/rules/:id/stats MigrationApiError wraps every non-2xx response with typed status (extracted from the Kibana client's "Kibana [cluster] STATUS: body" error format) and the request path so callers can surface actionable error messages. Domain types: SiemMigration, TranslatedRule, MigrationResource, MigrationStats and associated option/result interfaces, all barrel-exported from service/index. 
Co-Authored-By: Claude Sonnet 4.6 --- src/elastic/service/index.ts | 15 + src/elastic/service/migrationsService.ts | 361 +++++++++++++++++++++++ 2 files changed, 376 insertions(+) create mode 100644 src/elastic/service/migrationsService.ts diff --git a/src/elastic/service/index.ts b/src/elastic/service/index.ts index 38671ee..3c6e574 100644 --- a/src/elastic/service/index.ts +++ b/src/elastic/service/index.ts @@ -19,3 +19,18 @@ export type { ScenarioRuleDef, } from "./sampleDataService.js"; export { SampleDataService, SCENARIO_NAMES, SCENARIO_RULES } from "./sampleDataService.js"; +export type { + SiemMigration, + TranslatedRule, + MigrationResource, + MigrationStats, + ListTranslatedRulesOptions, + ListTranslatedRulesResult, + InstallRulesOptions, + InstallRulesResult, +} from "./migrationsService.js"; +export { + MigrationApiError, + MigrationsService, + SIEM_MIGRATIONS_API_BASE, +} from "./migrationsService.js"; diff --git a/src/elastic/service/migrationsService.ts b/src/elastic/service/migrationsService.ts new file mode 100644 index 0000000..ffd0dd4 --- /dev/null +++ b/src/elastic/service/migrationsService.ts @@ -0,0 +1,361 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { KibanaClient } from "../kibana-client/index.js"; + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +export const SIEM_MIGRATIONS_API_BASE = "/internal/siem_migrations"; + +/** + * Per-request headers required by the Kibana internal SIEM migrations API. + * `x-elastic-internal-origin: Kibana` is pre-baked into `KibanaClient`; + * only the versioning header needs to be added on each call. 
+ */ +const MIGRATION_HEADERS = { + "elastic-api-version": "2023-10-31", +} as const; + +// --------------------------------------------------------------------------- +// Domain types +// --------------------------------------------------------------------------- + +export interface SiemMigration { + id: string; + name: string; + /** Lifecycle status of the migration. */ + status: "ready" | "running" | "finished" | "error"; + created_at: string; + last_updated_at: string; + rules: { + total: number; + pending: number; + processing: number; + completed: number; + failed: number; + installable: number; + installed: number; + partially_translated: number; + untranslatable: number; + }; +} + +export interface TranslatedRule { + id: string; + migration_id: string; + status: "pending" | "processing" | "completed" | "failed"; + translation_result?: "full" | "partial" | "untranslatable"; + elastic_rule?: Record; + original_rule: Record; + comments?: string[]; +} + +export interface MigrationResource { + type: "macro" | "lookup"; + name: string; + content: string; +} + +export interface MigrationStats { + id: string; + status: SiemMigration["status"]; + rules: SiemMigration["rules"]; +} + +export interface ListTranslatedRulesOptions { + readonly page?: number; + readonly perPage?: number; + readonly filter?: string; +} + +export interface ListTranslatedRulesResult { + data: TranslatedRule[]; + total: number; +} + +export interface InstallRulesOptions { + /** Specific rule IDs to install; omit to install all installable rules. */ + ids?: string[]; +} + +export interface InstallRulesResult { + installed: number; + failed: number; +} + +// --------------------------------------------------------------------------- +// Typed error +// --------------------------------------------------------------------------- + +/** + * Thrown by every {@link MigrationsService} method on a non-2xx response. 
/**
 * Thrown by every {@link MigrationsService} method on a non-2xx response.
 *
 * The Kibana client's response interceptor formats AxiosErrors as
 * `"Kibana [<name>] <status>: <detail>"` before they reach here, so
 * `status` is extracted from that message when available.
 *
 * Extraction is two-stage:
 *  1. Prefer the status that directly follows the `Kibana [<name>]` prefix,
 *     so digits inside the cluster name or the response detail cannot be
 *     mistaken for the HTTP status (e.g. `Kibana [prod-200] 404: ...`).
 *  2. Otherwise fall back to the first standalone three-digit number in the
 *     1xx-5xx range, which keeps messages in other shapes working.
 *
 * `status` is `0` when no status-like number is present at all.
 */
export class MigrationApiError extends Error {
  /** Status in the documented Kibana client format: `Kibana [name] STATUS`. */
  private static readonly PREFIXED_STATUS = /Kibana \[[^\]]*\]\s+([1-5]\d{2})\b/;
  /** Fallback: first standalone 1xx-5xx number anywhere in the message. */
  private static readonly LOOSE_STATUS = /\b([1-5]\d{2})\b/;

  /** HTTP status parsed from the underlying error message, or `0` if none was found. */
  readonly status: number;
  /** Request path that produced the failure. */
  readonly path: string;

  /**
   * @param path  Request path that was being called when the failure occurred.
   * @param cause The caught value; its `message` is used when it is an `Error`,
   *              otherwise it is stringified.
   */
  constructor(path: string, cause: unknown) {
    const causeMsg = cause instanceof Error ? cause.message : String(cause);

    super(`SIEM Migrations API error on ${path}: ${causeMsg}`);
    this.name = "MigrationApiError";
    this.status = MigrationApiError.parseStatus(causeMsg);
    this.path = path;
    // Keep the original error reachable for callers that inspect `cause`.
    if (cause instanceof Error) {
      this.cause = cause;
    }
  }

  /** Extracts the HTTP status from `message`; returns `0` when none can be found. */
  private static parseStatus(message: string): number {
    const match =
      MigrationApiError.PREFIXED_STATUS.exec(message) ??
      MigrationApiError.LOOSE_STATUS.exec(message);
    return match ? parseInt(match[1], 10) : 0;
  }
}

// ---------------------------------------------------------------------------
// Service
// ---------------------------------------------------------------------------

interface MigrationsServiceOptions {
  readonly kibanaClient: KibanaClient;
}

/**
 * Thin wrapper over the 14 `/internal/siem_migrations/*` Kibana routes.
 *
 * Every method adds `elastic-api-version: 2023-10-31`; the underlying
 * {@link KibanaClient} supplies `x-elastic-internal-origin: Kibana` and
 * authentication on every request. Non-2xx responses are re-thrown as
 * {@link MigrationApiError}.
 */
+ */ +export class MigrationsService { + private readonly client: KibanaClient; + + constructor(options: MigrationsServiceOptions) { + this.client = options.kibanaClient; + } + + // ── Migration lifecycle ────────────────────────────────────────────────── + + /** POST /internal/siem_migrations/rules */ + async createMigration(name: string): Promise<{ migration_id: string }> { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules`; + try { + const { data } = await this.client.post<{ migration_id: string }>( + path, + { name }, + { headers: MIGRATION_HEADERS } + ); + return data; + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + /** GET /internal/siem_migrations/rules */ + async listMigrations(): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules`; + try { + const { data } = await this.client.get(path, { + headers: MIGRATION_HEADERS, + }); + return data; + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + /** GET /internal/siem_migrations/rules/:migrationId */ + async getMigration(migrationId: string): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}`; + try { + const { data } = await this.client.get(path, { + headers: MIGRATION_HEADERS, + }); + return data; + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + /** DELETE /internal/siem_migrations/rules/:migrationId */ + async deleteMigration(migrationId: string): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}`; + try { + await this.client.delete(path, { headers: MIGRATION_HEADERS }); + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + // ── Splunk rule upload ─────────────────────────────────────────────────── + + /** POST /internal/siem_migrations/rules/:migrationId/rules */ + async uploadRules( + migrationId: string, + rules: Record[] + ): Promise<{ total: number }> { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules`; + try { + const { data } = await 
this.client.post<{ total: number }>( + path, + rules, + { headers: MIGRATION_HEADERS } + ); + return data; + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + // ── Translated rules ───────────────────────────────────────────────────── + + /** GET /internal/siem_migrations/rules/:migrationId/rules */ + async getTranslatedRules( + migrationId: string, + options: ListTranslatedRulesOptions = {} + ): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules`; + const params: Record = { + page: String(options.page ?? 1), + per_page: String(options.perPage ?? 20), + }; + if (options.filter) params.filter = options.filter; + + try { + const { data } = await this.client.get(path, { + params, + headers: MIGRATION_HEADERS, + }); + return data; + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + /** GET /internal/siem_migrations/rules/:migrationId/rules/:ruleId */ + async getTranslatedRule( + migrationId: string, + ruleId: string + ): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules/${ruleId}`; + try { + const { data } = await this.client.get(path, { + headers: MIGRATION_HEADERS, + }); + return data; + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + /** PUT /internal/siem_migrations/rules/:migrationId/rules/:ruleId */ + async updateTranslatedRule( + migrationId: string, + ruleId: string, + updates: Partial> + ): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules/${ruleId}`; + try { + const { data } = await this.client.put(path, updates, { + headers: MIGRATION_HEADERS, + }); + return data; + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + // ── Translation control ────────────────────────────────────────────────── + + /** POST /internal/siem_migrations/rules/:migrationId/start */ + async startTranslation(migrationId: string): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/start`; + 
try { + await this.client.post(path, {}, { headers: MIGRATION_HEADERS }); + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + /** POST /internal/siem_migrations/rules/:migrationId/stop */ + async stopTranslation(migrationId: string): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/stop`; + try { + await this.client.post(path, {}, { headers: MIGRATION_HEADERS }); + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + // ── Resources ──────────────────────────────────────────────────────────── + + /** GET /internal/siem_migrations/resources/:migrationId */ + async getResources(migrationId: string): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/resources/${migrationId}`; + try { + const { data } = await this.client.get(path, { + headers: MIGRATION_HEADERS, + }); + return data; + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + /** POST /internal/siem_migrations/resources/:migrationId */ + async upsertResources( + migrationId: string, + resources: MigrationResource[] + ): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/resources/${migrationId}`; + try { + await this.client.post(path, resources, { headers: MIGRATION_HEADERS }); + } catch (err) { + throw new MigrationApiError(path, err); + } + } + + // ── Installation ───────────────────────────────────────────────────────── + + /** POST /internal/siem_migrations/rules/:migrationId/install */ + async installRules( + migrationId: string, + options: InstallRulesOptions = {} + ): Promise { + const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/install`; + try { + const { data } = await this.client.post( + path, + options.ids ? 
{ ids: options.ids } : {},
+        { headers: MIGRATION_HEADERS }
+      );
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Stats ────────────────────────────────────────────────────────────────
+
+  /**
+   * GET /internal/siem_migrations/rules/:migrationId/stats
+   *
+   * The migration ID is URL-encoded before being placed in the path.
+   *
+   * @returns the `data` of the client response.
+   * @throws MigrationApiError built from the path and the client's rejection.
+   */
+  async getStats(migrationId: string): Promise {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${encodeURIComponent(migrationId)}/stats`;
+    try {
+      const { data } = await this.client.get(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+}
From 2a4ec7d1ab9eacb5c7ad90b8664cd29295c6b372 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 12:13:58 +0200 Subject: [PATCH 18/42] test: add MigrationsService tests covering all 14 route methods and error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 19 tests across 14 describe blocks — one per route method plus three error-handling tests: Migration lifecycle: createMigration, listMigrations, getMigration, deleteMigration Rule upload: uploadRules Translated rules: getTranslatedRules (default+custom pagination), getTranslatedRule, updateTranslatedRule Translation control: startTranslation, stopTranslation Resources: getResources, upsertResources Installation: installRules (no-ids + with-ids) Stats: getStats MigrationApiError: status parsed from Kibana error format; status=0 fallback; all mutating methods surface MigrationApiError Also adds `put: vi.fn()` to MockHttpClient / makeMock in mockHttpClient.ts so MigrationsService.updateTranslatedRule can be exercised. 
Co-Authored-By: Claude Sonnet 4.6 --- src/elastic/service/migrationsService.test.ts | 329 ++++++++++++++++++ src/test/helpers/mockHttpClient.ts | 2 + 2 files changed, 331 insertions(+) create mode 100644 src/elastic/service/migrationsService.test.ts diff --git a/src/elastic/service/migrationsService.test.ts b/src/elastic/service/migrationsService.test.ts new file mode 100644 index 0000000..0c184e7 --- /dev/null +++ b/src/elastic/service/migrationsService.test.ts @@ -0,0 +1,329 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { describe, it, expect, beforeEach } from "vitest"; +import { + MigrationsService, + MigrationApiError, + SIEM_MIGRATIONS_API_BASE, +} from "./migrationsService.js"; +import type { KibanaClient } from "../kibana-client/index.js"; +import { + createMockKibanaClient, + dataEnvelope, + type MockHttpClient, +} from "../../test/helpers/mockHttpClient.js"; +import type { SiemMigration, TranslatedRule, MigrationResource } from "./migrationsService.js"; + +const BASE = SIEM_MIGRATIONS_API_BASE; +const HEADERS = { headers: { "elastic-api-version": "2023-10-31" } }; + +const MIGRATION_ID = "migration-1"; +const RULE_ID = "rule-1"; + +const fakeMigration: SiemMigration = { + id: MIGRATION_ID, + name: "test-migration", + status: "ready", + created_at: "2026-01-01T00:00:00Z", + last_updated_at: "2026-01-01T00:00:00Z", + rules: { + total: 0, pending: 0, processing: 0, completed: 0, failed: 0, + installable: 0, installed: 0, partially_translated: 0, untranslatable: 0, + }, +}; + +const fakeRule: TranslatedRule = { + id: RULE_ID, + migration_id: MIGRATION_ID, + status: "completed", + translation_result: "full", + original_rule: { name: "splunk-rule" }, +}; + +const fakeResource: MigrationResource = { + type: "macro", + name: "my_macro", + 
content: "| where true", +}; + +describe("MigrationsService", () => { + let kibanaClient: KibanaClient & MockHttpClient; + let service: MigrationsService; + + beforeEach(() => { + kibanaClient = createMockKibanaClient(); + service = new MigrationsService({ kibanaClient }); + }); + + // ── Migration lifecycle ──────────────────────────────────────────────────── + + describe("createMigration", () => { + it("POSTs to /rules with the migration name and returns migration_id", async () => { + kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ migration_id: MIGRATION_ID })); + + const result = await service.createMigration("My Migration"); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules`, + { name: "My Migration" }, + HEADERS + ); + expect(result).toEqual({ migration_id: MIGRATION_ID }); + }); + }); + + describe("listMigrations", () => { + it("GETs /rules and returns the array", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope([fakeMigration])); + + const result = await service.listMigrations(); + + expect(kibanaClient.get).toHaveBeenCalledWith(`${BASE}/rules`, HEADERS); + expect(result).toEqual([fakeMigration]); + }); + }); + + describe("getMigration", () => { + it("GETs /rules/:migrationId and returns the migration", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope(fakeMigration)); + + const result = await service.getMigration(MIGRATION_ID); + + expect(kibanaClient.get).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}`, + HEADERS + ); + expect(result).toEqual(fakeMigration); + }); + }); + + describe("deleteMigration", () => { + it("DELETEs /rules/:migrationId", async () => { + await service.deleteMigration(MIGRATION_ID); + + expect(kibanaClient.delete).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}`, + HEADERS + ); + }); + }); + + // ── Rule upload ──────────────────────────────────────────────────────────── + + describe("uploadRules", () => { + it("POSTs rules array to 
/rules/:migrationId/rules and returns totals", async () => { + kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ total: 5 })); + const splunkRules = [{ search: "index=main" }, { search: "index=security" }]; + + const result = await service.uploadRules(MIGRATION_ID, splunkRules); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/rules`, + splunkRules, + HEADERS + ); + expect(result).toEqual({ total: 5 }); + }); + }); + + // ── Translated rules ─────────────────────────────────────────────────────── + + describe("getTranslatedRules", () => { + it("GETs /rules/:migrationId/rules with default pagination", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope({ data: [fakeRule], total: 1 })); + + const result = await service.getTranslatedRules(MIGRATION_ID); + + const [path, config] = kibanaClient.get.mock.calls[0] as [string, Record]; + expect(path).toBe(`${BASE}/rules/${MIGRATION_ID}/rules`); + expect(config.params).toMatchObject({ page: "1", per_page: "20" }); + expect(result).toEqual({ data: [fakeRule], total: 1 }); + }); + + it("forwards custom page, perPage and filter params", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope({ data: [], total: 0 })); + + await service.getTranslatedRules(MIGRATION_ID, { page: 2, perPage: 50, filter: "status:completed" }); + + const [, config] = kibanaClient.get.mock.calls[0] as [string, Record]; + expect(config.params).toEqual({ page: "2", per_page: "50", filter: "status:completed" }); + }); + }); + + describe("getTranslatedRule", () => { + it("GETs /rules/:migrationId/rules/:ruleId", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope(fakeRule)); + + const result = await service.getTranslatedRule(MIGRATION_ID, RULE_ID); + + expect(kibanaClient.get).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/rules/${RULE_ID}`, + HEADERS + ); + expect(result).toEqual(fakeRule); + }); + }); + + describe("updateTranslatedRule", () => { + it("PUTs 
updates to /rules/:migrationId/rules/:ruleId and returns the updated rule", async () => { + const updated = { ...fakeRule, translation_result: "partial" as const }; + kibanaClient.put.mockResolvedValueOnce(dataEnvelope(updated)); + + const result = await service.updateTranslatedRule(MIGRATION_ID, RULE_ID, { + translation_result: "partial", + }); + + expect(kibanaClient.put).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/rules/${RULE_ID}`, + { translation_result: "partial" }, + HEADERS + ); + expect(result).toEqual(updated); + }); + }); + + // ── Translation control ──────────────────────────────────────────────────── + + describe("startTranslation", () => { + it("POSTs to /rules/:migrationId/start", async () => { + await service.startTranslation(MIGRATION_ID); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/start`, + {}, + HEADERS + ); + }); + }); + + describe("stopTranslation", () => { + it("POSTs to /rules/:migrationId/stop", async () => { + await service.stopTranslation(MIGRATION_ID); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/stop`, + {}, + HEADERS + ); + }); + }); + + // ── Resources ────────────────────────────────────────────────────────────── + + describe("getResources", () => { + it("GETs /resources/:migrationId and returns the array", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope([fakeResource])); + + const result = await service.getResources(MIGRATION_ID); + + expect(kibanaClient.get).toHaveBeenCalledWith( + `${BASE}/resources/${MIGRATION_ID}`, + HEADERS + ); + expect(result).toEqual([fakeResource]); + }); + }); + + describe("upsertResources", () => { + it("POSTs resources array to /resources/:migrationId", async () => { + await service.upsertResources(MIGRATION_ID, [fakeResource]); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/resources/${MIGRATION_ID}`, + [fakeResource], + HEADERS + ); + }); + }); + + // ── Installation 
─────────────────────────────────────────────────────────── + + describe("installRules", () => { + it("POSTs empty body to /rules/:migrationId/install when no ids given", async () => { + kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ installed: 3, failed: 0 })); + + const result = await service.installRules(MIGRATION_ID); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/install`, + {}, + HEADERS + ); + expect(result).toEqual({ installed: 3, failed: 0 }); + }); + + it("includes ids in the body when provided", async () => { + kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ installed: 1, failed: 0 })); + + await service.installRules(MIGRATION_ID, { ids: ["r1", "r2"] }); + + const [, body] = kibanaClient.post.mock.calls[0] as [string, Record]; + expect(body).toEqual({ ids: ["r1", "r2"] }); + }); + }); + + // ── Stats ────────────────────────────────────────────────────────────────── + + describe("getStats", () => { + it("GETs /rules/:migrationId/stats and returns the stats", async () => { + const stats = { id: MIGRATION_ID, status: "ready" as const, rules: fakeMigration.rules }; + kibanaClient.get.mockResolvedValueOnce(dataEnvelope(stats)); + + const result = await service.getStats(MIGRATION_ID); + + expect(kibanaClient.get).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/stats`, + HEADERS + ); + expect(result).toEqual(stats); + }); + }); + + // ── MigrationApiError ────────────────────────────────────────────────────── + + describe("MigrationApiError", () => { + it("wraps non-2xx with status parsed from Kibana error format", async () => { + const path = `${BASE}/rules/${MIGRATION_ID}`; + kibanaClient.get.mockRejectedValue( + new Error("Kibana [test-cluster] 404: migration not found") + ); + + await expect(service.getMigration(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.getMigration(MIGRATION_ID)).rejects.toMatchObject({ + status: 404, + path, + message: 
expect.stringContaining(path) as string, + }); + }); + + it("sets status 0 when error message has no HTTP status code", async () => { + kibanaClient.get.mockRejectedValueOnce(new Error("network timeout")); + + const err = await service.getMigration(MIGRATION_ID).catch((e) => e as MigrationApiError); + expect(err).toBeInstanceOf(MigrationApiError); + expect(err.status).toBe(0); + }); + + it("surfaces a MigrationApiError from every mutating method", async () => { + const netErr = new Error("Kibana [test-cluster] 503: service unavailable"); + + kibanaClient.post.mockRejectedValue(netErr); + kibanaClient.put.mockRejectedValue(netErr); + kibanaClient.delete.mockRejectedValue(netErr); + + await expect(service.createMigration("x")).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.uploadRules(MIGRATION_ID, [])).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.startTranslation(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.stopTranslation(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.upsertResources(MIGRATION_ID, [])).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.installRules(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.updateTranslatedRule(MIGRATION_ID, RULE_ID, {})).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.deleteMigration(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + }); + }); +}); diff --git a/src/test/helpers/mockHttpClient.ts b/src/test/helpers/mockHttpClient.ts index b843524..f640f2c 100644 --- a/src/test/helpers/mockHttpClient.ts +++ b/src/test/helpers/mockHttpClient.ts @@ -17,6 +17,7 @@ import type { KibanaClient } from "../../elastic/kibana-client/kibana-client.js" export interface MockHttpClient { get: Mock; post: Mock; + put: Mock; patch: Mock; delete: Mock; clusterName: string; @@ -48,6 +49,7 @@ function makeMock(clusterName: string): MockHttpClient { return { get: 
vi.fn().mockResolvedValue({ data: undefined }), post: vi.fn().mockResolvedValue({ data: undefined }), + put: vi.fn().mockResolvedValue({ data: undefined }), patch: vi.fn().mockResolvedValue({ data: undefined }), delete: vi.fn().mockResolvedValue({ data: undefined }), clusterName, From 16e4d062d9434c14cb544ff249f641dcc7edaeb7 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 12:17:08 +0200 Subject: [PATCH 19/42] feat: register migration tools (1 model-facing + 10 app-only) migrate-rules (model-facing): _meta.ui.resourceUri = ui://migrate-rules/mcp-app.html Callback seeds the workbench with a compact migration list so the LLM gets immediate context. App-only tools (_meta.ui.visibility: ["app"]): list-migrations GET all migrations get-migration GET single migration by ID get-translated-rules paginated translated rule listing (vendor-gated) start-translation kick off AI translation (vendor-gated) stop-translation halt in-progress translation (vendor-gated) update-translated-rule patch elastic_rule / translation_result / comments (vendor-gated) get-resources list macros/lookups (vendor-gated) upsert-resource create/replace single macro or lookup (vendor-gated) install-rules install translated rules, optional id filter (vendor-gated) get-stats per-migration translation/installation stats Vendor gate: SUPPORTED_VENDORS = ["splunk"]. If a vendor param is provided and not in the list, returns { error: "vendorNotSupported", vendor } without hitting Kibana. Re-enabling a vendor is a one-line change to the constant. Also registers the migration workbench HTML via registerAppResource; the view file is resolved at request time (resolveViewPath("migration")) so the tool works once the view is built in a subsequent commit. 
Co-Authored-By: Claude Sonnet 4.6 --- src/tools/migration.ts | 353 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 src/tools/migration.ts diff --git a/src/tools/migration.ts b/src/tools/migration.ts new file mode 100644 index 0000000..5502bd2 --- /dev/null +++ b/src/tools/migration.ts @@ -0,0 +1,353 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { + registerAppTool, + registerAppResource, + RESOURCE_MIME_TYPE, +} from "@modelcontextprotocol/ext-apps/server"; +import { z } from "zod"; +import fs from "fs"; +import type { MigrationsService } from "../elastic/service/index.js"; +import { resolveViewPath } from "./view-path.js"; + +const RESOURCE_URI = "ui://migrate-rules/mcp-app.html"; + +/** + * Vendors for which the Kibana SIEM migrations translator is production-ready. + * Re-enabling a vendor is a one-line change to this array once the translator + * matures — QRadar and Sentinel-One are the next candidates. + */ +const SUPPORTED_VENDORS: readonly string[] = ["splunk"]; + +export interface MigrationToolDeps { + readonly migrationsService: MigrationsService; +} + +/** Returns a vendor-gate error response for app-only tools. */ +function vendorNotSupportedResponse(vendor: string) { + return { + content: [ + { + type: "text" as const, + text: JSON.stringify({ error: "vendorNotSupported", vendor }), + }, + ], + }; +} + +/** Returns true when `vendor` is explicitly provided but not in SUPPORTED_VENDORS. 
*/
+function isUnsupportedVendor(vendor: string | undefined): vendor is string {
+  // An omitted vendor is never gated.
+  if (vendor === undefined) return false;
+  // Compare case-insensitively and ignore surrounding whitespace so spellings
+  // such as "Splunk" are treated the same as "splunk".
+  const normalized = vendor.trim().toLowerCase();
+  return !SUPPORTED_VENDORS.some((supported) => supported.toLowerCase() === normalized);
+}
+
+/**
+ * Registers the SIEM migration tools on `server`: the `migrate-rules`
+ * entry-point first, then the app-only tools. Every callback delegates to
+ * `deps.migrationsService` and returns its result as JSON text content.
+ */
+export function registerMigrationTools(
+  server: McpServer,
+  deps: MigrationToolDeps
+) {
+  const { migrationsService } = deps;
+
+  // ── Model-facing entry-point ───────────────────────────────────────────────
+
+  registerAppTool(
+    server,
+    "migrate-rules",
+    {
+      title: "Migrate Rules",
+      description:
+        "Migrate detection rules from Splunk (and other SIEMs) to Elastic Security. " +
+        "Opens an interactive migration workbench for uploading, translating, reviewing, " +
+        "and installing rules. Vendor support: Splunk (active), QRadar / Sentinel-One (coming soon).",
+      inputSchema: {},
+      _meta: { ui: { resourceUri: RESOURCE_URI } },
+    },
+    async () => {
+      const migrations = await migrationsService.listMigrations();
+      return {
+        content: [
+          {
+            type: "text" as const,
+            text: JSON.stringify({
+              message: "Opening SIEM migration workbench",
+              // Only id, name and status are returned; the other migration fields are dropped.
+              migrations: migrations.map(({ id, name, status }) => ({ id, name, status })),
+            }),
+          },
+        ],
+      };
+    }
+  );
+
+  // ── App-only tools ─────────────────────────────────────────────────────────
+
+  registerAppTool(
+    server,
+    "list-migrations",
+    {
+      title: "List Migrations",
+      description: "List all SIEM rule migrations.",
+      inputSchema: {},
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async () => {
+      const migrations = await migrationsService.listMigrations();
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(migrations) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "get-migration",
+    {
+      title: "Get Migration",
+      description: "Get details for a specific SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId }) => {
+      const migration = await migrationsService.getMigration(migrationId);
+      return {
+        content: [{ type: "text" as const, text: 
JSON.stringify(migration) }],
+      };
+    }
+  );
+
+  // Paginated listing of translated rules; rejected up front for unsupported vendors.
+  registerAppTool(
+    server,
+    "get-translated-rules",
+    {
+      title: "Get Translated Rules",
+      description: "Get translated rules for a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+        // Fractional or negative paging values are rejected by the schema
+        // instead of being forwarded to the service.
+        page: z
+          .number()
+          .int()
+          .nonnegative()
+          .optional()
+          .describe("Page number (whole, non-negative). Omit to use the service default."),
+        perPage: z
+          .number()
+          .int()
+          .positive()
+          .optional()
+          .describe("Rules per page (whole, at least 1). Omit to use the service default."),
+        filter: z.string().optional(),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor, page, perPage, filter }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      const result = await migrationsService.getTranslatedRules(migrationId, {
+        page,
+        perPage,
+        filter,
+      });
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(result) }],
+      };
+    }
+  );
+
+  // Starts translation and reports { status: "started" } once the service call resolves.
+  registerAppTool(
+    server,
+    "start-translation",
+    {
+      title: "Start Translation",
+      description: "Start the AI translation process for a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      await migrationsService.startTranslation(migrationId);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify({ status: "started" }) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "stop-translation",
+    {
+      title: "Stop Translation",
+      description: "Stop the AI translation process for a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). 
Non-Splunk returns an error."),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      await migrationsService.stopTranslation(migrationId);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify({ status: "stopped" }) }],
+      };
+    }
+  );
+
+  // Applies manual edits to one translated rule. Only the fields that were
+  // actually provided are forwarded to the service.
+  registerAppTool(
+    server,
+    "update-translated-rule",
+    {
+      title: "Update Translated Rule",
+      description: "Update a translated rule in a SIEM migration (e.g. fix its Elastic rule JSON).",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        ruleId: z.string().describe("Translated rule ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+        elasticRule: z
+          .string()
+          .optional()
+          .describe("JSON-encoded Elastic rule updates"),
+        translationResult: z
+          .enum(["full", "partial", "untranslatable"])
+          .optional(),
+        comments: z.array(z.string()).optional(),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, ruleId, vendor, elasticRule, translationResult, comments }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+
+      // Same response shape as the vendor gate, so the caller can branch on `error`.
+      const invalidElasticRule = (message: string) => ({
+        content: [
+          {
+            type: "text" as const,
+            text: JSON.stringify({ error: "invalidElasticRule", message }),
+          },
+        ],
+      });
+
+      const updates: Record = {};
+      if (elasticRule !== undefined) {
+        // `elasticRule` arrives as a string: reject malformed JSON and anything
+        // that is not a plain object before the service is called.
+        let parsedRule: unknown;
+        try {
+          parsedRule = JSON.parse(elasticRule);
+        } catch (err) {
+          return invalidElasticRule(err instanceof Error ? err.message : String(err));
+        }
+        if (parsedRule === null || typeof parsedRule !== "object" || Array.isArray(parsedRule)) {
+          return invalidElasticRule("elasticRule must be a JSON object");
+        }
+        updates.elastic_rule = parsedRule;
+      }
+      if (translationResult !== undefined) updates.translation_result = translationResult;
+      if (comments !== undefined) updates.comments = comments;
+      const result = await migrationsService.updateTranslatedRule(migrationId, ruleId, updates);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(result) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "get-resources",
+    {
+      title: "Get Resources",
+      description: "Get macro/lookup resources for a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). 
Non-Splunk returns an error."),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    // Vendor gate first; otherwise the migration's resources are returned as JSON text.
+    async ({ migrationId, vendor }) => {
+      if (isUnsupportedVendor(vendor)) {
+        return vendorNotSupportedResponse(vendor);
+      }
+      const text = JSON.stringify(await migrationsService.getResources(migrationId));
+      return { content: [{ type: "text" as const, text }] };
+    }
+  );
+
+  // Creates or replaces one macro/lookup for a migration.
+  registerAppTool(
+    server,
+    "upsert-resource",
+    {
+      title: "Upsert Resource",
+      description: "Create or update a macro/lookup resource in a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+        type: z.enum(["macro", "lookup"]).describe("Resource type"),
+        name: z.string().describe("Resource name"),
+        content: z.string().describe("Resource content"),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor, type, name, content }) => {
+      if (isUnsupportedVendor(vendor)) {
+        return vendorNotSupportedResponse(vendor);
+      }
+      // The service takes an array; this tool always sends exactly one resource.
+      const resource = { type, name, content };
+      await migrationsService.upsertResources(migrationId, [resource]);
+      const text = JSON.stringify({ status: "ok" });
+      return { content: [{ type: "text" as const, text }] };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "install-rules",
+    {
+      title: "Install Rules",
+      description: "Install translated rules from a SIEM migration into Elastic Security.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+        ids: z
+          .array(z.string())
+          .optional()
+          .describe("Specific rule IDs to install. 
Omit to install all installable rules."), + }, + _meta: { ui: { visibility: ["app"] } }, + }, + async ({ migrationId, vendor, ids }) => { + if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor); + const result = await migrationsService.installRules(migrationId, { ids }); + return { + content: [{ type: "text" as const, text: JSON.stringify(result) }], + }; + } + ); + + registerAppTool( + server, + "get-stats", + { + title: "Get Stats", + description: "Get translation and installation statistics for a SIEM migration.", + inputSchema: { + migrationId: z.string().describe("Migration ID"), + }, + _meta: { ui: { visibility: ["app"] } }, + }, + async ({ migrationId }) => { + const stats = await migrationsService.getStats(migrationId); + return { + content: [{ type: "text" as const, text: JSON.stringify(stats) }], + }; + } + ); + + // ── App resource (HTML workbench) ────────────────────────────────────────── + + const viewPath = resolveViewPath("migration"); + registerAppResource( + server, + RESOURCE_URI, + RESOURCE_URI, + { mimeType: RESOURCE_MIME_TYPE }, + async () => { + const html = fs.readFileSync(viewPath, "utf-8"); + return { + contents: [{ uri: RESOURCE_URI, mimeType: RESOURCE_MIME_TYPE, text: html }], + }; + } + ); +} From 6b9c8bce468a8097f48bf7cc5c12986ced10cbe1 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 12:21:47 +0200 Subject: [PATCH 20/42] test: add migration tool tests (tool registrations + vendor gating) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 20 tests covering: Registration: all 11 tools + HTML resource registered under the correct names migrate-rules: workbench message + compact migration list returned to LLM app-only tool happy paths: list-migrations, get-migration, get-translated-rules (with pagination), start-translation, stop-translation, update-translated-rule (parses elasticRule JSON), get-resources, upsert-resource (single-element array), 
install-rules (with ids), get-stats Vendor gating (per gated tool): - vendor="qradar" / "sentinel-one" / unknown → { error: "vendorNotSupported" } without calling the service - vendor absent → proceeds (defaults to Splunk path) get-stats has no vendor gate — confirmed by calling without vendor Also adds createMockMigrationsService() to mockServices.ts covering all 14 MigrationsService methods. Co-Authored-By: Claude Sonnet 4.6 --- src/test/helpers/mockServices.ts | 20 ++ src/tools/migration.test.ts | 408 +++++++++++++++++++++++++++++++ 2 files changed, 428 insertions(+) create mode 100644 src/tools/migration.test.ts diff --git a/src/test/helpers/mockServices.ts b/src/test/helpers/mockServices.ts index bb77c48..819e95c 100644 --- a/src/test/helpers/mockServices.ts +++ b/src/test/helpers/mockServices.ts @@ -13,6 +13,7 @@ import type { EntityDetailService } from "../../elastic/service/entityDetailServ import type { EsqlService } from "../../elastic/service/esqlService.js"; import type { IndicesService } from "../../elastic/service/indicesService.js"; import type { InvestigateService } from "../../elastic/service/investigateService.js"; +import type { MigrationsService } from "../../elastic/service/migrationsService.js"; import type { RulesService } from "../../elastic/service/rulesService.js"; import type { SampleDataService } from "../../elastic/service/sampleDataService.js"; @@ -99,6 +100,25 @@ export function createMockRulesService(): RulesService { ]); } +export function createMockMigrationsService(): MigrationsService { + return mockService([ + "createMigration", + "listMigrations", + "getMigration", + "deleteMigration", + "uploadRules", + "getTranslatedRules", + "getTranslatedRule", + "updateTranslatedRule", + "startTranslation", + "stopTranslation", + "getResources", + "upsertResources", + "installRules", + "getStats", + ]); +} + export function createMockSampleDataService(): SampleDataService { return mockService([ "generateSampleData", diff --git 
a/src/tools/migration.test.ts b/src/tools/migration.test.ts new file mode 100644 index 0000000..7193075 --- /dev/null +++ b/src/tools/migration.test.ts @@ -0,0 +1,408 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { describe, it, expect, vi, beforeEach } from "vitest"; +import fs from "fs"; +import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; + +import { registerMigrationTools } from "./migration.js"; +import { + createMockMcpServer, + parseToolText, + type MockMcpServer, +} from "../test/helpers/mockMcpServer.js"; +import { createMockMigrationsService } from "../test/helpers/mockServices.js"; +import type { MigrationsService } from "../elastic/service/index.js"; + +const RESOURCE_URI = "ui://migrate-rules/mcp-app.html"; +const MIGRATION_ID = "m-1"; +const RULE_ID = "r-1"; + +function setup() { + const server = createMockMcpServer(); + const migrationsService = createMockMigrationsService(); + vi.spyOn(fs, "existsSync").mockReturnValue(false); + vi.spyOn(fs, "readFileSync").mockReturnValue("migration"); + registerMigrationTools(server as unknown as McpServer, { migrationsService }); + return { server, migrationsService }; +} + +describe("registerMigrationTools", () => { + let server: MockMcpServer; + let migrationsService: MigrationsService; + + beforeEach(() => { + ({ server, migrationsService } = setup()); + }); + + // ── Registration ─────────────────────────────────────────────────────────── + + it("registers all 11 tools and the HTML resource", () => { + expect([...server.tools.keys()].sort()).toEqual( + [ + "migrate-rules", + "list-migrations", + "get-migration", + "get-translated-rules", + "start-translation", + "stop-translation", + "update-translated-rule", + "get-resources", + "upsert-resource", + 
"install-rules", + "get-stats", + ].sort() + ); + expect([...server.resources.keys()]).toEqual([RESOURCE_URI]); + }); + + // ── migrate-rules (model-facing) ─────────────────────────────────────────── + + describe("migrate-rules", () => { + it("returns a compact migration list for the LLM to see", async () => { + vi.mocked(migrationsService.listMigrations).mockResolvedValueOnce([ + { + id: MIGRATION_ID, + name: "Splunk prod", + status: "ready", + created_at: "2026-01-01T00:00:00Z", + last_updated_at: "2026-01-01T00:00:00Z", + rules: { + total: 10, pending: 5, processing: 0, completed: 5, failed: 0, + installable: 5, installed: 0, partially_translated: 0, untranslatable: 0, + }, + }, + ]); + + const out = parseToolText<{ message: string; migrations: unknown[] }>( + await server.tool("migrate-rules").callback({}) + ); + + expect(out.message).toContain("workbench"); + expect(out.migrations).toHaveLength(1); + expect(out.migrations[0]).toMatchObject({ id: MIGRATION_ID, name: "Splunk prod" }); + }); + }); + + // ── list-migrations ──────────────────────────────────────────────────────── + + describe("list-migrations", () => { + it("delegates to migrationsService.listMigrations and returns the array", async () => { + vi.mocked(migrationsService.listMigrations).mockResolvedValueOnce([]); + + const out = parseToolText( + await server.tool("list-migrations").callback({}) + ); + + expect(migrationsService.listMigrations).toHaveBeenCalledTimes(1); + expect(out).toEqual([]); + }); + }); + + // ── get-migration ────────────────────────────────────────────────────────── + + describe("get-migration", () => { + it("calls getMigration with the provided ID", async () => { + vi.mocked(migrationsService.getMigration).mockResolvedValueOnce({ + id: MIGRATION_ID, + name: "test", + status: "ready", + created_at: "", + last_updated_at: "", + rules: { + total: 0, pending: 0, processing: 0, completed: 0, failed: 0, + installable: 0, installed: 0, partially_translated: 0, untranslatable: 0, + 
}, + }); + + await server.tool("get-migration").callback({ migrationId: MIGRATION_ID }); + + expect(migrationsService.getMigration).toHaveBeenCalledWith(MIGRATION_ID); + }); + }); + + // ── get-translated-rules ─────────────────────────────────────────────────── + + describe("get-translated-rules", () => { + it("forwards pagination params to getTranslatedRules", async () => { + vi.mocked(migrationsService.getTranslatedRules).mockResolvedValueOnce({ + data: [], + total: 0, + }); + + await server.tool("get-translated-rules").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + page: 2, + perPage: 50, + filter: "status:completed", + }); + + expect(migrationsService.getTranslatedRules).toHaveBeenCalledWith( + MIGRATION_ID, + { page: 2, perPage: 50, filter: "status:completed" } + ); + }); + + it("returns vendorNotSupported for a non-Splunk vendor", async () => { + const out = parseToolText<{ error: string; vendor: string }>( + await server.tool("get-translated-rules").callback({ + migrationId: MIGRATION_ID, + vendor: "qradar", + }) + ); + + expect(out).toEqual({ error: "vendorNotSupported", vendor: "qradar" }); + expect(migrationsService.getTranslatedRules).not.toHaveBeenCalled(); + }); + }); + + // ── start-translation ────────────────────────────────────────────────────── + + describe("start-translation", () => { + it("calls startTranslation and returns { status: 'started' }", async () => { + vi.mocked(migrationsService.startTranslation).mockResolvedValueOnce(undefined); + + const out = parseToolText<{ status: string }>( + await server.tool("start-translation").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + }) + ); + + expect(migrationsService.startTranslation).toHaveBeenCalledWith(MIGRATION_ID); + expect(out.status).toBe("started"); + }); + + it("returns vendorNotSupported for sentinel-one", async () => { + const out = parseToolText<{ error: string; vendor: string }>( + await server.tool("start-translation").callback({ + migrationId: 
MIGRATION_ID, + vendor: "sentinel-one", + }) + ); + + expect(out).toEqual({ error: "vendorNotSupported", vendor: "sentinel-one" }); + expect(migrationsService.startTranslation).not.toHaveBeenCalled(); + }); + }); + + // ── stop-translation ─────────────────────────────────────────────────────── + + describe("stop-translation", () => { + it("calls stopTranslation and returns { status: 'stopped' }", async () => { + vi.mocked(migrationsService.stopTranslation).mockResolvedValueOnce(undefined); + + const out = parseToolText<{ status: string }>( + await server.tool("stop-translation").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + }) + ); + + expect(migrationsService.stopTranslation).toHaveBeenCalledWith(MIGRATION_ID); + expect(out.status).toBe("stopped"); + }); + + it("returns vendorNotSupported for an unknown vendor", async () => { + const out = parseToolText<{ error: string }>( + await server.tool("stop-translation").callback({ + migrationId: MIGRATION_ID, + vendor: "unknown-siem", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + expect(migrationsService.stopTranslation).not.toHaveBeenCalled(); + }); + }); + + // ── update-translated-rule ───────────────────────────────────────────────── + + describe("update-translated-rule", () => { + it("parses elasticRule JSON and passes updates to service", async () => { + vi.mocked(migrationsService.updateTranslatedRule).mockResolvedValueOnce({ + id: RULE_ID, + migration_id: MIGRATION_ID, + status: "completed", + translation_result: "partial", + original_rule: {}, + }); + const elasticRule = { name: "Fixed rule", type: "query" }; + + await server.tool("update-translated-rule").callback({ + migrationId: MIGRATION_ID, + ruleId: RULE_ID, + vendor: "splunk", + elasticRule: JSON.stringify(elasticRule), + translationResult: "partial", + }); + + expect(migrationsService.updateTranslatedRule).toHaveBeenCalledWith( + MIGRATION_ID, + RULE_ID, + expect.objectContaining({ + elastic_rule: elasticRule, + 
translation_result: "partial", + }) + ); + }); + + it("returns vendorNotSupported without calling service", async () => { + const out = parseToolText<{ error: string }>( + await server.tool("update-translated-rule").callback({ + migrationId: MIGRATION_ID, + ruleId: RULE_ID, + vendor: "qradar", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + expect(migrationsService.updateTranslatedRule).not.toHaveBeenCalled(); + }); + }); + + // ── get-resources ────────────────────────────────────────────────────────── + + describe("get-resources", () => { + it("calls getResources with migrationId", async () => { + vi.mocked(migrationsService.getResources).mockResolvedValueOnce([ + { type: "macro", name: "my_macro", content: "| where true" }, + ]); + + const out = parseToolText( + await server.tool("get-resources").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + }) + ); + + expect(migrationsService.getResources).toHaveBeenCalledWith(MIGRATION_ID); + expect(out).toHaveLength(1); + }); + + it("returns vendorNotSupported for non-Splunk", async () => { + const out = parseToolText<{ error: string }>( + await server.tool("get-resources").callback({ + migrationId: MIGRATION_ID, + vendor: "qradar", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + }); + }); + + // ── upsert-resource ──────────────────────────────────────────────────────── + + describe("upsert-resource", () => { + it("calls upsertResources with a single-element array", async () => { + vi.mocked(migrationsService.upsertResources).mockResolvedValueOnce(undefined); + + await server.tool("upsert-resource").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + type: "macro", + name: "splunk_macro", + content: "| eval x=1", + }); + + expect(migrationsService.upsertResources).toHaveBeenCalledWith( + MIGRATION_ID, + [{ type: "macro", name: "splunk_macro", content: "| eval x=1" }] + ); + }); + + it("returns vendorNotSupported for non-Splunk", async () => { + const out = 
parseToolText<{ error: string }>( + await server.tool("upsert-resource").callback({ + migrationId: MIGRATION_ID, + vendor: "sentinel-one", + type: "macro", + name: "m", + content: "", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + expect(migrationsService.upsertResources).not.toHaveBeenCalled(); + }); + }); + + // ── install-rules ────────────────────────────────────────────────────────── + + describe("install-rules", () => { + it("passes ids array to installRules", async () => { + vi.mocked(migrationsService.installRules).mockResolvedValueOnce({ + installed: 2, + failed: 0, + }); + + const out = parseToolText<{ installed: number; failed: number }>( + await server.tool("install-rules").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + ids: ["r-1", "r-2"], + }) + ); + + expect(migrationsService.installRules).toHaveBeenCalledWith( + MIGRATION_ID, + { ids: ["r-1", "r-2"] } + ); + expect(out).toEqual({ installed: 2, failed: 0 }); + }); + + it("returns vendorNotSupported for non-Splunk", async () => { + const out = parseToolText<{ error: string }>( + await server.tool("install-rules").callback({ + migrationId: MIGRATION_ID, + vendor: "qradar", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + expect(migrationsService.installRules).not.toHaveBeenCalled(); + }); + }); + + // ── get-stats ────────────────────────────────────────────────────────────── + + describe("get-stats", () => { + it("calls getStats and returns the result (no vendor gate)", async () => { + const stats = { + id: MIGRATION_ID, + status: "ready" as const, + rules: { + total: 5, pending: 5, processing: 0, completed: 0, failed: 0, + installable: 0, installed: 0, partially_translated: 0, untranslatable: 0, + }, + }; + vi.mocked(migrationsService.getStats).mockResolvedValueOnce(stats); + + const out = parseToolText( + await server.tool("get-stats").callback({ migrationId: MIGRATION_ID }) + ); + + expect(migrationsService.getStats).toHaveBeenCalledWith(MIGRATION_ID); + 
expect(out).toEqual(stats); + }); + }); + + // ── Vendor gate: undefined vendor is allowed ─────────────────────────────── + + it("proceeds when vendor parameter is absent (defaults to Splunk path)", async () => { + vi.mocked(migrationsService.startTranslation).mockResolvedValueOnce(undefined); + + const out = parseToolText<{ status: string }>( + await server.tool("start-translation").callback({ migrationId: MIGRATION_ID }) + ); + + expect(out.status).toBe("started"); + expect(migrationsService.startTranslation).toHaveBeenCalled(); + }); +}); From 1c3177978c7f43c42581f71146e184b25373aff2 Mon Sep 17 00:00:00 2001 From: Patryk Kopycinski Date: Fri, 15 May 2026 12:28:10 +0200 Subject: [PATCH 21/42] feat: add migration workbench view with WorkbenchState machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/views/migration/App.tsx — full state machine: WorkbenchState discriminated union (8 stages): vendor-select → user picks vendor → creates migration upload → paste Splunk rules JSON → upload + start translation translating → polls get-stats every 3s → advances on completion review → lists translated rules with status badges + fix actions fix-rule-drawer → slide-over editor for single rule JSON + result enum fix-resources-drawer → slide-over for macro/lookup create/update install → confirmation step before calling install-rules done → success summary with installed/failed counts Vendor gate (5-LOC client check): SUPPORTED_VENDORS = ["splunk"] VENDOR_CATALOGUE entries not in SUPPORTED_VENDORS render as disabled with "Coming soon" badge — re-enabling a vendor is a one-line change. MCP integration: All data via app.callServerTool() through the 10 app-only tools. translating stage schedules a 3-second poll loop that stops and transitions to review when stats.rules.processing === 0. 
Supporting files: mcp-app.html — minimal HTML shell (title: "SIEM Migration") mcp-app.tsx — standard React 18 createRoot mount styles.css — vendor-grid, upload-area, progress-bar, rule status badges, drawer layout Co-Authored-By: Claude Sonnet 4.6 --- src/views/migration/App.tsx | 873 +++++++++++++++++++++++++++++++ src/views/migration/mcp-app.html | 12 + src/views/migration/mcp-app.tsx | 12 + src/views/migration/styles.css | 165 ++++++ 4 files changed, 1062 insertions(+) create mode 100644 src/views/migration/App.tsx create mode 100644 src/views/migration/mcp-app.html create mode 100644 src/views/migration/mcp-app.tsx create mode 100644 src/views/migration/styles.css diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx new file mode 100644 index 0000000..8add033 --- /dev/null +++ b/src/views/migration/App.tsx @@ -0,0 +1,873 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import React, { useState, useCallback, useEffect, useRef } from "react"; +import type { App as McpApp } from "@modelcontextprotocol/ext-apps"; +import { extractCallResult } from "../../shared/extract-tool-text"; +import { + AppHeader, + AppShell, + BackButton, + EmptyState, + KpiStrip, + KpiTile, + LoadingState, +} from "../../shared/components"; +import { useFullscreen } from "../../shared/hooks/useFullscreen"; +import { useMcpApp } from "../../shared/hooks/useMcpApp"; +import "./styles.css"; + +// --------------------------------------------------------------------------- +// Local domain types (shapes returned by the app-only migration tools) +// --------------------------------------------------------------------------- + +interface MigrationStats { + id: string; + status: string; + rules: { + total: number; + pending: number; + processing: number; + completed: number; + failed: number; + installable: number; + installed: number; + partially_translated: number; + untranslatable: number; + }; +} + +interface TranslatedRule { + id: string; + status: string; + translation_result?: "full" | "partial" | "untranslatable"; + original_rule: Record; + elastic_rule?: Record; + comments?: string[]; +} + +interface MigrationResource { + type: "macro" | "lookup"; + name: string; + content: string; +} + +interface InstallResult { + installed: number; + failed: number; +} + +// --------------------------------------------------------------------------- +// WorkbenchState discriminated union +// +// Each stage carries exactly the data it needs and no more. Transitions +// always move forward through the pipeline — no implicit shared state. 
// ---------------------------------------------------------------------------

/**
 * Every screen the migration workbench can be in, discriminated by `stage`.
 *
 * Each member lists only the fields that stage reads. Note that `install` and
 * `done` do not carry `resources`, so neither can be turned back into a
 * `review` state without supplying resources again.
 */
export type WorkbenchState =
  | {
      /** Initial screen: no migration exists yet. */
      stage: "vendor-select";
    }
  | {
      /** A migration has been created and is waiting for rules. */
      stage: "upload";
      vendor: string;
      migrationId: string;
    }
  | {
      /** Translation is in progress; `stats` is null until the first stats payload arrives. */
      stage: "translating";
      vendor: string;
      migrationId: string;
      stats: MigrationStats | null;
    }
  | {
      /** Translated rules and their supporting resources are listed for review. */
      stage: "review";
      vendor: string;
      migrationId: string;
      translations: TranslatedRule[];
      resources: MigrationResource[];
    }
  | {
      /** Same data as `review`, plus the rule currently being edited. */
      stage: "fix-rule-drawer";
      vendor: string;
      migrationId: string;
      translations: TranslatedRule[];
      resources: MigrationResource[];
      selectedRule: TranslatedRule;
    }
  | {
      /** Same data as `review`, with the resource editor open. */
      stage: "fix-resources-drawer";
      vendor: string;
      migrationId: string;
      translations: TranslatedRule[];
      resources: MigrationResource[];
    }
  | {
      /** Confirmation step before installing. */
      stage: "install";
      vendor: string;
      migrationId: string;
      translations: TranslatedRule[];
    }
  | {
      /** Terminal summary with the install counts. */
      stage: "done";
      installed: number;
      failed: number;
    };

// ---------------------------------------------------------------------------
// Vendor catalogue — re-enabling a vendor is a one-line change here
// ---------------------------------------------------------------------------

/** Vendor ids the workbench will actually start a migration for. */
const SUPPORTED_VENDORS: readonly string[] = ["splunk"];

/** Every vendor offered in the picker, whether or not it is in SUPPORTED_VENDORS. */
const VENDOR_CATALOGUE = [
  { id: "splunk", label: "Splunk" },
  { id: "qradar", label: "IBM QRadar" },
  { id: "sentinel-one", label: "Sentinel One" },
] as const;

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Calls a server tool and parses its text payload as JSON.
 *
 * This helper never throws. It resolves to `null` when:
 *  - `app.callServerTool` rejects, or `JSON.parse` throws (logged via `console.error`);
 *  - `extractCallResult` yields an empty/falsy value (not logged).
 *
 * The parsed value is cast to `T` without any runtime validation, so callers
 * must treat the shape as unverified.
 *
 * @typeParam T - Shape the caller expects the tool's JSON payload to have.
 * @param app - Connected MCP app used to reach the server.
 * @param name - Tool name; also used in the failure log line.
 * @param args - Arguments forwarded unchanged as the tool call's `arguments`.
 * @returns The parsed payload, or `null` on any failure described above.
 */
async function callTool<T>(
  app: McpApp,
  name: string,
  // NOTE(review): the type arguments were lost in this copy of the file;
  // `Record<string, unknown>` is inferred from the call sites — confirm.
  args: Record<string, unknown>
): Promise<T | null> {
  try {
    const result = await app.callServerTool({ name, arguments: args });
    // presumably the text content of the tool result — confirm in shared/extract-tool-text
    const text = extractCallResult(result);
    if (!text) return null;
    return JSON.parse(text) as T;
  } catch (e) {
    // Deliberate best-effort: callers branch on `null` instead of catching.
    console.error(`[migration] ${name} failed:`, e);
    return null;
  }
}

//
--------------------------------------------------------------------------- +// App +// --------------------------------------------------------------------------- + +export function App() { + const [state, setState] = useState({ stage: "vendor-select" }); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(null); + + // For the translating stage: poll stats until translation completes + const pollTimerRef = useRef | null>(null); + + const clearPoll = useCallback(() => { + if (pollTimerRef.current !== null) { + clearTimeout(pollTimerRef.current); + pollTimerRef.current = null; + } + }, []); + + useEffect(() => () => clearPoll(), [clearPoll]); + + const { connected, getApp } = useMcpApp({ + name: "migration", + version: "1.0.0", + onConnect: (_app, _gotResult) => { + // No initial data load needed — the workbench starts at vendor-select. + }, + }); + + const fullscreen = useFullscreen(getApp); + + // ── Stage transitions ────────────────────────────────────────────────────── + + const selectVendor = useCallback( + async (vendor: string) => { + const app = getApp(); + if (!app) return; + setLoading(true); + setError(null); + try { + const res = await callTool<{ migration_id: string }>(app, "create-migration", { + name: `Migration ${new Date().toISOString().slice(0, 10)}`, + }); + if (!res?.migration_id) throw new Error("Failed to create migration"); + setState({ stage: "upload", vendor, migrationId: res.migration_id }); + } catch (e) { + setError(e instanceof Error ? 
e.message : String(e)); + } finally { + setLoading(false); + } + }, + [getApp] + ); + + const uploadRules = useCallback( + async (rulesJson: string) => { + const app = getApp(); + if (!app || state.stage !== "upload") return; + const { vendor, migrationId } = state; + setLoading(true); + setError(null); + try { + const rules = JSON.parse(rulesJson) as Record[]; + await callTool(app, "upload-rules", { migrationId, vendor, rules }); + await callTool(app, "start-translation", { migrationId, vendor }); + const stats = await callTool(app, "get-stats", { migrationId }); + setState({ stage: "translating", vendor, migrationId, stats: stats ?? null }); + schedulePoll(app, vendor, migrationId); + } catch (e) { + setError(e instanceof Error ? e.message : String(e)); + } finally { + setLoading(false); + } + }, + [getApp, state] + ); + + const schedulePoll = useCallback( + (app: McpApp, vendor: string, migrationId: string) => { + clearPoll(); + pollTimerRef.current = setTimeout(async () => { + const stats = await callTool(app, "get-stats", { migrationId }); + setState((prev) => { + if (prev.stage !== "translating") return prev; + return { ...prev, stats: stats ?? prev.stats }; + }); + if (stats && stats.rules.processing === 0 && stats.status !== "running") { + // Translation finished — load translated rules and resources, move to review + void (async () => { + const translationsRes = await callTool<{ + data: TranslatedRule[]; + }>(app, "get-translated-rules", { migrationId, vendor, perPage: 500 }); + const resources = + (await callTool(app, "get-resources", { + migrationId, + vendor, + })) ?? []; + setState({ + stage: "review", + vendor, + migrationId, + translations: translationsRes?.data ?? 
[], + resources, + }); + })(); + } else { + schedulePoll(app, vendor, migrationId); + } + }, 3000); + }, + [clearPoll] + ); + + const openRuleDrawer = useCallback((rule: TranslatedRule) => { + setState((prev) => { + if (prev.stage !== "review") return prev; + return { ...prev, stage: "fix-rule-drawer", selectedRule: rule }; + }); + }, []); + + const saveRuleFix = useCallback( + async (elasticRuleJson: string, translationResult: "full" | "partial" | "untranslatable") => { + const app = getApp(); + if (!app || state.stage !== "fix-rule-drawer") return; + const { vendor, migrationId, translations, resources, selectedRule } = state; + setLoading(true); + setError(null); + try { + const updated = await callTool( + app, + "update-translated-rule", + { migrationId, ruleId: selectedRule.id, vendor, elasticRule: elasticRuleJson, translationResult } + ); + setState({ + stage: "review", + vendor, + migrationId, + resources, + translations: translations.map((t) => + t.id === selectedRule.id ? (updated ?? t) : t + ), + }); + } catch (e) { + setError(e instanceof Error ? e.message : String(e)); + } finally { + setLoading(false); + } + }, + [getApp, state] + ); + + const openResourcesDrawer = useCallback(() => { + setState((prev) => { + if (prev.stage !== "review") return prev; + return { ...prev, stage: "fix-resources-drawer" }; + }); + }, []); + + const saveResources = useCallback( + async (resource: MigrationResource) => { + const app = getApp(); + if (!app || state.stage !== "fix-resources-drawer") return; + const { vendor, migrationId, translations } = state; + setLoading(true); + setError(null); + try { + await callTool(app, "upsert-resource", { migrationId, vendor, ...resource }); + const resources = + (await callTool(app, "get-resources", { migrationId, vendor })) ?? []; + setState({ stage: "review", vendor, migrationId, translations, resources }); + } catch (e) { + setError(e instanceof Error ? 
e.message : String(e)); + } finally { + setLoading(false); + } + }, + [getApp, state] + ); + + const closeDrawer = useCallback(() => { + setState((prev) => { + if (prev.stage === "fix-rule-drawer" || prev.stage === "fix-resources-drawer") { + const { stage: _stage, ...rest } = prev as WorkbenchState & { + stage: "fix-rule-drawer" | "fix-resources-drawer"; + }; + void _stage; + return { ...(rest as { vendor: string; migrationId: string; translations: TranslatedRule[]; resources: MigrationResource[] }), stage: "review" }; + } + return prev; + }); + }, []); + + const startInstall = useCallback(() => { + setState((prev) => { + if (prev.stage !== "review") return prev; + return { stage: "install", vendor: prev.vendor, migrationId: prev.migrationId, translations: prev.translations }; + }); + }, []); + + const confirmInstall = useCallback(async () => { + const app = getApp(); + if (!app || state.stage !== "install") return; + const { vendor, migrationId } = state; + setLoading(true); + setError(null); + try { + const result = await callTool(app, "install-rules", { migrationId, vendor }); + setState({ stage: "done", installed: result?.installed ?? 0, failed: result?.failed ?? 0 }); + } catch (e) { + setError(e instanceof Error ? e.message : String(e)); + } finally { + setLoading(false); + } + }, [getApp, state]); + + const reset = useCallback(() => { + clearPoll(); + setState({ stage: "vendor-select" }); + setError(null); + }, [clearPoll]); + + // ── Render ───────────────────────────────────────────────────────────────── + + // AppHeader expects { isFullscreen, onToggle } — useFullscreen returns { isFullscreen, toggle } + const fullscreenProp = { isFullscreen: fullscreen.isFullscreen, onToggle: fullscreen.toggle }; + + if (!connected) { + return ( + + + Connecting to Elastic Security… + + ); + } + + return ( + + + ) : undefined + } + /> + + {error && ( +
+ {error} + +
+ )} + + {loading && Working…} + + {!loading && renderStage(state, { + selectVendor, + uploadRules, + openRuleDrawer, + saveRuleFix, + openResourcesDrawer, + saveResources, + closeDrawer, + startInstall, + confirmInstall, + reset, + })} +
+ ); +} + +// --------------------------------------------------------------------------- +// Per-stage renderers (extracted to keep App() readable) +// --------------------------------------------------------------------------- + +interface StageHandlers { + selectVendor: (vendor: string) => void; + uploadRules: (json: string) => void; + openRuleDrawer: (rule: TranslatedRule) => void; + saveRuleFix: (json: string, result: "full" | "partial" | "untranslatable") => void; + openResourcesDrawer: () => void; + saveResources: (resource: MigrationResource) => void; + closeDrawer: () => void; + startInstall: () => void; + confirmInstall: () => void; + reset: () => void; +} + +function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode { + switch (state.stage) { + case "vendor-select": + return ; + + case "upload": + return ; + + case "translating": + return ; + + case "review": + return ( + + ); + + case "fix-rule-drawer": + return ( + <> + + + + ); + + case "fix-resources-drawer": + return ( + <> + + + + ); + + case "install": + return ( + t.translation_result !== "untranslatable").length} + onConfirm={h.confirmInstall} + onBack={h.closeDrawer} + /> + ); + + case "done": + return ; + } +} + +// --------------------------------------------------------------------------- +// Stage components +// --------------------------------------------------------------------------- + +function VendorSelect({ onSelect }: { onSelect: (vendor: string) => void }) { + return ( +
+

Select your source SIEM

+

+ Choose the platform you are migrating detection rules from. +

+
+ {VENDOR_CATALOGUE.map(({ id, label }) => { + const supported = SUPPORTED_VENDORS.includes(id); + return ( + + ); + })} +
+
+ ); +} + +function Upload({ vendor, onUpload }: { vendor: string; onUpload: (json: string) => void }) { + const [text, setText] = useState(""); + return ( +
+

Upload {vendor} rules

+

+ Paste your exported {vendor} rules as a JSON array, then start translation. +

+
+