From 5bb73b5e5cb9f82b2ba4a17bc9c09d2485babeac Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 10:55:42 +0200
Subject: [PATCH 01/42] evals: add types.ts with Dataset, Example, EvalResult
and related types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduces the canonical TypeScript type definitions for the eval pipeline:
- `ToolCall` / `Trajectory` — MCP host loop output primitives
- `ExpectedBehavior` — optional `tools`, `criteria`, `skill` fields (evaluators
return `'N/A'` when a field they need is absent)
- `Example` / `Dataset` — test-case and collection shapes
- `EvaluatorResult` / `EvalResult` — per-evaluator and per-example results
- `Evaluator` — async-compatible function contract all evaluator modules satisfy
Also adds `evals/**/*` to tsconfig.json includes so tsc covers eval files.
Co-Authored-By: Claude Sonnet 4.6
---
evals/types.ts | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
tsconfig.json | 2 +-
2 files changed, 76 insertions(+), 1 deletion(-)
create mode 100644 evals/types.ts
diff --git a/evals/types.ts b/evals/types.ts
new file mode 100644
index 0000000..4722075
--- /dev/null
+++ b/evals/types.ts
@@ -0,0 +1,75 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/** A single tool invocation captured during an MCP host loop run. */
+export interface ToolCall {
+ tool: string;
+ args: Record<string, unknown>;
+ result?: unknown;
+}
+
+/** Ordered sequence of tool calls produced by one eval run. */
+export type Trajectory = ToolCall[];
+
+/**
+ * What a passing run should look like.
+ * `tools`, `criteria` and `skill` are all optional — evaluators that depend
+ * on one of them return `'N/A'` when that field is absent, so a dataset can
+ * omit whichever dimension is irrelevant for a given example.
+ */
+export interface ExpectedBehavior {
+ /** Ordered list of tool names the host should call. Used by trajectory / tool-selection evaluators. */
+ tools?: string[];
+ /** Natural-language assertions checked by the criteria (LLM-as-judge) evaluator. */
+ criteria?: string[];
+ /** Skill ID that should be activated. Used by the skill-activation evaluator. */
+ skill?: string;
+}
+
+/** One test case inside a dataset. */
+export interface Example {
+ /** Stable identifier — used as a key in result tables and CI summaries. */
+ id: string;
+ /** The user message sent to the LLM host at the start of the simulation. */
+ input: string;
+ expected: ExpectedBehavior;
+}
+
+/** A named collection of examples that can be loaded by the runner. */
+export interface Dataset {
+ name: string;
+ examples: Example[];
+}
+
+/**
+ * Output of a single evaluator for one example.
+ * `score` is a value in [0, 1] when the evaluator ran, or `'N/A'` when the
+ * evaluator skipped (e.g. `expected.tools` was absent for trajectory evaluator).
+ */
+export interface EvaluatorResult {
+ score: number | 'N/A';
+ /** Human-readable explanation of the score. Expected when score is numeric, though the type does not enforce it. */
+ reason?: string;
+}
+
+/** Aggregate result for one example after all evaluators have run. */
+export interface EvalResult {
+ exampleId: string;
+ input: string;
+ trajectory: Trajectory;
+ /** Keys are evaluator names (e.g. `'skill-activation'`, `'trajectory'`). */
+ evaluators: Record<string, EvaluatorResult>;
+}
+
+/**
+ * Contract every evaluator module must satisfy.
+ * Async to accommodate LLM-as-judge evaluators that call an LLM provider.
+ */
+export type Evaluator = (
+ trajectory: Trajectory,
+ expected: ExpectedBehavior
+) => EvaluatorResult | Promise<EvaluatorResult>;
diff --git a/tsconfig.json b/tsconfig.json
index 23b7968..5dc2901 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -14,6 +14,6 @@
"lib": ["ES2022", "DOM", "DOM.Iterable"],
"types": ["vitest/globals", "@testing-library/jest-dom"]
},
- "include": ["src/**/*", "main.ts", "vite.config.ts", "vitest.config.ts", "scripts/**/*"],
+ "include": ["src/**/*", "evals/**/*", "main.ts", "vite.config.ts", "vitest.config.ts", "scripts/**/*"],
"exclude": ["node_modules", "dist"]
}
From 06d830cee537349bf9be1b5f24188a83b9480668 Mon Sep 17 00:00:00 2001
From: patryks-treadmill
Date: Fri, 15 May 2026 10:55:59 +0200
Subject: [PATCH 02/42]
ao(create-evals-types-ts-with-typescript-definitions--0): Create
`evals/types.ts` with TypeScript definitions for `Dataset`, `Exam
Auto-committed by patryks-treadmill orchestrator.
plan=automatic-migration-mcp-app job=64319163-2da8-44b5-b087-3dee6e9e4c14 attempt=1
---
package-lock.json | 8 --------
1 file changed, 8 deletions(-)
diff --git a/package-lock.json b/package-lock.json
index 08e9dde..d34696e 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -5750,14 +5750,6 @@
"node": ">= 18"
}
},
- "node_modules/monaco-promql": {
- "version": "1.8.0",
- "resolved": "https://registry.npmjs.org/monaco-promql/-/monaco-promql-1.8.0.tgz",
- "integrity": "sha512-XdgRojBzEe/rKtrJaHbSfoMFOMD5TXymDHIitTngmBT6XEjtAirnA7Rb2YJAO1SZrJfgvAo4LFCzJ71fH7+WOw==",
- "license": "MIT",
- "optional": true,
- "peer": true
- },
"node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
From b3ad86e8c0ee4455c46237788006f9ec0b6a3ed6 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:02:10 +0200
Subject: [PATCH 03/42] evals: add runner.ts orchestrator, runMcpHostLoop stub,
and eval vitest config
runner.ts exports `runDataset(dataset, evaluators, options?)` which:
- Wraps all examples in `describe.skipIf(!process.env.RUN_LLM_EVALS)` so
regular `npm test` never makes LLM calls or requires API keys
- Creates one `it` per example: runs runMcpHostLoop, scores via evaluators,
asserts numeric scores >= passingScore (default 0.5)
- Emits a Markdown table summary via afterAll for CI job summaries
runMcpHostLoop.ts is a typed stub (throws); full InMemoryTransport
implementation comes in the next commit.
evals/vitest.config.ts runs in node environment with 120 s timeout,
scoped to evals/**/*.{test,spec,eval}.ts and *.dataset.ts patterns.
Also:
- Adds `test:evals` script to package.json (cross-env RUN_LLM_EVALS=1)
- Adds evals/**/*.ts to eslint.config.js file patterns so eval files
are linted and license-header-checked
Co-Authored-By: Claude Sonnet 4.6
---
eslint.config.js | 1 +
evals/runMcpHostLoop.ts | 19 +++++++
evals/runner.ts | 112 ++++++++++++++++++++++++++++++++++++++++
evals/vitest.config.ts | 24 +++++++++
package.json | 1 +
5 files changed, 157 insertions(+)
create mode 100644 evals/runMcpHostLoop.ts
create mode 100644 evals/runner.ts
create mode 100644 evals/vitest.config.ts
diff --git a/eslint.config.js b/eslint.config.js
index 382ca72..cde436c 100644
--- a/eslint.config.js
+++ b/eslint.config.js
@@ -16,6 +16,7 @@ export default tseslint.config(
files: [
'src/**/*.ts',
'src/**/*.tsx',
+ 'evals/**/*.ts',
'*.ts',
'scripts/**/*.js',
'*.mjs',
diff --git a/evals/runMcpHostLoop.ts b/evals/runMcpHostLoop.ts
new file mode 100644
index 0000000..be80a6d
--- /dev/null
+++ b/evals/runMcpHostLoop.ts
@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Trajectory } from "./types.js";
+
+/**
+ * Simulates one MCP host loop turn using the SDK's InMemoryTransport and
+ * returns the ordered sequence of tool calls the LLM made.
+ *
+ * Full implementation lands in the next commit; this stub satisfies the
+ * import so runner.ts type-checks now.
+ */
+export async function runMcpHostLoop(_input: string): Promise<Trajectory> {
+ throw new Error("runMcpHostLoop is not yet implemented");
+}
diff --git a/evals/runner.ts b/evals/runner.ts
new file mode 100644
index 0000000..50035ca
--- /dev/null
+++ b/evals/runner.ts
@@ -0,0 +1,112 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { describe, it, expect, afterAll } from "vitest";
+import type { Dataset, EvalResult, EvaluatorResult, Evaluator } from "./types.js";
+import { runMcpHostLoop } from "./runMcpHostLoop.js";
+
+export interface RunnerOptions {
+ /** Minimum numeric score [0–1] for a test to pass. Defaults to 0.5. */
+ passingScore?: number;
+}
+
+/**
+ * Registers a Vitest suite for every example in `dataset`.
+ *
+ * The entire suite is skipped unless `RUN_LLM_EVALS` is set to a non-empty
+ * value in the environment (e.g. `1`), so regular `npm test` incurs zero LLM cost.
+ *
+ * Each example becomes one `it` that:
+ * 1. Runs the in-process MCP host loop to collect a trajectory.
+ * 2. Passes the trajectory to every evaluator.
+ * 3. Asserts that numeric scores meet `passingScore`.
+ *
+ * After all examples complete, a Markdown summary is written to stdout so
+ * the GitHub Actions job summary (>> $GITHUB_STEP_SUMMARY) can capture it.
+ */
+export function runDataset(
+ dataset: Dataset,
+ evaluators: Record<string, Evaluator>,
+ options: RunnerOptions = {}
+): void {
+ const { passingScore = 0.5 } = options;
+
+ describe.skipIf(!process.env.RUN_LLM_EVALS)(dataset.name, () => {
+ const results: EvalResult[] = [];
+
+ for (const example of dataset.examples) {
+ it(example.id, async () => {
+ const trajectory = await runMcpHostLoop(example.input);
+
+ const evalResults: Record<string, EvaluatorResult> = {};
+ for (const [name, evaluator] of Object.entries(evaluators)) {
+ evalResults[name] = await evaluator(trajectory, example.expected);
+ }
+
+ const result: EvalResult = {
+ exampleId: example.id,
+ input: example.input,
+ trajectory,
+ evaluators: evalResults,
+ };
+ results.push(result);
+
+ for (const [name, evalResult] of Object.entries(evalResults)) {
+ if (evalResult.score !== "N/A") {
+ expect(
+ evalResult.score,
+ `[${name}] score ${evalResult.score.toFixed(2)} < ${passingScore}` +
+ (evalResult.reason ? `: ${evalResult.reason}` : "")
+ ).toBeGreaterThanOrEqual(passingScore);
+ }
+ }
+ });
+ }
+
+ afterAll(() => {
+ process.stdout.write(buildMarkdownSummary(dataset.name, results) + "\n");
+ });
+ });
+}
+
+function buildMarkdownSummary(datasetName: string, results: EvalResult[]): string {
+ if (results.length === 0) {
+ return `## Eval results: ${datasetName}\n\n_No examples ran._\n`;
+ }
+
+ const evaluatorNames = Array.from(
+ new Set(results.flatMap((r) => Object.keys(r.evaluators)))
+ );
+
+ const headers = ["id", "input", ...evaluatorNames];
+ const separator = headers.map(() => "---");
+
+ const rows = results.map((r) => {
+ const scoreCells = evaluatorNames.map((name) => {
+ const e = r.evaluators[name];
+ if (!e) return "—";
+ if (e.score === "N/A") return "N/A";
+ return `${(e.score * 100).toFixed(0)}%`;
+ });
+ return [r.exampleId, truncate(r.input, 60), ...scoreCells];
+ });
+
+ const lines = [
+ `## Eval results: ${datasetName}`,
+ "",
+ `| ${headers.join(" | ")} |`,
+ `| ${separator.join(" | ")} |`,
+ ...rows.map((row) => `| ${row.join(" | ")} |`),
+ "",
+ ];
+
+ return lines.join("\n");
+}
+
+function truncate(s: string, maxLen: number): string {
+ return s.length <= maxLen ? s : `${s.slice(0, maxLen - 1)}…`;
+}
diff --git a/evals/vitest.config.ts b/evals/vitest.config.ts
new file mode 100644
index 0000000..7d2f4d5
--- /dev/null
+++ b/evals/vitest.config.ts
@@ -0,0 +1,24 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { defineConfig } from "vitest/config";
+
+/**
+ * Vitest config for LLM eval suites. Intentionally separate from the main
+ * vitest.config.ts so `npm test` never picks up eval files (and thus never
+ * makes LLM calls or requires API keys in a regular dev/CI run).
+ *
+ * Run via: npm run test:evals
+ */
+export default defineConfig({
+ test: {
+ environment: "node",
+ globals: true,
+ include: ["evals/**/*.{test,spec,eval}.ts", "evals/datasets/**/*.dataset.ts"],
+ testTimeout: 120_000,
+ },
+});
diff --git a/package.json b/package.json
index 983e3ca..043ee51 100644
--- a/package.json
+++ b/package.json
@@ -47,6 +47,7 @@
"test": "vitest",
"test:run": "vitest run",
"test:coverage": "vitest run --coverage",
+ "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run --config evals/vitest.config.ts",
"prepublishOnly": "npm run build",
"prepare": "husky",
"version": "node -e \"const m=JSON.parse(require('fs').readFileSync('manifest.json','utf8'));m.version=require('./package.json').version;require('fs').writeFileSync('manifest.json',JSON.stringify(m,null,2)+'\\n')\" && git add manifest.json"
From 21b3030ea8ada62dc5efc067e617293c41ba5112 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:21:13 +0200
Subject: [PATCH 04/42] evals: implement runMcpHostLoop with InMemoryTransport
and LLM provider types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
runMcpHostLoop wires an MCP Client to the server via InMemoryTransport
(in-process, no network), lists available tools, and drives a loop of up
to MAX_TURNS=8 turns:
LLM → tool calls → client.callTool() → result fed back → repeat
Options allow callers to inject a pre-built McpServer (for mocked-service
datasets) or a custom LlmProvider (for deterministic tests). Both default
to the real implementations when omitted.
evals/llm/types.ts introduces the LlmProvider interface and LlmMessage
discriminated union (OpenAI-style, compatible with LiteLLM proxies).
evals/llm/index.ts exposes createDefaultLlmProvider(), which auto-selects
by env var (ANTHROPIC_API_KEY first, then OPENAI_API_KEY); the concrete
adapters (anthropic.ts / openai.ts) land in the next commit — this stub
surfaces a clear error until they do.
Co-Authored-By: Claude Sonnet 4.6
---
evals/llm/index.ts | 38 ++++++++++++
evals/llm/types.ts | 54 +++++++++++++++++
evals/runMcpHostLoop.ts | 130 +++++++++++++++++++++++++++++++++++++---
3 files changed, 215 insertions(+), 7 deletions(-)
create mode 100644 evals/llm/index.ts
create mode 100644 evals/llm/types.ts
diff --git a/evals/llm/index.ts b/evals/llm/index.ts
new file mode 100644
index 0000000..d3c254b
--- /dev/null
+++ b/evals/llm/index.ts
@@ -0,0 +1,38 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { LlmProvider } from "./types.js";
+
+/**
+ * Returns the default LLM provider by inspecting environment variables.
+ *
+ * Priority: ANTHROPIC_API_KEY → Anthropic (claude-haiku-4-5)
+ * OPENAI_API_KEY → OpenAI / LiteLLM proxy (gpt-4o-mini)
+ *
+ * The concrete adapters (evals/llm/anthropic.ts, evals/llm/openai.ts) are
+ * implemented in the next commit; this stub ensures runMcpHostLoop.ts
+ * type-checks now and surfaces a clear error at runtime when evals are run
+ * before the adapters land.
+ */
+export function createDefaultLlmProvider(): LlmProvider {
+ if (process.env.ANTHROPIC_API_KEY) {
+ throw new Error(
+ "Anthropic LLM adapter not yet implemented (evals/llm/anthropic.ts). " +
+ "It will land in the next commit."
+ );
+ }
+ if (process.env.OPENAI_API_KEY) {
+ throw new Error(
+ "OpenAI LLM adapter not yet implemented (evals/llm/openai.ts). " +
+ "It will land in the next commit."
+ );
+ }
+ throw new Error(
+ "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " +
+ "before running evals (npm run test:evals)."
+ );
+}
diff --git a/evals/llm/types.ts b/evals/llm/types.ts
new file mode 100644
index 0000000..b5fef9b
--- /dev/null
+++ b/evals/llm/types.ts
@@ -0,0 +1,54 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/** A single tool the LLM may call, described in JSON Schema. */
+export interface LlmToolDefinition {
+ name: string;
+ description: string;
+ /** JSON Schema object describing the tool's input parameters. */
+ parameters: Record<string, unknown>;
+}
+
+/** One tool invocation requested by the LLM in an assistant turn. */
+export interface LlmToolCallRequest {
+ id: string;
+ type: "function";
+ function: {
+ name: string;
+ /** JSON-encoded argument object. */
+ arguments: string;
+ };
+}
+
+/**
+ * Discriminated union covering every role that can appear in a chat thread.
+ * Shaped after the OpenAI chat messages API so a single interface works for
+ * both the OpenAI and Anthropic adapters (and any LiteLLM proxy in between).
+ */
+export type LlmMessage =
+ | { role: "user"; content: string }
+ | {
+ role: "assistant";
+ content: string | null;
+ tool_calls?: LlmToolCallRequest[];
+ }
+ | { role: "tool"; content: string; tool_call_id: string };
+
+/** Narrowed assistant message — what LlmProvider.chat() must return. */
+export type AssistantMessage = Extract<LlmMessage, { role: "assistant" }>;
+
+/**
+ * Minimal provider contract every LLM adapter must satisfy.
+ * The interface is intentionally thin: give it a message history + tool
+ * catalogue, get back the next assistant turn (possibly with tool calls).
+ */
+export interface LlmProvider {
+ chat(
+ messages: LlmMessage[],
+ tools: LlmToolDefinition[]
+ ): Promise<AssistantMessage>;
+}
diff --git a/evals/runMcpHostLoop.ts b/evals/runMcpHostLoop.ts
index be80a6d..d6a732c 100644
--- a/evals/runMcpHostLoop.ts
+++ b/evals/runMcpHostLoop.ts
@@ -5,15 +5,131 @@
* 2.0.
*/
-import type { Trajectory } from "./types.js";
+import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js";
+import { Client } from "@modelcontextprotocol/sdk/client/index.js";
+import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { createServer } from "../src/server.js";
+import type { Trajectory, ToolCall } from "./types.js";
+import type { LlmProvider, LlmMessage } from "./llm/types.js";
+import { createDefaultLlmProvider } from "./llm/index.js";
+
+/** Maximum LLM → tool-call turns before halting to prevent runaway evals. */
+const MAX_TURNS = 8;
+
+export interface HostLoopOptions {
+ /**
+ * Pre-built MCP server to test against.
+ *
+ * Pass a server constructed with mocked services for dataset-level evals
+ * that don't need a live cluster. Omit to use `createServer()`, which reads
+ * CLUSTERS_JSON / CLUSTERS_FILE and requires a real Elastic cluster.
+ *
+ * Each call to `runMcpHostLoop` should receive a **fresh** server instance;
+ * reusing a connected server across calls is not supported.
+ */
+ server?: McpServer;
+ /**
+ * LLM provider used to simulate the MCP host making tool-call decisions.
+ * Defaults to auto-selecting from ANTHROPIC_API_KEY / OPENAI_API_KEY.
+ */
+ llm?: LlmProvider;
+ /**
+ * Maximum number of LLM→tool-call turns per run.
+ * Defaults to MAX_TURNS (8).
+ */
+ maxTurns?: number;
+}
/**
- * Simulates one MCP host loop turn using the SDK's InMemoryTransport and
- * returns the ordered sequence of tool calls the LLM made.
+ * Simulates one MCP host loop run entirely in-process.
+ *
+ * Architecture:
+ * LLM ↔ Client ↔─InMemoryTransport─↔ McpServer ↔ (ES / Kibana clients)
*
- * Full implementation lands in the next commit; this stub satisfies the
- * import so runner.ts type-checks now.
+ * The function:
+ * 1. Wires a fresh Client to the server via InMemoryTransport.
+ * 2. Lists available MCP tools and hands them to the LLM as tool definitions.
+ * 3. Loops up to `maxTurns` times:
+ * a. Asks the LLM for the next assistant turn.
+ * b. If the LLM emits tool calls, executes each via client.callTool().
+ * c. Records every call in the trajectory.
+ * d. Feeds results back into the message history.
+ * e. Breaks when the LLM emits no tool calls (task complete).
+ * 4. Closes the client and returns the trajectory.
*/
-export async function runMcpHostLoop(_input: string): Promise<Trajectory> {
- throw new Error("runMcpHostLoop is not yet implemented");
+export async function runMcpHostLoop(
+ input: string,
+ { server, llm, maxTurns = MAX_TURNS }: HostLoopOptions = {}
+): Promise<Trajectory> {
+ const resolvedServer = server ?? createServer();
+ const resolvedLlm = llm ?? createDefaultLlmProvider();
+
+ const [clientTransport, serverTransport] = InMemoryTransport.createLinkedPair();
+ await resolvedServer.connect(serverTransport);
+
+ const client = new Client({ name: "eval-host", version: "1.0.0" });
+ await client.connect(clientTransport);
+
+ try {
+ const { tools: mcpTools } = await client.listTools();
+ const toolDefs = mcpTools.map((t) => ({
+ name: t.name,
+ description: t.description ?? "",
+ parameters: t.inputSchema as Record<string, unknown>,
+ }));
+
+ const messages: LlmMessage[] = [{ role: "user", content: input }];
+ const trajectory: Trajectory = [];
+
+ for (let turn = 0; turn < maxTurns; turn++) {
+ const response = await resolvedLlm.chat(messages, toolDefs);
+ messages.push(response);
+
+ if (!response.tool_calls || response.tool_calls.length === 0) {
+ // LLM chose not to call a tool — simulation complete.
+ break;
+ }
+
+ for (const toolCall of response.tool_calls) {
+ const toolName = toolCall.function.name;
+ let toolArgs: Record<string, unknown>;
+ try {
+ toolArgs = JSON.parse(toolCall.function.arguments) as Record<
+ string,
+ unknown
+ >;
+ } catch {
+ // Malformed JSON from the LLM; record the call with empty args
+ // so the trajectory evaluator can detect the failure.
+ toolArgs = {};
+ }
+
+ const result = await client.callTool({
+ name: toolName,
+ arguments: toolArgs,
+ });
+
+ const record: ToolCall = {
+ tool: toolName,
+ args: toolArgs,
+ result: result.content,
+ };
+ trajectory.push(record);
+
+ // Feed the tool result back so the LLM can reason about it.
+ messages.push({
+ role: "tool",
+ content: JSON.stringify(result.content),
+ tool_call_id: toolCall.id,
+ });
+ }
+ }
+
+ return trajectory;
+ } finally {
+ // Closing the client also closes clientTransport, which triggers
+ // serverTransport.onclose() — the InMemoryTransport linked pair
+ // tears down cleanly without needing an explicit server.close().
+ await client.close();
+ }
}
From 066f7cfdc27c1d8c3669ab772b44ffbde195bcd9 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:28:17 +0200
Subject: [PATCH 05/42] evals: add OpenAiProvider with LiteLLM proxy support
and wire default provider
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
OpenAiProvider (evals/llm/openai.ts):
- Implements LlmProvider.chat() via the openai SDK (gpt-4o-mini default)
- Accepts baseURL to point at a LiteLLM proxy for any compatible provider
- Maps LlmMessage ↔ ChatCompletionMessageParam in both directions; narrows
ChatCompletionMessageToolCall to FunctionToolCall before accessing .function
- Strips tools argument when the list is empty (avoids API errors)
evals/llm/index.ts:
- createDefaultLlmProvider() now returns a real OpenAiProvider when
OPENAI_API_KEY is set; picks up LITELLM_BASE_URL automatically
- Preserves the ANTHROPIC_API_KEY branch with a clear "coming soon" error
until evals/llm/anthropic.ts lands
Adds openai@^6.37.0 as a devDependency (npm install --save-dev openai).
Co-Authored-By: Claude Sonnet 4.6
---
evals/llm/index.ts | 25 +++++----
evals/llm/openai.ts | 128 ++++++++++++++++++++++++++++++++++++++++++++
package-lock.json | 23 ++++++++
package.json | 1 +
4 files changed, 164 insertions(+), 13 deletions(-)
create mode 100644 evals/llm/openai.ts
diff --git a/evals/llm/index.ts b/evals/llm/index.ts
index d3c254b..d58aeda 100644
--- a/evals/llm/index.ts
+++ b/evals/llm/index.ts
@@ -6,33 +6,32 @@
*/
import type { LlmProvider } from "./types.js";
+import { OpenAiProvider } from "./openai.js";
/**
* Returns the default LLM provider by inspecting environment variables.
*
- * Priority: ANTHROPIC_API_KEY → Anthropic (claude-haiku-4-5)
- * OPENAI_API_KEY → OpenAI / LiteLLM proxy (gpt-4o-mini)
+ * Priority order:
+ * 1. ANTHROPIC_API_KEY → Anthropic adapter (claude-haiku-4-5) — coming soon
+ * 2. OPENAI_API_KEY → OpenAI / LiteLLM proxy (gpt-4o-mini)
*
- * The concrete adapters (evals/llm/anthropic.ts, evals/llm/openai.ts) are
- * implemented in the next commit; this stub ensures runMcpHostLoop.ts
- * type-checks now and surfaces a clear error at runtime when evals are run
- * before the adapters land.
+ * Set LITELLM_BASE_URL alongside OPENAI_API_KEY to route through a LiteLLM
+ * proxy, e.g. to use Claude via the OpenAI-compatible endpoint.
*/
export function createDefaultLlmProvider(): LlmProvider {
if (process.env.ANTHROPIC_API_KEY) {
throw new Error(
"Anthropic LLM adapter not yet implemented (evals/llm/anthropic.ts). " +
- "It will land in the next commit."
+ "Use OPENAI_API_KEY instead, or wait for the Anthropic adapter."
);
}
if (process.env.OPENAI_API_KEY) {
- throw new Error(
- "OpenAI LLM adapter not yet implemented (evals/llm/openai.ts). " +
- "It will land in the next commit."
- );
+ return new OpenAiProvider({
+ baseURL: process.env.LITELLM_BASE_URL,
+ });
}
throw new Error(
- "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " +
- "before running evals (npm run test:evals)."
+ "No LLM provider configured. Set OPENAI_API_KEY (or ANTHROPIC_API_KEY " +
+ "once the Anthropic adapter lands) before running evals (npm run test:evals)."
);
}
diff --git a/evals/llm/openai.ts b/evals/llm/openai.ts
new file mode 100644
index 0000000..a6dd59b
--- /dev/null
+++ b/evals/llm/openai.ts
@@ -0,0 +1,128 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import OpenAI from "openai";
+import type {
+ AssistantMessage,
+ LlmMessage,
+ LlmProvider,
+ LlmToolDefinition,
+} from "./types.js";
+
+const DEFAULT_MODEL = "gpt-4o-mini";
+
+export interface OpenAiProviderOptions {
+ /** Chat model to use. Defaults to gpt-4o-mini. */
+ model?: string;
+ /**
+ * Override the API base URL. Point this at a LiteLLM proxy to route calls
+ * through any provider the proxy supports without changing client code.
+ */
+ baseURL?: string;
+ /**
+ * API key. Defaults to the OPENAI_API_KEY environment variable, which is
+ * the standard OpenAI SDK default.
+ */
+ apiKey?: string;
+}
+
+export class OpenAiProvider implements LlmProvider {
+ private readonly client: OpenAI;
+ private readonly model: string;
+
+ constructor({
+ model = DEFAULT_MODEL,
+ baseURL,
+ apiKey,
+ }: OpenAiProviderOptions = {}) {
+ this.model = model;
+ this.client = new OpenAI({
+ ...(apiKey !== undefined ? { apiKey } : {}),
+ ...(baseURL !== undefined ? { baseURL } : {}),
+ });
+ }
+
+ async chat(
+ messages: LlmMessage[],
+ tools: LlmToolDefinition[]
+ ): Promise<AssistantMessage> {
+ const response = await this.client.chat.completions.create({
+ model: this.model,
+ messages: messages.map(toOaiMessage),
+ ...(tools.length > 0 ? { tools: tools.map(toOaiTool) } : {}),
+ });
+
+ const choice = response.choices[0];
+ if (!choice) {
+ throw new Error("OpenAI returned no choices");
+ }
+
+ const msg = choice.message;
+ return {
+ role: "assistant",
+ content: msg.content ?? null,
+ ...(msg.tool_calls
+ ? {
+ tool_calls: msg.tool_calls
+ .filter(
+ (tc): tc is OpenAI.ChatCompletionMessageFunctionToolCall =>
+ tc.type === "function"
+ )
+ .map((tc) => ({
+ id: tc.id,
+ type: "function" as const,
+ function: {
+ name: tc.function.name,
+ arguments: tc.function.arguments,
+ },
+ })),
+ }
+ : {}),
+ };
+ }
+}
+
+function toOaiMessage(msg: LlmMessage): OpenAI.ChatCompletionMessageParam {
+ switch (msg.role) {
+ case "user":
+ return { role: "user", content: msg.content };
+ case "assistant":
+ return {
+ role: "assistant",
+ content: msg.content,
+ ...(msg.tool_calls
+ ? {
+ tool_calls: msg.tool_calls.map((tc) => ({
+ id: tc.id,
+ type: "function" as const,
+ function: {
+ name: tc.function.name,
+ arguments: tc.function.arguments,
+ },
+ })),
+ }
+ : {}),
+ };
+ case "tool":
+ return {
+ role: "tool",
+ content: msg.content,
+ tool_call_id: msg.tool_call_id,
+ };
+ }
+}
+
+function toOaiTool(tool: LlmToolDefinition): OpenAI.ChatCompletionTool {
+ return {
+ type: "function",
+ function: {
+ name: tool.name,
+ description: tool.description,
+ parameters: tool.parameters,
+ },
+ };
+}
diff --git a/package-lock.json b/package-lock.json
index d34696e..2207e24 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -54,6 +54,7 @@
"husky": "^9.1.7",
"jsdom": "^29.1.1",
"lint-staged": "^16.4.0",
+ "openai": "^6.37.0",
"tailwindcss": "^4.2.2",
"tsx": "^4.21.0",
"typescript": "^6.0.2",
@@ -5860,6 +5861,28 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
+ "node_modules/openai": {
+ "version": "6.37.0",
+ "resolved": "https://registry.npmjs.org/openai/-/openai-6.37.0.tgz",
+ "integrity": "sha512-0H5dEGFmmLv6KSd0W1w2nyL8WsLkX6yoLeQpU+dZAOuGcany5qkYQMmj35ZrKgb6yiyYqpUzFOpR8mZQkgqeEQ==",
+ "dev": true,
+ "license": "Apache-2.0",
+ "bin": {
+ "openai": "bin/cli"
+ },
+ "peerDependencies": {
+ "ws": "^8.18.0",
+ "zod": "^3.25 || ^4.0"
+ },
+ "peerDependenciesMeta": {
+ "ws": {
+ "optional": true
+ },
+ "zod": {
+ "optional": true
+ }
+ }
+ },
"node_modules/optionator": {
"version": "0.9.4",
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
diff --git a/package.json b/package.json
index 043ee51..e328e65 100644
--- a/package.json
+++ b/package.json
@@ -103,6 +103,7 @@
"husky": "^9.1.7",
"jsdom": "^29.1.1",
"lint-staged": "^16.4.0",
+ "openai": "^6.37.0",
"tailwindcss": "^4.2.2",
"tsx": "^4.21.0",
"typescript": "^6.0.2",
From 9f47372445c80abf2295e64ac332f13c3453e30e Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:33:59 +0200
Subject: [PATCH 06/42] evals: add AnthropicProvider and wire it as the default
when ANTHROPIC_API_KEY is set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
AnthropicProvider (evals/llm/anthropic.ts):
- Implements LlmProvider.chat() via @anthropic-ai/sdk (claude-haiku-4-5-20251001)
- toAnthropicMessages() handles the structural gap between OpenAI-style messages
and Anthropic's API: no `tool` role exists; tool results go as `user` messages
with `tool_result` content blocks; consecutive tool results are merged into a
single user turn to avoid adjacent-user-turn API errors
- Tool input is round-tripped: LlmToolCallRequest.arguments is JSON.parse'd into an
object for the request, and the tool input in the response is JSON.stringify'd back
to keep the OpenAI-compatible LlmToolCallRequest shape
- input_schema is cast from LlmToolDefinition.parameters (already JSON Schema)
evals/llm/index.ts:
- createDefaultLlmProvider() now returns AnthropicProvider when ANTHROPIC_API_KEY
is set (priority 1), falls back to OpenAiProvider for OPENAI_API_KEY (priority 2)
Adds @anthropic-ai/sdk@^0.96.0 as a devDependency.
Co-Authored-By: Claude Sonnet 4.6
---
evals/llm/anthropic.ts | 150 +++++++++++++++++++++++++++++++++++++++++
evals/llm/index.ts | 16 ++---
package-lock.json | 69 +++++++++++++++++++
package.json | 1 +
4 files changed, 227 insertions(+), 9 deletions(-)
create mode 100644 evals/llm/anthropic.ts
diff --git a/evals/llm/anthropic.ts b/evals/llm/anthropic.ts
new file mode 100644
index 0000000..70e9adc
--- /dev/null
+++ b/evals/llm/anthropic.ts
@@ -0,0 +1,150 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import Anthropic from "@anthropic-ai/sdk";
+import type {
+ AssistantMessage,
+ LlmMessage,
+ LlmProvider,
+ LlmToolDefinition,
+} from "./types.js";
+
+const DEFAULT_MODEL = "claude-haiku-4-5-20251001";
+
+/** Max tokens to request from the Anthropic API per turn. */
+const MAX_TOKENS = 4096;
+
+/** Optional settings for constructing an AnthropicProvider. */
+export interface AnthropicProviderOptions {
+ /** Chat model to use. Defaults to claude-haiku-4-5-20251001. */
+ model?: string;
+ /**
+ * API key. When omitted, no key is handed to the SDK client, which then falls
+ * back to its own default: the ANTHROPIC_API_KEY environment variable.
+ */
+ apiKey?: string;
+}
+
+/**
+ * LlmProvider backed by the Anthropic Messages API.
+ *
+ * Callers speak the OpenAI-style message shapes declared in ./types.js;
+ * conversion to and from Anthropic's content-block format happens in chat().
+ */
+export class AnthropicProvider implements LlmProvider {
+  private readonly client: Anthropic;
+  private readonly model: string;
+
+  /**
+   * @param options.model  Model id; falls back to DEFAULT_MODEL.
+   * @param options.apiKey Forwarded to the SDK only when explicitly provided.
+   */
+  constructor({
+    model = DEFAULT_MODEL,
+    apiKey,
+  }: AnthropicProviderOptions = {}) {
+    this.model = model;
+    // Leave the key out entirely when undefined so the SDK applies its own default.
+    this.client = new Anthropic({
+      ...(apiKey !== undefined ? { apiKey } : {}),
+    });
+  }
+
+  /**
+   * Sends one request to the Messages API and maps the reply to an AssistantMessage.
+   *
+   * @param messages Conversation so far, in OpenAI-style form.
+   * @param tools Tool definitions; the `tools` request field is omitted when this is empty.
+   * @returns The assistant turn: all text blocks concatenated (null when there
+   *   is no text) plus `tool_calls` when the reply contains tool_use blocks.
+   */
+  async chat(
+    messages: LlmMessage[],
+    tools: LlmToolDefinition[]
+  ): Promise<AssistantMessage> {
+    const response = await this.client.messages.create({
+      model: this.model,
+      max_tokens: MAX_TOKENS,
+      messages: toAnthropicMessages(messages),
+      ...(tools.length > 0 ? { tools: tools.map(toAnthropicTool) } : {}),
+    });
+
+    const textBlocks = response.content.filter(
+      (c): c is Anthropic.TextBlock => c.type === "text"
+    );
+    const toolUseBlocks = response.content.filter(
+      (c): c is Anthropic.ToolUseBlock => c.type === "tool_use"
+    );
+
+    return {
+      role: "assistant",
+      // An empty concatenation is reported as null rather than "".
+      content: textBlocks.map((b) => b.text).join("") || null,
+      ...(toolUseBlocks.length > 0
+        ? {
+            tool_calls: toolUseBlocks.map((tu) => ({
+              id: tu.id,
+              type: "function" as const,
+              function: {
+                name: tu.name,
+                // Anthropic returns a parsed object; re-encode to match the
+                // OpenAI-style LlmToolCallRequest.function.arguments shape.
+                arguments: JSON.stringify(tu.input),
+              },
+            })),
+          }
+        : {}),
+    };
+  }
+}
+
+/**
+ * Converts OpenAI-style LlmMessage[] to Anthropic MessageParam[].
+ *
+ * Structural differences from OpenAI:
+ * - Anthropic has no `tool` role. Tool results go as `user` messages with
+ *   `tool_result` content blocks.
+ * - Consecutive tool-result messages are merged into a single user message,
+ *   so all results for one assistant turn travel together.
+ * - Assistant content is an array of TextBlockParam / ToolUseBlockParam.
+ *
+ * Tool-call arguments arrive as a JSON string. They are parsed here, and
+ * anything that is not a plain JSON object (malformed JSON, null, arrays,
+ * primitives) is replaced by `{}` because `tool_use.input` must be an object.
+ *
+ * @param messages Conversation in OpenAI-style form.
+ * @returns A new array; the input messages are not mutated.
+ */
+function toAnthropicMessages(
+  messages: LlmMessage[]
+): Anthropic.MessageParam[] {
+  const result: Anthropic.MessageParam[] = [];
+
+  for (const msg of messages) {
+    if (msg.role === "user") {
+      result.push({ role: "user", content: msg.content });
+    } else if (msg.role === "assistant") {
+      const content: Anthropic.ContentBlockParam[] = [];
+      if (msg.content) {
+        content.push({ type: "text", text: msg.content });
+      }
+      for (const tc of msg.tool_calls ?? []) {
+        let input: unknown = {};
+        try {
+          const parsed: unknown = JSON.parse(tc.function.arguments);
+          if (
+            typeof parsed === "object" &&
+            parsed !== null &&
+            !Array.isArray(parsed)
+          ) {
+            input = parsed;
+          }
+        } catch {
+          // Malformed JSON: keep the {} default.
+        }
+        content.push({ type: "tool_use", id: tc.id, name: tc.function.name, input });
+      }
+      result.push({ role: "assistant", content });
+    } else {
+      // msg.role === "tool"
+      const block: Anthropic.ToolResultBlockParam = {
+        type: "tool_result",
+        tool_use_id: msg.tool_call_id,
+        content: msg.content,
+      };
+
+      // Append to the preceding user message when it already holds
+      // tool_result blocks; otherwise open a new user turn for this result.
+      const prev = result[result.length - 1];
+      const prevBlocks =
+        prev?.role === "user" && Array.isArray(prev.content)
+          ? prev.content
+          : undefined;
+      if (prevBlocks?.[0]?.type === "tool_result") {
+        prevBlocks.push(block);
+      } else {
+        result.push({ role: "user", content: [block] });
+      }
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Maps a provider-agnostic tool definition onto Anthropic's Tool shape.
+ * `parameters` is forwarded untouched; the cast only satisfies the SDK type.
+ */
+function toAnthropicTool(tool: LlmToolDefinition): Anthropic.Tool {
+  const { name, description, parameters } = tool;
+  const input_schema = parameters as Anthropic.Tool.InputSchema;
+  return { name, description, input_schema };
+}
diff --git a/evals/llm/index.ts b/evals/llm/index.ts
index d58aeda..5698ff5 100644
--- a/evals/llm/index.ts
+++ b/evals/llm/index.ts
@@ -6,24 +6,22 @@
*/
import type { LlmProvider } from "./types.js";
+import { AnthropicProvider } from "./anthropic.js";
import { OpenAiProvider } from "./openai.js";
/**
* Returns the default LLM provider by inspecting environment variables.
*
* Priority order:
- * 1. ANTHROPIC_API_KEY → Anthropic adapter (claude-haiku-4-5) — coming soon
- * 2. OPENAI_API_KEY → OpenAI / LiteLLM proxy (gpt-4o-mini)
+ * 1. ANTHROPIC_API_KEY → AnthropicProvider (claude-haiku-4-5-20251001)
+ * 2. OPENAI_API_KEY → OpenAiProvider / LiteLLM proxy (gpt-4o-mini)
*
* Set LITELLM_BASE_URL alongside OPENAI_API_KEY to route through a LiteLLM
- * proxy, e.g. to use Claude via the OpenAI-compatible endpoint.
+ * proxy, e.g. to reach Claude via the OpenAI-compatible endpoint.
*/
export function createDefaultLlmProvider(): LlmProvider {
if (process.env.ANTHROPIC_API_KEY) {
- throw new Error(
- "Anthropic LLM adapter not yet implemented (evals/llm/anthropic.ts). " +
- "Use OPENAI_API_KEY instead, or wait for the Anthropic adapter."
- );
+ return new AnthropicProvider();
}
if (process.env.OPENAI_API_KEY) {
return new OpenAiProvider({
@@ -31,7 +29,7 @@ export function createDefaultLlmProvider(): LlmProvider {
});
}
throw new Error(
- "No LLM provider configured. Set OPENAI_API_KEY (or ANTHROPIC_API_KEY " +
- "once the Anthropic adapter lands) before running evals (npm run test:evals)."
+ "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " +
+ "before running evals (npm run test:evals)."
);
}
diff --git a/package-lock.json b/package-lock.json
index 2207e24..156fc31 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -33,6 +33,7 @@
"elastic-security-mcp-app": "dist/main.js"
},
"devDependencies": {
+ "@anthropic-ai/sdk": "^0.96.0",
"@tailwindcss/vite": "^4.2.2",
"@testing-library/jest-dom": "^6.9.1",
"@testing-library/react": "^16.3.2",
@@ -74,6 +75,28 @@
"dev": true,
"license": "MIT"
},
+ "node_modules/@anthropic-ai/sdk": {
+ "version": "0.96.0",
+ "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.96.0.tgz",
+ "integrity": "sha512-KlCsODtTyb17bLUVCSDC2HtSvAbJf60sEiPEax9dInF+aDF92vS4TZJ5XD7YCQXNb1/5icYaw8Y7wMjPlIV9Zg==",
+ "dev": true,
+ "license": "MIT",
+ "dependencies": {
+ "json-schema-to-ts": "^3.1.1",
+ "standardwebhooks": "^1.0.0"
+ },
+ "bin": {
+ "anthropic-ai-sdk": "bin/cli"
+ },
+ "peerDependencies": {
+ "zod": "^3.25.0 || ^4.0.0"
+ },
+ "peerDependenciesMeta": {
+ "zod": {
+ "optional": true
+ }
+ }
+ },
"node_modules/@asamuzakjp/css-color": {
"version": "5.1.11",
"resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-5.1.11.tgz",
@@ -1860,6 +1883,13 @@
],
"peer": true
},
+ "node_modules/@stablelib/base64": {
+ "version": "1.0.1",
+ "resolved": "https://registry.npmjs.org/@stablelib/base64/-/base64-1.0.1.tgz",
+ "integrity": "sha512-1bnPQqSxSuc3Ii6MhBysoWCg58j97aUjuCSZrGSmDxNqtytIi0k8utUenAwTZN4V5mXXYGsVUI9zeBqy+jBOSQ==",
+ "dev": true,
+ "license": "MIT"
+ },
"node_modules/@standard-schema/spec": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz",
@@ -4240,6 +4270,13 @@
"dev": true,
"license": "MIT"
},
+ "node_modules/fast-sha256": {
+ "version": "1.3.0",
+ "resolved": "https://registry.npmjs.org/fast-sha256/-/fast-sha256-1.3.0.tgz",
+ "integrity": "sha512-n11RGP/lrWEFI/bWdygLxhI+pVeo1ZYIVwvvPkW7azl/rOy+F3HYRZ2K5zeE9mmkhQppyv9sQFx0JM9UabnpPQ==",
+ "dev": true,
+ "license": "Unlicense"
+ },
"node_modules/fast-uri": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
@@ -4945,6 +4982,20 @@
"dev": true,
"license": "MIT"
},
+ "node_modules/json-schema-to-ts": {
+ "version": "3.1.1",
+ "resolved": "https://registry.npmjs.org/json-schema-to-ts/-/json-schema-to-ts-3.1.1.tgz",
+ "integrity": "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==",
+ "dev": true,
+ "license": "MIT",
+ "dependencies": {
+ "@babel/runtime": "^7.18.3",
+ "ts-algebra": "^2.0.0"
+ },
+ "engines": {
+ "node": ">=16"
+ }
+ },
"node_modules/json-schema-traverse": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
@@ -6647,6 +6698,17 @@
"dev": true,
"license": "MIT"
},
+ "node_modules/standardwebhooks": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/standardwebhooks/-/standardwebhooks-1.0.0.tgz",
+ "integrity": "sha512-BbHGOQK9olHPMvQNHWul6MYlrRTAOKn03rOe4A8O3CLWhNf4YHBqq2HJKKC+sfqpxiBY52pNeesD6jIiLDz8jg==",
+ "dev": true,
+ "license": "MIT",
+ "dependencies": {
+ "@stablelib/base64": "^1.0.0",
+ "fast-sha256": "^1.3.0"
+ }
+ },
"node_modules/state-local": {
"version": "1.0.7",
"resolved": "https://registry.npmjs.org/state-local/-/state-local-1.0.7.tgz",
@@ -6886,6 +6948,13 @@
"tree-kill": "cli.js"
}
},
+ "node_modules/ts-algebra": {
+ "version": "2.0.0",
+ "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz",
+ "integrity": "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==",
+ "dev": true,
+ "license": "MIT"
+ },
"node_modules/ts-api-utils": {
"version": "2.5.0",
"resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.5.0.tgz",
diff --git a/package.json b/package.json
index e328e65..83ef515 100644
--- a/package.json
+++ b/package.json
@@ -82,6 +82,7 @@
"react-dom": "^19.2.4"
},
"devDependencies": {
+ "@anthropic-ai/sdk": "^0.96.0",
"@tailwindcss/vite": "^4.2.2",
"@testing-library/jest-dom": "^6.9.1",
"@testing-library/react": "^16.3.2",
From ab6ac677b250128f3bef38046668cafcf3e36c8b Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:36:50 +0200
Subject: [PATCH 07/42] evals: add --reporter=verbose to test:evals script
Makes per-example test names visible in CI output and in the GitHub Actions
job summary, which is where the Markdown eval table lands.
Co-Authored-By: Claude Sonnet 4.6
---
package.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/package.json b/package.json
index 83ef515..2308b39 100644
--- a/package.json
+++ b/package.json
@@ -47,7 +47,7 @@
"test": "vitest",
"test:run": "vitest run",
"test:coverage": "vitest run --coverage",
- "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run --config evals/vitest.config.ts",
+ "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run --config evals/vitest.config.ts --reporter=verbose",
"prepublishOnly": "npm run build",
"prepare": "husky",
"version": "node -e \"const m=JSON.parse(require('fs').readFileSync('manifest.json','utf8'));m.version=require('./package.json').version;require('fs').writeFileSync('manifest.json',JSON.stringify(m,null,2)+'\\n')\" && git add manifest.json"
From 9c7c1ddab13cf4b99a52f7b927fec43ec95e4593 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:37:44 +0200
Subject: [PATCH 08/42] evals: add skill-activation evaluator (binary score)
Returns 1 if the trajectory contains at least one call to the skill's
entry-point tool (expected.skill), 0 if not, or 'N/A' when expected.skill
is absent so datasets that don't test skill routing can omit the field.
The failure reason includes the full tool-name list from the trajectory to
make CI output actionable without re-running the eval.
Co-Authored-By: Claude Sonnet 4.6
---
evals/evaluators/skill-activation.ts | 37 ++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
create mode 100644 evals/evaluators/skill-activation.ts
diff --git a/evals/evaluators/skill-activation.ts b/evals/evaluators/skill-activation.ts
new file mode 100644
index 0000000..b7deb8d
--- /dev/null
+++ b/evals/evaluators/skill-activation.ts
@@ -0,0 +1,37 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+
+/**
+ * Binary evaluator: does the trajectory contain a call to the skill's
+ * entry-point tool?
+ *
+ * `expected.skill` names the single model-facing entry-point tool of an MCP
+ * skill (e.g. `migrate-rules` for the automatic-migration skill,
+ * `manage-rules` for detection-rule-management). One matching call anywhere
+ * in the trajectory is enough.
+ *
+ * @returns score 1 when the tool was called, 0 when it was not (the reason
+ *   then lists every tool that was called), or `'N/A'` when `expected.skill`
+ *   is absent so datasets that ignore skill routing can omit the field.
+ */
+export const skillActivation: Evaluator = (
+  trajectory: Trajectory,
+  expected: ExpectedBehavior
+): EvaluatorResult => {
+  const entryPoint = expected.skill;
+  if (!entryPoint) {
+    return { score: "N/A" };
+  }
+
+  const calledTools = trajectory.map((call) => call.tool);
+  if (calledTools.includes(entryPoint)) {
+    return { score: 1, reason: `Tool "${entryPoint}" was called` };
+  }
+
+  const listing = calledTools.join(", ") || "empty";
+  return {
+    score: 0,
+    reason: `Tool "${entryPoint}" was never called (trajectory: [${listing}])`,
+  };
+};
From 7849ed50d4158205f861e068adf4451620f3cb68 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:39:33 +0200
Subject: [PATCH 09/42] evals: add negative-activation evaluator for distractor
examples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Binary complement of skill-activation: returns 1 when the skill's
entry-point tool (expected.skill) is absent from the trajectory (correct —
LLM was not falsely triggered), 0 when the tool appears (false positive).
Returns 'N/A' when expected.skill is absent, matching the skill-activation
convention so both evaluators behave consistently on examples that don't
declare a skill.
CI gate intent: datasets should require 100% on this evaluator for distractor
examples — any false positive means the skill's SKILL.md is over-triggering
on unrelated queries in production.
Co-Authored-By: Claude Sonnet 4.6
---
evals/evaluators/negative-activation.ts | 46 +++++++++++++++++++++++++
1 file changed, 46 insertions(+)
create mode 100644 evals/evaluators/negative-activation.ts
diff --git a/evals/evaluators/negative-activation.ts b/evals/evaluators/negative-activation.ts
new file mode 100644
index 0000000..e08d315
--- /dev/null
+++ b/evals/evaluators/negative-activation.ts
@@ -0,0 +1,46 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+
+/**
+ * Binary evaluator for distractor examples: did the LLM leave the skill's
+ * entry-point tool alone?
+ *
+ * Complement of `skillActivation`. Apply it to examples whose query should
+ * NOT trigger the skill — e.g. a migration-skill dataset that mixes in
+ * unrelated queries (case management, threat hunting) to confirm the LLM
+ * does not reach for `migrate-rules` on everything.
+ *
+ * Score semantics (binary):
+ *   1 — skill tool absent from the trajectory (correct, not distracted)
+ *   0 — skill tool present in the trajectory (false positive, over-triggered)
+ *
+ * Returns `'N/A'` when `expected.skill` is absent, mirroring `skillActivation`.
+ *
+ * CI gate: datasets should require 100% on this evaluator for distractor
+ * examples — a false positive means the skill's SKILL.md is too aggressive
+ * and will fire on unrelated queries in production.
+ */
+export const negativeActivation: Evaluator = (
+  trajectory: Trajectory,
+  expected: ExpectedBehavior
+): EvaluatorResult => {
+  const { skill } = expected;
+  if (!skill) {
+    return { score: "N/A" };
+  }
+
+  // The first matching call settles the verdict.
+  for (const { tool } of trajectory) {
+    if (tool === skill) {
+      return {
+        score: 0,
+        reason: `Tool "${skill}" was called but should not have been (false positive)`,
+      };
+    }
+  }
+
+  return {
+    score: 1,
+    reason: `Tool "${skill}" was correctly absent from the trajectory`,
+  };
+};
From ed6ce7de2aa6877994e22558440fab8ab5d170b7 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:42:05 +0200
Subject: [PATCH 10/42] evals: add tool-selection evaluator (precision/recall
F1 against expected.tools)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Computes set-based precision, recall, and F1 against expected.tools.
Deduplicates both the trajectory and the expected list — order/repetition
is the trajectory evaluator's job.
Score = F1 ∈ [0, 1]. Returns 'N/A' when expected.tools is absent so
datasets that only test skill routing don't need to declare tool lists.
The reason string includes missed and extra tool names to make CI failures
immediately actionable without re-running the eval.
CI gate intent: ≥0.8 (80%) on positive examples.
Co-Authored-By: Claude Sonnet 4.6
---
evals/evaluators/tool-selection.ts | 60 ++++++++++++++++++++++++++++++
1 file changed, 60 insertions(+)
create mode 100644 evals/evaluators/tool-selection.ts
diff --git a/evals/evaluators/tool-selection.ts b/evals/evaluators/tool-selection.ts
new file mode 100644
index 0000000..71cf7b1
--- /dev/null
+++ b/evals/evaluators/tool-selection.ts
@@ -0,0 +1,60 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+
+/**
+ * Set-based tool-selection evaluator: did the LLM pick the right tools?
+ *
+ * Both the called tools and `expected.tools` are reduced to sets, so order
+ * and repetition play no part here (the trajectory evaluator covers those).
+ *
+ *   precision = |called ∩ expected| / |called|    (no spurious calls)
+ *   recall    = |called ∩ expected| / |expected|  (no missed calls)
+ *   score     = F1 = 2·P·R / (P+R)  ∈ [0, 1]
+ *
+ * A ratio with an empty denominator counts as 0, except that "nothing
+ * expected, nothing called" scores 1 outright.
+ *
+ * Returns `'N/A'` when `expected.tools` is absent so datasets that only
+ * care about skill routing don't need to declare tool lists.
+ *
+ * CI gate: datasets should require ≥0.8 (80%) on positive examples.
+ * The reason string lists missed and extra tools to make debugging fast.
+ */
+export const toolSelection: Evaluator = (
+  trajectory: Trajectory,
+  expected: ExpectedBehavior
+): EvaluatorResult => {
+  if (!expected.tools) {
+    return { score: "N/A" };
+  }
+
+  const wanted = new Set(expected.tools);
+  const called = new Set(trajectory.map((call) => call.tool));
+
+  if (wanted.size === 0 && called.size === 0) {
+    return { score: 1, reason: "No tools expected and none called" };
+  }
+
+  // Split the called set into hits and extras in a single pass.
+  const hits: string[] = [];
+  const extra: string[] = [];
+  for (const name of called) {
+    (wanted.has(name) ? hits : extra).push(name);
+  }
+  const missed = [...wanted].filter((name) => !called.has(name));
+
+  const ratio = (num: number, den: number): number => (den > 0 ? num / den : 0);
+  const precision = ratio(hits.length, called.size);
+  const recall = ratio(hits.length, wanted.size);
+  const f1 =
+    precision + recall > 0
+      ? (2 * precision * recall) / (precision + recall)
+      : 0;
+
+  const parts = [
+    `F1=${f1.toFixed(2)} (precision=${precision.toFixed(2)}, recall=${recall.toFixed(2)})`,
+  ];
+  if (missed.length > 0) {
+    parts.push(`missed: [${missed.join(", ")}]`);
+  }
+  if (extra.length > 0) {
+    parts.push(`extra: [${extra.join(", ")}]`);
+  }
+
+  return { score: f1, reason: parts.join(" | ") };
+};
From 304df8df49bbe587c36176c61c3b963018a4ccde Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:44:10 +0200
Subject: [PATCH 11/42] evals: add trajectory evaluator (LCS-based sequence
score)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Computes score = lcs(actual, expected) / max(|actual|, |expected|).
Dividing by the max penalises both missing tools (recall gap) and extra
spurious tools (precision gap) in a single metric. Sequence matters here,
unlike tool-selection which is set-based.
Returns 'N/A' when expected.tools is absent — this guard prevents the
evaluator from emitting meaningless 0-scores on examples that declare no
ordered expectation, which would mask real regressions elsewhere.
LCS is O(m·n) time via a flat DP array to avoid nested-array allocation.
Co-Authored-By: Claude Sonnet 4.6
---
evals/evaluators/trajectory.ts | 79 ++++++++++++++++++++++++++++++++++
1 file changed, 79 insertions(+)
create mode 100644 evals/evaluators/trajectory.ts
diff --git a/evals/evaluators/trajectory.ts b/evals/evaluators/trajectory.ts
new file mode 100644
index 0000000..4e71ec8
--- /dev/null
+++ b/evals/evaluators/trajectory.ts
@@ -0,0 +1,79 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+
+/**
+ * Sequence-aware evaluator: how closely did the LLM follow the expected
+ * tool order?
+ *
+ * Takes the Longest Common Subsequence (LCS) of the actual tool-call names
+ * and `expected.tools`, normalised by the longer of the two sequences:
+ *
+ *   score = lcs(actual, expected) / max(|actual|, |expected|)  ∈ [0, 1]
+ *
+ * Using the max as denominator penalises missing tools and extra tools alike,
+ * so no separate precision/recall terms are needed (tool-selection has those).
+ * Two empty sequences score 1.
+ *
+ * Returns `'N/A'` when `expected.tools` is absent. That guard matters: scoring
+ * against an undefined expectation would emit meaningless 0-scores that mask
+ * real regressions in other evaluators.
+ */
+export const trajectoryScore: Evaluator = (
+  trajectory: Trajectory,
+  expected: ExpectedBehavior
+): EvaluatorResult => {
+  const wanted = expected.tools;
+  if (!wanted) {
+    return { score: "N/A" };
+  }
+
+  const observed = trajectory.map(({ tool }) => tool);
+  const longest = Math.max(observed.length, wanted.length);
+
+  // longest === 0 means both sequences are empty; this also avoids 0 / 0.
+  if (longest === 0) {
+    return { score: 1, reason: "Both actual and expected sequences are empty" };
+  }
+
+  const common = lcs(observed, wanted);
+  const score = common / longest;
+
+  let reason =
+    `LCS=${common} / max(|actual|=${observed.length}, |expected|=${wanted.length})` +
+    `=${longest} → score=${score.toFixed(2)}`;
+  if (score < 1) {
+    // Show both sequences only when they differ.
+    reason += ` | actual=[${observed.join(", ")}] expected=[${wanted.join(", ")}]`;
+  }
+
+  return { score, reason };
+};
+
+/**
+ * Length of the Longest Common Subsequence of `a` and `b`.
+ *
+ * Classic O(m·n) dynamic programme. Elements are compared by identity (===),
+ * which is correct for tool name strings.
+ *
+ * @returns The LCS length; 0 when either input is empty.
+ */
+function lcs(a: string[], b: string[]): number {
+  const m = a.length;
+  const n = b.length;
+  // dp[idx(i, j)] is the LCS length of a[0..i) and b[0..j); row 0 and column 0
+  // stay 0 (empty prefix). A single flat array instead of Array<Array<number>>
+  // avoids allocating one inner array per row.
+  const dp = new Array<number>((m + 1) * (n + 1)).fill(0);
+  const idx = (i: number, j: number): number => i * (n + 1) + j;
+
+  for (let i = 1; i <= m; i++) {
+    for (let j = 1; j <= n; j++) {
+      dp[idx(i, j)] =
+        a[i - 1] === b[j - 1]
+          ? dp[idx(i - 1, j - 1)] + 1
+          : Math.max(dp[idx(i - 1, j)], dp[idx(i, j - 1)]);
+    }
+  }
+
+  return dp[idx(m, n)];
+}
From b838b009740b16bb015dd605d8d1e7a81386642d Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:47:42 +0200
Subject: [PATCH 12/42] evals: add criteria (LLM-as-judge) evaluator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
createCriteriaEvaluator(llm) returns an Evaluator that sends the trajectory
and expected.criteria to a judge LLM with a structured rubric prompt asking
for JSON {score, reasoning}. Returns 'N/A' when expected.criteria is absent.
The factory pattern closes over the LLM provider so datasets can inject
different judges (e.g. a stronger model for criteria, haiku for routing).
Parsing: primary path extracts the first JSON object from the response and
clamps score to [0, 1]. Falls back to a bare-number regex for models that
ignore the JSON instruction, and finally returns score=0 with the raw text
if neither succeeds.
The judge prompt serialises only {tool, args} per call — omitting result
avoids token bloat from large tool outputs while still giving the judge
enough signal to evaluate routing decisions.
Co-Authored-By: Claude Sonnet 4.6
---
evals/evaluators/criteria.ts | 142 +++++++++++++++++++++++++++++++++++
1 file changed, 142 insertions(+)
create mode 100644 evals/evaluators/criteria.ts
diff --git a/evals/evaluators/criteria.ts b/evals/evaluators/criteria.ts
new file mode 100644
index 0000000..1994eac
--- /dev/null
+++ b/evals/evaluators/criteria.ts
@@ -0,0 +1,142 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+import type { LlmProvider } from "../llm/types.js";
+
+/**
+ * LLM-as-judge evaluator: asks an LLM to score the trajectory against
+ * the natural-language assertions in `expected.criteria`.
+ *
+ * Returns `'N/A'` when `expected.criteria` is absent or empty so datasets
+ * that rely only on structural evaluators don't incur extra LLM calls.
+ *
+ * Usage:
+ *   import { createCriteriaEvaluator } from "./criteria.js";
+ *   import { createDefaultLlmProvider } from "../llm/index.js";
+ *
+ *   runDataset(dataset, {
+ *     criteria: createCriteriaEvaluator(createDefaultLlmProvider()),
+ *   });
+ *
+ * The factory pattern is necessary because the `Evaluator` type is a plain
+ * function — the LLM provider is closed over rather than passed as an arg.
+ *
+ * @param llm Provider used as the judge; called once per evaluated example,
+ *   with a single user message and no tools.
+ * @returns An async Evaluator. Errors thrown by `llm.chat` propagate to the caller.
+ */
+export function createCriteriaEvaluator(llm: LlmProvider): Evaluator {
+  return async (
+    trajectory: Trajectory,
+    expected: ExpectedBehavior
+  ): Promise<EvaluatorResult> => {
+    if (!expected.criteria || expected.criteria.length === 0) {
+      return { score: "N/A" };
+    }
+
+    const prompt = buildJudgePrompt(trajectory, expected.criteria);
+    const response = await llm.chat([{ role: "user", content: prompt }], []);
+    // A reply without text content is parsed as an empty string.
+    const text = response.content ?? "";
+
+    return parseJudgeResponse(text);
+  };
+}
+
+/**
+ * Builds the rubric prompt sent to the judge LLM.
+ *
+ * Asks for a JSON object with `score` (0–1) and `reasoning` (string) so
+ * parsing is deterministic. The trajectory is serialised as a pretty-printed
+ * JSON array of `{tool, args}` pairs — `result` is omitted to avoid token
+ * bloat from large tool outputs.
+ *
+ * @param trajectory Tool calls made by the assistant, in order.
+ * @param criteria Natural-language assertions; rendered as a numbered list.
+ * @returns The complete prompt text.
+ */
+function buildJudgePrompt(trajectory: Trajectory, criteria: string[]): string {
+  const trajectoryStr = JSON.stringify(
+    trajectory.map(({ tool, args }) => ({ tool, args })),
+    null,
+    2
+  );
+
+  const criteriaList = criteria
+    .map((c, i) => `${i + 1}. ${c}`)
+    .join("\n");
+
+  // The final line is a response template: the angle-bracket placeholders tell
+  // the judge what to put in each field.
+  return `You are an impartial evaluator assessing the quality of an AI assistant's tool-calling behaviour.
+
+## Trajectory (tools the assistant called, in order)
+
+\`\`\`json
+${trajectoryStr}
+\`\`\`
+
+## Evaluation criteria
+
+${criteriaList}
+
+## Task
+
+Score how well the trajectory satisfies ALL of the criteria above on a scale from 0.0 to 1.0:
+- 1.0 All criteria fully satisfied
+- 0.75 Most criteria satisfied with minor gaps
+- 0.5 About half the criteria satisfied
+- 0.25 Most criteria unmet with only minor satisfaction
+- 0.0 No criteria satisfied at all
+
+Respond with a single JSON object — no markdown fences, no extra text:
+{"score": <number between 0.0 and 1.0>, "reasoning": "<brief justification>"}`;
+}
+
+/**
+ * Returns the first value that parses as JSON from a span starting at the
+ * first `{` in `text`, or undefined when there is none.
+ *
+ * The longest span (up to the last `}`) is tried first, then progressively
+ * shorter ones, so a stray `}` after the object does not defeat parsing.
+ */
+function extractJsonObject(text: string): unknown {
+  const start = text.indexOf("{");
+  if (start === -1) {
+    return undefined;
+  }
+  for (
+    let end = text.lastIndexOf("}");
+    end > start;
+    end = text.lastIndexOf("}", end - 1)
+  ) {
+    try {
+      return JSON.parse(text.slice(start, end + 1)) as unknown;
+    } catch {
+      // Not valid JSON up to this brace; try the previous closing brace.
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Parses the judge LLM's response into an EvaluatorResult.
+ *
+ * Strategies, in order:
+ *  1. A JSON object with a numeric `score`; the score is clamped to [0, 1] and
+ *     `reasoning` becomes the reason when it is a string.
+ *  2. A number in [0, 1] found in prose. A number that follows the word
+ *     "score" is preferred, so list numbering such as "1." is not mistaken
+ *     for the score; otherwise the first bare number in range is used.
+ *  3. Score 0 with the (truncated) raw response as the reason.
+ *
+ * @param text Raw text content returned by the judge.
+ */
+function parseJudgeResponse(text: string): EvaluatorResult {
+  const trimmed = text.trim();
+
+  // Primary: a JSON object carrying a numeric score.
+  const parsed = extractJsonObject(trimmed);
+  if (
+    typeof parsed === "object" &&
+    parsed !== null &&
+    "score" in parsed &&
+    typeof (parsed as Record<string, unknown>).score === "number"
+  ) {
+    const { score, reasoning } = parsed as {
+      score: number;
+      reasoning?: unknown;
+    };
+    const clampedScore = Math.min(1, Math.max(0, score));
+    return {
+      score: clampedScore,
+      reason:
+        typeof reasoning === "string"
+          ? reasoning
+          : `raw judge response: ${trimmed}`,
+    };
+  }
+
+  // Fallback: a decimal / integer in [0, 1], preferring one labelled "score".
+  const numMatch =
+    trimmed.match(/\bscore\b[^0-9]{0,20}?(1(?:\.0+)?|0(?:\.\d+)?)(?!\.?\d)/i) ??
+    trimmed.match(/\b(1(?:\.0+)?|0(?:\.\d+)?)\b/);
+  if (numMatch) {
+    const score = parseFloat(numMatch[1]);
+    return {
+      score,
+      reason: `score parsed from prose; raw response: ${trimmed.slice(0, 200)}`,
+    };
+  }
+
+  return {
+    score: 0,
+    reason: `judge response could not be parsed; raw response: ${trimmed.slice(0, 200)}`,
+  };
+}
From 60eebb35d26dc3286303457249643769530b6c35 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:50:52 +0200
Subject: [PATCH 13/42] evals: add detection-rule-management dataset (4
positives + 4 distractors)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Proves the eval harness end-to-end against the existing manage-rules skill.
Positives (drm-pos-01..04): natural-language queries about viewing/finding
detection rules — the LLM should call manage-rules. Evaluated with
skill-activation + tool-selection (≥80% gate).
Distractors (drm-neg-01..04): case creation, alert triage, ES|QL hunting,
host investigation — the LLM should NOT call manage-rules. Evaluated with
negative-activation (100% gate — any false positive is a regression).
Two separate runDataset calls wire the correct evaluators and thresholds
to each example group without mixing evaluator semantics across types.
Co-Authored-By: Claude Sonnet 4.6
---
.../detection-rule-management.dataset.ts | 125 ++++++++++++++++++
1 file changed, 125 insertions(+)
create mode 100644 evals/datasets/detection-rule-management.dataset.ts
diff --git a/evals/datasets/detection-rule-management.dataset.ts b/evals/datasets/detection-rule-management.dataset.ts
new file mode 100644
index 0000000..09c6563
--- /dev/null
+++ b/evals/datasets/detection-rule-management.dataset.ts
@@ -0,0 +1,125 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Dataset, Example } from "../types.js";
+import { runDataset } from "../runner.js";
+import { skillActivation } from "../evaluators/skill-activation.js";
+import { negativeActivation } from "../evaluators/negative-activation.js";
+import { toolSelection } from "../evaluators/tool-selection.js";
+
+/**
+ * The model-facing entry-point tool registered by the
+ * detection-rule-management skill (src/tools/detection-rules.ts).
+ */
+const SKILL_TOOL = "manage-rules";
+
+// ---------------------------------------------------------------------------
+// Positive examples — the LLM should call manage-rules
+// ---------------------------------------------------------------------------
+
+const positiveExamples: Example[] = [
+ {
+ id: "drm-pos-01",
+ input: "Show me my noisy rules — which detection rules are generating the most alerts?",
+ expected: {
+ skill: SKILL_TOOL,
+ tools: [SKILL_TOOL],
+ },
+ },
+ {
+ id: "drm-pos-02",
+ input: "List all my currently enabled detection rules",
+ expected: {
+ skill: SKILL_TOOL,
+ tools: [SKILL_TOOL],
+ },
+ },
+ {
+ id: "drm-pos-03",
+ input: "Find high severity detection rules related to PowerShell execution",
+ expected: {
+ skill: SKILL_TOOL,
+ tools: [SKILL_TOOL],
+ },
+ },
+ {
+ id: "drm-pos-04",
+ input: "What detection rules do I have covering initial access tactics?",
+ expected: {
+ skill: SKILL_TOOL,
+ tools: [SKILL_TOOL],
+ },
+ },
+];
+
+// ---------------------------------------------------------------------------
+// Distractor examples — the LLM should NOT call manage-rules
+// ---------------------------------------------------------------------------
+
+const distractorExamples: Example[] = [
+ {
+ id: "drm-neg-01",
+ input: "Create a new case for a ransomware incident I'm currently investigating",
+ expected: {
+ // skill is set so negativeActivation knows which tool to check for absence
+ skill: SKILL_TOOL,
+ },
+ },
+ {
+ id: "drm-neg-02",
+ input: "Show me all critical alerts that fired in the last hour",
+ expected: {
+ skill: SKILL_TOOL,
+ },
+ },
+ {
+ id: "drm-neg-03",
+ input: "Run an ES|QL query to find failed SSH login attempts on my Linux hosts",
+ expected: {
+ skill: SKILL_TOOL,
+ },
+ },
+ {
+ id: "drm-neg-04",
+ input: "A process on host web-01 just spawned cmd.exe — help me investigate",
+ expected: {
+ skill: SKILL_TOOL,
+ },
+ },
+];
+
+// ---------------------------------------------------------------------------
+// Export the full dataset for reference / cross-dataset tooling
+// ---------------------------------------------------------------------------
+
+export const detectionRuleManagementDataset: Dataset = {
+ name: "detection-rule-management",
+ examples: [...positiveExamples, ...distractorExamples],
+};
+
+// ---------------------------------------------------------------------------
+// Vitest eval suites
+// Each runDataset call registers a describe block gated on RUN_LLM_EVALS.
+// Positives and distractors use different evaluators and passing thresholds.
+// ---------------------------------------------------------------------------
+
+runDataset(
+ { name: "detection-rule-management: positives", examples: positiveExamples },
+ {
+ "skill-activation": skillActivation,
+ "tool-selection": toolSelection,
+ },
+ { passingScore: 0.8 }
+);
+
+runDataset(
+ { name: "detection-rule-management: distractors", examples: distractorExamples },
+ {
+ "negative-activation": negativeActivation,
+ },
+ { passingScore: 1.0 } // 100% — any false positive is a regression
+);
From 726b3bd549438db0788ded80109bf83824d1121a Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:54:39 +0200
Subject: [PATCH 14/42] evals: add detection-rule-management.eval.test.ts;
split dataset from test orchestration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Separates data from test concerns:
- detection-rule-management.dataset.ts now only exports data (positiveExamples,
distractorExamples, detectionRuleManagementDataset); no runDataset calls
- detection-rule-management.eval.test.ts is the Vitest entry point that
imports the sub-arrays and calls runDataset with the correct evaluators
Gate layout (unchanged from before):
positives — skill-activation + tool-selection, passingScore: 0.8
distractors — negative-activation, passingScore: 1.0
The .eval.test.ts suffix matches the include glob in evals/vitest.config.ts
so `npm run test:evals` picks it up without further config changes.
Co-Authored-By: Claude Sonnet 4.6
---
.../detection-rule-management.dataset.ts | 30 +----------
evals/detection-rule-management.eval.test.ts | 54 +++++++++++++++++++
2 files changed, 56 insertions(+), 28 deletions(-)
create mode 100644 evals/detection-rule-management.eval.test.ts
diff --git a/evals/datasets/detection-rule-management.dataset.ts b/evals/datasets/detection-rule-management.dataset.ts
index 09c6563..a1e2a2c 100644
--- a/evals/datasets/detection-rule-management.dataset.ts
+++ b/evals/datasets/detection-rule-management.dataset.ts
@@ -6,10 +6,6 @@
*/
import type { Dataset, Example } from "../types.js";
-import { runDataset } from "../runner.js";
-import { skillActivation } from "../evaluators/skill-activation.js";
-import { negativeActivation } from "../evaluators/negative-activation.js";
-import { toolSelection } from "../evaluators/tool-selection.js";
/**
* The model-facing entry-point tool registered by the
@@ -21,7 +17,7 @@ const SKILL_TOOL = "manage-rules";
// Positive examples — the LLM should call manage-rules
// ---------------------------------------------------------------------------
-const positiveExamples: Example[] = [
+export const positiveExamples: Example[] = [
{
id: "drm-pos-01",
input: "Show me my noisy rules — which detection rules are generating the most alerts?",
@@ -60,7 +56,7 @@ const positiveExamples: Example[] = [
// Distractor examples — the LLM should NOT call manage-rules
// ---------------------------------------------------------------------------
-const distractorExamples: Example[] = [
+export const distractorExamples: Example[] = [
{
id: "drm-neg-01",
input: "Create a new case for a ransomware incident I'm currently investigating",
@@ -101,25 +97,3 @@ export const detectionRuleManagementDataset: Dataset = {
examples: [...positiveExamples, ...distractorExamples],
};
-// ---------------------------------------------------------------------------
-// Vitest eval suites
-// Each runDataset call registers a describe block gated on RUN_LLM_EVALS.
-// Positives and distractors use different evaluators and passing thresholds.
-// ---------------------------------------------------------------------------
-
-runDataset(
- { name: "detection-rule-management: positives", examples: positiveExamples },
- {
- "skill-activation": skillActivation,
- "tool-selection": toolSelection,
- },
- { passingScore: 0.8 }
-);
-
-runDataset(
- { name: "detection-rule-management: distractors", examples: distractorExamples },
- {
- "negative-activation": negativeActivation,
- },
- { passingScore: 1.0 } // 100% — any false positive is a regression
-);
diff --git a/evals/detection-rule-management.eval.test.ts b/evals/detection-rule-management.eval.test.ts
new file mode 100644
index 0000000..ec9cab3
--- /dev/null
+++ b/evals/detection-rule-management.eval.test.ts
@@ -0,0 +1,54 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * End-to-end eval spec for the detection-rule-management skill.
+ *
+ * Proves the eval harness (runner → runMcpHostLoop → evaluators) works
+ * against a real registered skill using the proof dataset. Run via:
+ *
+ * npm run test:evals
+ *
+ * This suite is skipped in regular `npm test` because runDataset wraps
+ * everything in `describe.skipIf(!process.env.RUN_LLM_EVALS)`.
+ *
+ * Gate summary:
+ * positives — skill-activation + tool-selection ≥ 80%
+ * distractors — negative-activation = 100% (any false positive is a regression)
+ */
+
+import { runDataset } from "./runner.js";
+import {
+ positiveExamples,
+ distractorExamples,
+} from "./datasets/detection-rule-management.dataset.js";
+import { skillActivation } from "./evaluators/skill-activation.js";
+import { negativeActivation } from "./evaluators/negative-activation.js";
+import { toolSelection } from "./evaluators/tool-selection.js";
+
+// Positives: the skill's entry-point tool should be called.
+// Gate: skill-activation + tool-selection >= 80%.
+const positiveSuite = {
+  name: "detection-rule-management: positives",
+  examples: positiveExamples,
+};
+const positiveEvaluators = {
+  "skill-activation": skillActivation,
+  "tool-selection": toolSelection,
+};
+runDataset(positiveSuite, positiveEvaluators, { passingScore: 0.8 });
+
+// Distractors: the skill's entry-point tool must stay untouched.
+// Gate: negative-activation = 100% — any false positive is a regression.
+const distractorSuite = {
+  name: "detection-rule-management: distractors",
+  examples: distractorExamples,
+};
+const distractorEvaluators = {
+  "negative-activation": negativeActivation,
+};
+runDataset(distractorSuite, distractorEvaluators, { passingScore: 1.0 });
From 77844857b9b62e452690740a39bc4befd9f4b8b3 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 11:57:55 +0200
Subject: [PATCH 15/42] ci: add evals.yml GitHub Actions workflow
Triggers:
- workflow_dispatch manual run from Actions UI
- schedule (0 2 * * *) nightly at 02:00 UTC
- pull_request_target only when 'evals' label is added; gated by label
write permission so only maintainers can trigger
Concurrency group (prefix 'evals-') cancels in-progress runs on new pushes,
preventing redundant jobs from burning LLM quota.
The 'Run evals' step sets RUN_LLM_EVALS=1 and passes four secrets:
EVAL_ANTHROPIC_API_KEY Claude Haiku (priority)
EVAL_OPENAI_API_KEY GPT-4o-mini fallback
EVAL_LITELLM_BASE_URL optional LiteLLM proxy base URL
EVAL_CLUSTERS_JSON Elastic cluster credentials for the MCP server
Output is captured with tee so it appears in the job log AND in eval-output.txt.
A separate 'Post eval results' step (if: always()) appends '## Eval results'
plus the full output to $GITHUB_STEP_SUMMARY so the rendered Markdown tables
from the runner appear in the Actions job summary.
For pull_request_target the checkout uses the PR head SHA so evals run against
the proposed changes rather than the base branch.
Co-Authored-By: Claude Sonnet 4.6
---
.github/workflows/evals.yml | 87 +++++++++++++++++++++++++++++++++++++
1 file changed, 87 insertions(+)
create mode 100644 .github/workflows/evals.yml
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
new file mode 100644
index 0000000..c4b951a
--- /dev/null
+++ b/.github/workflows/evals.yml
@@ -0,0 +1,87 @@
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License
+# 2.0; you may not use this file except in compliance with the Elastic License
+# 2.0.
+
+name: Evals
+
+on:
+  # Manually trigger a run from the Actions UI (useful for ad-hoc evaluation).
+  workflow_dispatch:
+
+  # Nightly run at 02:00 UTC to catch regressions before the work day starts.
+  schedule:
+    - cron: "0 2 * * *"
+
+  # Run when a PR is labeled with `evals`. Applying a label needs triage
+  # permission or higher, which limits triggering to trusted collaborators —
+  # required because pull_request_target runs with base-repo secrets.
+  pull_request_target:
+    types: [labeled]
+
+# Read-only token: the job executes code from the PR head (see checkout below).
+permissions:
+  contents: read
+
+# One group per PR (or per ref for scheduled/manual runs). github.ref alone is
+# the base branch on pull_request_target, so all PRs and the nightly run would
+# cancel each other; the label suffix isolates unrelated (skipped) label events.
+concurrency:
+  group: evals-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name }}
+  cancel-in-progress: true
+
+jobs:
+  evals:
+    name: LLM Eval Suite
+    runs-on: ubuntu-latest
+
+    # For pull_request_target, gate strictly on the evals label so the job
+    # doesn't fire for every other label event.
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'schedule' ||
+      (github.event_name == 'pull_request_target' && github.event.label.name == 'evals')
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # For pull_request_target, check out the PR head (the proposed changes);
+          # other events have no pull_request payload and fall back to github.sha.
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+          # PR code runs in later steps; keep the token out of .git/config.
+          persist-credentials: false
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: npm
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Run evals
+        env:
+          RUN_LLM_EVALS: "1"
+          # ANTHROPIC_API_KEY selects Claude Haiku (preferred); OPENAI_API_KEY is the
+          # GPT-4o-mini fallback. LITELLM_BASE_URL routes through a LiteLLM proxy
+          # instead of the direct OpenAI endpoint. Values come from EVAL_* secrets.
+          ANTHROPIC_API_KEY: ${{ secrets.EVAL_ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.EVAL_OPENAI_API_KEY }}
+          LITELLM_BASE_URL: ${{ secrets.EVAL_LITELLM_BASE_URL }}
+          # JSON array describing the Elastic cluster the MCP server targets.
+          # Shape: [{"name":"primary","elasticsearchUrl":"...","kibanaUrl":"...","elasticsearchApiKey":"..."}]
+          CLUSTERS_JSON: ${{ secrets.EVAL_CLUSTERS_JSON }}
+        run: |
+          set -o pipefail
+          npm run test:evals 2>&1 | tee eval-output.txt
+
+      - name: Post eval results to job summary
+        if: always()
+        run: |
+          echo "## Eval results" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          if [ -f eval-output.txt ]; then
+            cat eval-output.txt >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "_No eval output captured._" >> "$GITHUB_STEP_SUMMARY"
+          fi
From ac864b8ad20b5c958979e0a9506f1097c3857390 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:02:27 +0200
Subject: [PATCH 16/42] =?UTF-8?q?docs:=20add=20evals.md=20=E2=80=94=20harn?=
=?UTF-8?q?ess=20design,=20dataset=20shape,=20evaluator=20catalog,=20CI=20?=
=?UTF-8?q?gating?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Covers:
- Architecture diagram showing runner → runMcpHostLoop → evaluators pipeline
- Key design choices table (in-process transport, skip-if guard, N/A semantics)
- Dataset shape reference with all three optional expected fields documented
- Positive vs distractor example pattern with runDataset code snippets
- Evaluator catalog: type, score range, N/A condition, and recommended gate for
all five evaluators (skill-activation, negative-activation, tool-selection,
trajectory, criteria)
- Step-by-step how-to-add-dataset guide with copy-paste templates
- CI gating: workflow triggers, required secrets table, passing threshold table
Co-Authored-By: Claude Sonnet 4.6
---
docs/evals.md | 260 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 260 insertions(+)
create mode 100644 docs/evals.md
diff --git a/docs/evals.md b/docs/evals.md
new file mode 100644
index 0000000..486af33
--- /dev/null
+++ b/docs/evals.md
@@ -0,0 +1,260 @@
+# Eval Harness
+
+LLM-powered evals for the Elastic Security MCP app's skill layer. The harness
+tests whether the LLM host activates the right skill, calls the right tools in
+the right order, and does not fire on unrelated queries.
+
+Regular `npm test` never touches this harness — it only runs when
+`RUN_LLM_EVALS=1` is set, so CI stays fast and free of LLM costs.
+
+---
+
+## Architecture
+
+```
+Dataset (examples)
+ │
+ ▼
+runner.ts ─ describe.skipIf(!RUN_LLM_EVALS)(dataset.name, () => {
+ │ for each example:
+ │ trajectory = await runMcpHostLoop(input)
+ │ scores = await evaluators[*](trajectory, expected)
+ │ assert score >= passingScore
+ │ afterAll: print Markdown table to stdout
+ │ })
+ │
+ ├── runMcpHostLoop(input)
+ │ InMemoryTransport ─ Client ─ McpServer
+ │ LLM provider (Anthropic / OpenAI / LiteLLM)
+ │ loop ≤ MAX_TURNS=8: LLM → tool calls → results → repeat
+ │ returns Trajectory (ordered ToolCall[])
+ │
+ └── Evaluators
+ skill-activation binary: was skill tool called?
+ negative-activation binary: was skill tool correctly absent?
+ tool-selection F1 precision/recall against expected.tools
+ trajectory LCS similarity of actual vs expected sequence
+ criteria LLM-as-judge against natural-language assertions
+```
+
+### Key design choices
+
+| Decision | Rationale |
+|---|---|
+| In-process via `InMemoryTransport` | No network, no server process — evals run anywhere |
+| `describe.skipIf(!RUN_LLM_EVALS)` | Zero LLM cost in regular `npm test` |
+| `Evaluator` is a plain function | Easy to compose; factory pattern for stateful evaluators (criteria) |
+| `'N/A'` return instead of 0 | Datasets omit irrelevant evaluator dimensions without masking real regressions |
+| LCS for trajectory | Order matters; set-based coverage is tool-selection's job |
+
+---
+
+## Dataset shape
+
+A dataset is a `Dataset` object exported from a `*.dataset.ts` file:
+
+```typescript
+import type { Dataset } from "../types.js";
+
+export const myDataset: Dataset = {
+ name: "my-skill",
+ examples: [
+ {
+ id: "ms-pos-01", // stable, unique — appears in CI summaries
+ input: "user message to the LLM", // the query sent to runMcpHostLoop
+ expected: {
+ skill: "entry-point-tool-name", // tool the skill SKILL.md instructs the LLM to call
+ tools: ["entry-point-tool-name"], // ordered list for trajectory/tool-selection
+ criteria: [ // natural-language assertions for LLM-as-judge
+ "The model called the correct entry-point tool",
+ ],
+ },
+ },
+ ],
+};
+```
+
+All three `expected` fields are **optional**:
+
+| Field | Evaluators that use it | Omit when… |
+|---|---|---|
+| `skill` | `skill-activation`, `negative-activation` | Dataset doesn't test skill routing |
+| `tools` | `tool-selection`, `trajectory` | No expected tool set or call order |
+| `criteria` | `criteria` | No LLM-as-judge needed (saves cost) |
+
+Omitting a field causes the evaluator to return `'N/A'` for that example rather than a false 0.
+
+### Positive vs distractor examples
+
+A **positive** example is a query that *should* activate the skill.
+A **distractor** example is an unrelated query that *should not*.
+
+Use separate `runDataset` calls with different evaluators for each group:
+
+```typescript
+// Positive: skill should fire
+runDataset(
+ { name: "my-skill: positives", examples: positiveExamples },
+ { "skill-activation": skillActivation, "tool-selection": toolSelection },
+ { passingScore: 0.8 }
+);
+
+// Distractor: skill must NOT fire (gate is 100%)
+runDataset(
+ { name: "my-skill: distractors", examples: distractorExamples },
+ { "negative-activation": negativeActivation },
+ { passingScore: 1.0 }
+);
+```
+
+---
+
+## Evaluator catalog
+
+### `skill-activation`
+
+**Type**: binary · **Score**: `1` if `expected.skill` found in trajectory, `0` otherwise
+**Returns `'N/A'`**: when `expected.skill` is absent
+**Gate**: ≥ 0.8 on positive examples (use `passingScore: 0.8`)
+
+Tests whether the LLM called the skill's model-facing entry-point tool at
+least once.
+
+### `negative-activation`
+
+**Type**: binary · **Score**: `1` if `expected.skill` is *absent* from trajectory, `0` if present
+**Returns `'N/A'`**: when `expected.skill` is absent
+**Gate**: 1.0 on distractor examples (use `passingScore: 1.0`)
+
+Tests that the skill does not over-trigger on unrelated queries. Any false
+positive here means the skill's SKILL.md is too broad.
+
+### `tool-selection`
+
+**Type**: F1 · **Score**: harmonic mean of precision and recall against `expected.tools` (set-based)
+**Returns `'N/A'`**: when `expected.tools` is absent
+**Gate**: ≥ 0.8 on positive examples
+
+Tests *which* tools were called, ignoring order. Missed tools lower recall;
+spurious tools lower precision. Failure reason includes `missed: [...]` and
+`extra: [...]`.
+
+### `trajectory`
+
+**Type**: LCS similarity · **Score**: `lcs(actual, expected) / max(|actual|, |expected|)`
+**Returns `'N/A'`**: when `expected.tools` is absent
+**Gate**: ≥ 0.7 on positive examples (sequence matching is looser than set matching)
+
+Tests *order*. Dividing by `max` penalises both missing and extra steps.
+Use alongside `tool-selection` for full coverage.
+
+### `criteria`
+
+**Type**: LLM-as-judge · **Score**: `0.0–1.0` parsed from a rubric prompt response
+**Returns `'N/A'`**: when `expected.criteria` is absent
+**Gate**: ≥ 0.7
+
+Calls the judge LLM with the trajectory `{tool, args}` pairs and the
+criteria list. Asks for `{"score": <0–1>, "reasoning": "..."}`. Falls back
+to regex number extraction if JSON parse fails. Use for semantic assertions
+that structural evaluators can't express.
+
+**Cost**: one extra LLM call per example. Omit `expected.criteria` to skip.
+
+---
+
+## How to add a dataset
+
+1. **Create the data file** `evals/datasets/<skill-name>.dataset.ts`:
+
+ ```typescript
+ import type { Dataset, Example } from "../types.js";
+
+ const SKILL_TOOL = "my-tool"; // the model-facing entry-point tool
+
+ export const positiveExamples: Example[] = [
+ { id: "ms-pos-01", input: "...", expected: { skill: SKILL_TOOL, tools: [SKILL_TOOL] } },
+ // add ≥ 4 examples
+ ];
+
+ export const distractorExamples: Example[] = [
+ { id: "ms-neg-01", input: "...", expected: { skill: SKILL_TOOL } },
+ // add ≥ 4 examples
+ ];
+
+ export const myDataset: Dataset = {
+ name: "<skill-name>",
+ examples: [...positiveExamples, ...distractorExamples],
+ };
+ ```
+
+2. **Create the eval spec** `evals/<skill-name>.eval.test.ts`:
+
+ ```typescript
+ import { runDataset } from "./runner.js";
+ import { positiveExamples, distractorExamples } from "./datasets/<skill-name>.dataset.js";
+ import { skillActivation } from "./evaluators/skill-activation.js";
+ import { negativeActivation } from "./evaluators/negative-activation.js";
+ import { toolSelection } from "./evaluators/tool-selection.js";
+
+ runDataset(
+ { name: "<skill-name>: positives", examples: positiveExamples },
+ { "skill-activation": skillActivation, "tool-selection": toolSelection },
+ { passingScore: 0.8 }
+ );
+
+ runDataset(
+ { name: "<skill-name>: distractors", examples: distractorExamples },
+ { "negative-activation": negativeActivation },
+ { passingScore: 1.0 }
+ );
+ ```
+
+3. **Run locally**:
+
+ ```bash
+ # Anthropic (preferred)
+ ANTHROPIC_API_KEY=sk-ant-... CLUSTERS_JSON='[{...}]' npm run test:evals
+
+ # OpenAI / LiteLLM proxy
+ OPENAI_API_KEY=sk-... LITELLM_BASE_URL=https://... CLUSTERS_JSON='[{...}]' npm run test:evals
+ ```
+
+4. **Trigger in CI**: open a PR and add the `evals` label (requires write access).
+
+---
+
+## CI gating
+
+### Workflow: `.github/workflows/evals.yml`
+
+| Trigger | When |
+|---|---|
+| `workflow_dispatch` | Manual run from Actions UI |
+| `schedule` | Nightly at 02:00 UTC |
+| `pull_request_target` | When `evals` label is added to a PR |
+
+The workflow's `evals-*` concurrency group cancels superseded runs to avoid wasting
+LLM quota on stale pushes.
+
+### Required secrets
+
+| Secret | Purpose |
+|---|---|
+| `EVAL_ANTHROPIC_API_KEY` | Anthropic API key (priority provider) |
+| `EVAL_OPENAI_API_KEY` | OpenAI / LiteLLM API key (fallback) |
+| `EVAL_LITELLM_BASE_URL` | Optional LiteLLM proxy base URL |
+| `EVAL_CLUSTERS_JSON` | Elastic cluster credentials for the MCP server |
+
+### Passing thresholds (recommended defaults)
+
+| Evaluator | Positives | Distractors |
+|---|---|---|
+| `skill-activation` | ≥ 0.8 | — |
+| `negative-activation` | — | = 1.0 |
+| `tool-selection` | ≥ 0.8 | — |
+| `trajectory` | ≥ 0.7 | — |
+| `criteria` | ≥ 0.7 | — |
+
+Results are posted as a Markdown table to the GitHub Actions job summary
+(`$GITHUB_STEP_SUMMARY`) after every run.
From e9a23fa184702423613da968d674ad707fed9459 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:08:54 +0200
Subject: [PATCH 17/42] feat: add MigrationsService wrapping 14
/internal/siem_migrations/* Kibana routes
Service injects KibanaClient directly (no separate *Client indirection since
these are internal-only Kibana routes with no public API equivalent). The
KibanaClient already supplies x-elastic-internal-origin: Kibana; each method
adds elastic-api-version: 2023-10-31 via MIGRATION_HEADERS per-request.
14 methods, one per route:
createMigration POST /internal/siem_migrations/rules
listMigrations GET /internal/siem_migrations/rules
getMigration GET /internal/siem_migrations/rules/:id
deleteMigration DELETE /internal/siem_migrations/rules/:id
uploadRules POST /internal/siem_migrations/rules/:id/rules
getTranslatedRules GET /internal/siem_migrations/rules/:id/rules
getTranslatedRule GET /internal/siem_migrations/rules/:id/rules/:ruleId
updateTranslatedRule PUT /internal/siem_migrations/rules/:id/rules/:ruleId
startTranslation POST /internal/siem_migrations/rules/:id/start
stopTranslation POST /internal/siem_migrations/rules/:id/stop
getResources GET /internal/siem_migrations/resources/:id
upsertResources POST /internal/siem_migrations/resources/:id
installRules POST /internal/siem_migrations/rules/:id/install
getStats GET /internal/siem_migrations/rules/:id/stats
MigrationApiError wraps every non-2xx response with typed status (extracted
from the Kibana client's "Kibana [cluster] STATUS: body" error format) and the
request path so callers can surface actionable error messages.
Domain types: SiemMigration, TranslatedRule, MigrationResource, MigrationStats
and associated option/result interfaces, all barrel-exported from service/index.
Co-Authored-By: Claude Sonnet 4.6
---
src/elastic/service/index.ts | 15 +
src/elastic/service/migrationsService.ts | 361 +++++++++++++++++++++++
2 files changed, 376 insertions(+)
create mode 100644 src/elastic/service/migrationsService.ts
diff --git a/src/elastic/service/index.ts b/src/elastic/service/index.ts
index 38671ee..3c6e574 100644
--- a/src/elastic/service/index.ts
+++ b/src/elastic/service/index.ts
@@ -19,3 +19,18 @@ export type {
ScenarioRuleDef,
} from "./sampleDataService.js";
export { SampleDataService, SCENARIO_NAMES, SCENARIO_RULES } from "./sampleDataService.js";
+export type {
+ SiemMigration,
+ TranslatedRule,
+ MigrationResource,
+ MigrationStats,
+ ListTranslatedRulesOptions,
+ ListTranslatedRulesResult,
+ InstallRulesOptions,
+ InstallRulesResult,
+} from "./migrationsService.js";
+export {
+ MigrationApiError,
+ MigrationsService,
+ SIEM_MIGRATIONS_API_BASE,
+} from "./migrationsService.js";
diff --git a/src/elastic/service/migrationsService.ts b/src/elastic/service/migrationsService.ts
new file mode 100644
index 0000000..ffd0dd4
--- /dev/null
+++ b/src/elastic/service/migrationsService.ts
@@ -0,0 +1,361 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { KibanaClient } from "../kibana-client/index.js";
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+export const SIEM_MIGRATIONS_API_BASE = "/internal/siem_migrations";
+
+/**
+ * Per-request headers required by the Kibana internal SIEM migrations API.
+ * `x-elastic-internal-origin: Kibana` is pre-baked into `KibanaClient`;
+ * only the versioning header needs to be added on each call.
+ */
+const MIGRATION_HEADERS = {
+ "elastic-api-version": "2023-10-31",
+} as const;
+
+// ---------------------------------------------------------------------------
+// Domain types
+// ---------------------------------------------------------------------------
+
+export interface SiemMigration {
+ id: string;
+ name: string;
+ /** Lifecycle status of the migration. */
+ status: "ready" | "running" | "finished" | "error";
+ created_at: string;
+ last_updated_at: string;
+ rules: {
+ total: number;
+ pending: number;
+ processing: number;
+ completed: number;
+ failed: number;
+ installable: number;
+ installed: number;
+ partially_translated: number;
+ untranslatable: number;
+ };
+}
+
+export interface TranslatedRule {
+  id: string;
+  migration_id: string;
+  status: "pending" | "processing" | "completed" | "failed";
+  translation_result?: "full" | "partial" | "untranslatable";
+  elastic_rule?: Record<string, unknown>; // free-form JSON object; shape is not validated here
+  original_rule: Record<string, unknown>; // free-form JSON object; shape is not validated here
+  comments?: string[];
+}
+
+export interface MigrationResource {
+ type: "macro" | "lookup";
+ name: string;
+ content: string;
+}
+
+export interface MigrationStats {
+ id: string;
+ status: SiemMigration["status"];
+ rules: SiemMigration["rules"];
+}
+
+export interface ListTranslatedRulesOptions {
+ readonly page?: number;
+ readonly perPage?: number;
+ readonly filter?: string;
+}
+
+export interface ListTranslatedRulesResult {
+ data: TranslatedRule[];
+ total: number;
+}
+
+export interface InstallRulesOptions {
+ /** Specific rule IDs to install; omit to install all installable rules. */
+ ids?: string[];
+}
+
+export interface InstallRulesResult {
+ installed: number;
+ failed: number;
+}
+
+// ---------------------------------------------------------------------------
+// Typed error
+// ---------------------------------------------------------------------------
+
+/**
+ * Thrown by every {@link MigrationsService} method when its request fails.
+ *
+ * `status` is read from the status slot of the Kibana client error format
+ * `"Kibana [name] STATUS: detail"`; it is 0 when the cause does not follow
+ * that format. `path` is the request path; an `Error` cause is kept as `cause`.
+ */
+export class MigrationApiError extends Error {
+  readonly status: number;
+  readonly path: string;
+
+  constructor(path: string, cause: unknown) {
+    const causeMsg = cause instanceof Error ? cause.message : String(cause);
+    // Anchor on "Kibana [name] STATUS" so digits elsewhere (IPs, ports, names) are ignored.
+    const statusMatch = causeMsg.match(/Kibana \[.*?\]\s+([1-5]\d{2})\b/);
+    const status = statusMatch ? parseInt(statusMatch[1], 10) : 0;
+
+    super(`SIEM Migrations API error on ${path}: ${causeMsg}`);
+    this.name = "MigrationApiError";
+    this.status = status;
+    this.path = path;
+    if (cause instanceof Error) {
+      this.cause = cause;
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Service
+// ---------------------------------------------------------------------------
+
+interface MigrationsServiceOptions {
+ readonly kibanaClient: KibanaClient;
+}
+
+/**
+ * Thin wrapper over the 14 `/internal/siem_migrations/*` Kibana routes.
+ *
+ * Every method adds `elastic-api-version: 2023-10-31`; the underlying
+ * {@link KibanaClient} supplies `x-elastic-internal-origin: Kibana` and
+ * authentication on every request. Any error thrown by the client call is
+ * re-thrown wrapped in {@link MigrationApiError}.
+ */
+export class MigrationsService {
+  private readonly client: KibanaClient;
+
+  constructor(options: MigrationsServiceOptions) {
+    this.client = options.kibanaClient;
+  }
+
+  // ── Migration lifecycle ──────────────────────────────────────────────────
+
+  /** POST /internal/siem_migrations/rules — sends `{ name }`, resolves with the response body. */
+  async createMigration(name: string): Promise<{ migration_id: string }> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules`;
+    try {
+      const { data } = await this.client.post<{ migration_id: string }>(
+        path,
+        { name },
+        { headers: MIGRATION_HEADERS }
+      );
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** GET /internal/siem_migrations/rules — resolves with the response body. */
+  async listMigrations(): Promise<SiemMigration[]> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules`;
+    try {
+      const { data } = await this.client.get<SiemMigration[]>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** GET /internal/siem_migrations/rules/:migrationId — resolves with the response body. */
+  async getMigration(migrationId: string): Promise<SiemMigration> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}`;
+    try {
+      const { data } = await this.client.get<SiemMigration>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** DELETE /internal/siem_migrations/rules/:migrationId — the response body is ignored. */
+  async deleteMigration(migrationId: string): Promise<void> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}`;
+    try {
+      await this.client.delete(path, { headers: MIGRATION_HEADERS });
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Splunk rule upload ───────────────────────────────────────────────────
+
+  /** POST /internal/siem_migrations/rules/:migrationId/rules — sends `rules` as the request body. */
+  async uploadRules(
+    migrationId: string,
+    rules: Record<string, unknown>[]
+  ): Promise<{ total: number }> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules`;
+    try {
+      const { data } = await this.client.post<{ total: number }>(
+        path,
+        rules,
+        { headers: MIGRATION_HEADERS }
+      );
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Translated rules ─────────────────────────────────────────────────────
+
+  /** GET /internal/siem_migrations/rules/:migrationId/rules — page defaults to 1, perPage to 20. */
+  async getTranslatedRules(
+    migrationId: string,
+    options: ListTranslatedRulesOptions = {}
+  ): Promise<ListTranslatedRulesResult> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules`;
+    const params: Record<string, string> = {
+      page: String(options.page ?? 1),
+      per_page: String(options.perPage ?? 20),
+    };
+    if (options.filter) params.filter = options.filter;
+
+    try {
+      const { data } = await this.client.get<ListTranslatedRulesResult>(path, {
+        params,
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** GET /internal/siem_migrations/rules/:migrationId/rules/:ruleId — resolves with the response body. */
+  async getTranslatedRule(
+    migrationId: string,
+    ruleId: string
+  ): Promise<TranslatedRule> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules/${ruleId}`;
+    try {
+      const { data } = await this.client.get<TranslatedRule>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** PUT /internal/siem_migrations/rules/:migrationId/rules/:ruleId — sends `updates` as the request body. */
+  async updateTranslatedRule(
+    migrationId: string,
+    ruleId: string,
+    updates: Partial<Pick<TranslatedRule, "elastic_rule" | "translation_result" | "comments">>
+  ): Promise<TranslatedRule> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules/${ruleId}`;
+    try {
+      const { data } = await this.client.put<TranslatedRule>(path, updates, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Translation control ──────────────────────────────────────────────────
+
+  /** POST /internal/siem_migrations/rules/:migrationId/start — sends an empty object body. */
+  async startTranslation(migrationId: string): Promise<void> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/start`;
+    try {
+      await this.client.post(path, {}, { headers: MIGRATION_HEADERS });
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** POST /internal/siem_migrations/rules/:migrationId/stop — sends an empty object body. */
+  async stopTranslation(migrationId: string): Promise<void> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/stop`;
+    try {
+      await this.client.post(path, {}, { headers: MIGRATION_HEADERS });
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Resources ────────────────────────────────────────────────────────────
+
+  /** GET /internal/siem_migrations/resources/:migrationId — resolves with the response body. */
+  async getResources(migrationId: string): Promise<MigrationResource[]> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/resources/${migrationId}`;
+    try {
+      const { data } = await this.client.get<MigrationResource[]>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** POST /internal/siem_migrations/resources/:migrationId — sends `resources` as the request body. */
+  async upsertResources(
+    migrationId: string,
+    resources: MigrationResource[]
+  ): Promise<void> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/resources/${migrationId}`;
+    try {
+      await this.client.post(path, resources, { headers: MIGRATION_HEADERS });
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Installation ─────────────────────────────────────────────────────────
+
+  /** POST /internal/siem_migrations/rules/:migrationId/install — body is `{ ids }` when given, else `{}`. */
+  async installRules(
+    migrationId: string,
+    options: InstallRulesOptions = {}
+  ): Promise<InstallRulesResult> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/install`;
+    try {
+      const { data } = await this.client.post<InstallRulesResult>(
+        path,
+        options.ids ? { ids: options.ids } : {},
+        { headers: MIGRATION_HEADERS }
+      );
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Stats ────────────────────────────────────────────────────────────────
+
+  /** GET /internal/siem_migrations/rules/:migrationId/stats — resolves with the response body. */
+  async getStats(migrationId: string): Promise<MigrationStats> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/stats`;
+    try {
+      const { data } = await this.client.get<MigrationStats>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+}
From 2a4ec7d1ab9eacb5c7ad90b8664cd29295c6b372 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:13:58 +0200
Subject: [PATCH 18/42] test: add MigrationsService tests covering all 14 route
methods and error handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
19 tests across 15 describe blocks — one per route method (14) plus a
MigrationApiError block with three error-handling tests:
Migration lifecycle: createMigration, listMigrations, getMigration, deleteMigration
Rule upload: uploadRules
Translated rules: getTranslatedRules (default+custom pagination), getTranslatedRule, updateTranslatedRule
Translation control: startTranslation, stopTranslation
Resources: getResources, upsertResources
Installation: installRules (no-ids + with-ids)
Stats: getStats
MigrationApiError: status parsed from Kibana error format; status=0 fallback;
all mutating methods surface MigrationApiError
Also adds `put: vi.fn()` to MockHttpClient / makeMock in mockHttpClient.ts
so MigrationsService.updateTranslatedRule can be exercised.
Co-Authored-By: Claude Sonnet 4.6
---
src/elastic/service/migrationsService.test.ts | 329 ++++++++++++++++++
src/test/helpers/mockHttpClient.ts | 2 +
2 files changed, 331 insertions(+)
create mode 100644 src/elastic/service/migrationsService.test.ts
diff --git a/src/elastic/service/migrationsService.test.ts b/src/elastic/service/migrationsService.test.ts
new file mode 100644
index 0000000..0c184e7
--- /dev/null
+++ b/src/elastic/service/migrationsService.test.ts
@@ -0,0 +1,329 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { describe, it, expect, beforeEach } from "vitest";
+import {
+ MigrationsService,
+ MigrationApiError,
+ SIEM_MIGRATIONS_API_BASE,
+} from "./migrationsService.js";
+import type { KibanaClient } from "../kibana-client/index.js";
+import {
+ createMockKibanaClient,
+ dataEnvelope,
+ type MockHttpClient,
+} from "../../test/helpers/mockHttpClient.js";
+import type { SiemMigration, TranslatedRule, MigrationResource } from "./migrationsService.js";
+
+const BASE = SIEM_MIGRATIONS_API_BASE;
+const HEADERS = { headers: { "elastic-api-version": "2023-10-31" } };
+
+const MIGRATION_ID = "migration-1";
+const RULE_ID = "rule-1";
+
+const fakeMigration: SiemMigration = {
+ id: MIGRATION_ID,
+ name: "test-migration",
+ status: "ready",
+ created_at: "2026-01-01T00:00:00Z",
+ last_updated_at: "2026-01-01T00:00:00Z",
+ rules: {
+ total: 0, pending: 0, processing: 0, completed: 0, failed: 0,
+ installable: 0, installed: 0, partially_translated: 0, untranslatable: 0,
+ },
+};
+
+const fakeRule: TranslatedRule = {
+ id: RULE_ID,
+ migration_id: MIGRATION_ID,
+ status: "completed",
+ translation_result: "full",
+ original_rule: { name: "splunk-rule" },
+};
+
+const fakeResource: MigrationResource = {
+ type: "macro",
+ name: "my_macro",
+ content: "| where true",
+};
+
+describe("MigrationsService", () => {
+ let kibanaClient: KibanaClient & MockHttpClient;
+ let service: MigrationsService;
+
+ beforeEach(() => {
+ kibanaClient = createMockKibanaClient();
+ service = new MigrationsService({ kibanaClient });
+ });
+
+ // ── Migration lifecycle ────────────────────────────────────────────────────
+
+ describe("createMigration", () => {
+ it("POSTs to /rules with the migration name and returns migration_id", async () => {
+ kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ migration_id: MIGRATION_ID }));
+
+ const result = await service.createMigration("My Migration");
+
+ expect(kibanaClient.post).toHaveBeenCalledWith(
+ `${BASE}/rules`,
+ { name: "My Migration" },
+ HEADERS
+ );
+ expect(result).toEqual({ migration_id: MIGRATION_ID });
+ });
+ });
+
+ describe("listMigrations", () => {
+ it("GETs /rules and returns the array", async () => {
+ kibanaClient.get.mockResolvedValueOnce(dataEnvelope([fakeMigration]));
+
+ const result = await service.listMigrations();
+
+ expect(kibanaClient.get).toHaveBeenCalledWith(`${BASE}/rules`, HEADERS);
+ expect(result).toEqual([fakeMigration]);
+ });
+ });
+
+ describe("getMigration", () => {
+ it("GETs /rules/:migrationId and returns the migration", async () => {
+ kibanaClient.get.mockResolvedValueOnce(dataEnvelope(fakeMigration));
+
+ const result = await service.getMigration(MIGRATION_ID);
+
+ expect(kibanaClient.get).toHaveBeenCalledWith(
+ `${BASE}/rules/${MIGRATION_ID}`,
+ HEADERS
+ );
+ expect(result).toEqual(fakeMigration);
+ });
+ });
+
+ describe("deleteMigration", () => {
+ it("DELETEs /rules/:migrationId", async () => {
+ await service.deleteMigration(MIGRATION_ID);
+
+ expect(kibanaClient.delete).toHaveBeenCalledWith(
+ `${BASE}/rules/${MIGRATION_ID}`,
+ HEADERS
+ );
+ });
+ });
+
+ // ── Rule upload ────────────────────────────────────────────────────────────
+
+ describe("uploadRules", () => {
+ it("POSTs rules array to /rules/:migrationId/rules and returns totals", async () => {
+ kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ total: 5 }));
+ const splunkRules = [{ search: "index=main" }, { search: "index=security" }];
+
+ const result = await service.uploadRules(MIGRATION_ID, splunkRules);
+
+ expect(kibanaClient.post).toHaveBeenCalledWith(
+ `${BASE}/rules/${MIGRATION_ID}/rules`,
+ splunkRules,
+ HEADERS
+ );
+ expect(result).toEqual({ total: 5 });
+ });
+ });
+
+ // ── Translated rules ───────────────────────────────────────────────────────
+
+ describe("getTranslatedRules", () => {
+ it("GETs /rules/:migrationId/rules with default pagination", async () => {
+ kibanaClient.get.mockResolvedValueOnce(dataEnvelope({ data: [fakeRule], total: 1 }));
+
+ const result = await service.getTranslatedRules(MIGRATION_ID);
+
+ const [path, config] = kibanaClient.get.mock.calls[0] as [string, Record];
+ expect(path).toBe(`${BASE}/rules/${MIGRATION_ID}/rules`);
+ expect(config.params).toMatchObject({ page: "1", per_page: "20" });
+ expect(result).toEqual({ data: [fakeRule], total: 1 });
+ });
+
+ it("forwards custom page, perPage and filter params", async () => {
+ kibanaClient.get.mockResolvedValueOnce(dataEnvelope({ data: [], total: 0 }));
+
+ await service.getTranslatedRules(MIGRATION_ID, { page: 2, perPage: 50, filter: "status:completed" });
+
+ const [, config] = kibanaClient.get.mock.calls[0] as [string, Record];
+ expect(config.params).toEqual({ page: "2", per_page: "50", filter: "status:completed" });
+ });
+ });
+
+ describe("getTranslatedRule", () => {
+ it("GETs /rules/:migrationId/rules/:ruleId", async () => {
+ kibanaClient.get.mockResolvedValueOnce(dataEnvelope(fakeRule));
+
+ const result = await service.getTranslatedRule(MIGRATION_ID, RULE_ID);
+
+ expect(kibanaClient.get).toHaveBeenCalledWith(
+ `${BASE}/rules/${MIGRATION_ID}/rules/${RULE_ID}`,
+ HEADERS
+ );
+ expect(result).toEqual(fakeRule);
+ });
+ });
+
+ describe("updateTranslatedRule", () => {
+ it("PUTs updates to /rules/:migrationId/rules/:ruleId and returns the updated rule", async () => {
+ const updated = { ...fakeRule, translation_result: "partial" as const };
+ kibanaClient.put.mockResolvedValueOnce(dataEnvelope(updated));
+
+ const result = await service.updateTranslatedRule(MIGRATION_ID, RULE_ID, {
+ translation_result: "partial",
+ });
+
+ expect(kibanaClient.put).toHaveBeenCalledWith(
+ `${BASE}/rules/${MIGRATION_ID}/rules/${RULE_ID}`,
+ { translation_result: "partial" },
+ HEADERS
+ );
+ expect(result).toEqual(updated);
+ });
+ });
+
+ // ── Translation control ────────────────────────────────────────────────────
+
+ describe("startTranslation", () => {
+ it("POSTs to /rules/:migrationId/start", async () => {
+ await service.startTranslation(MIGRATION_ID);
+
+ expect(kibanaClient.post).toHaveBeenCalledWith(
+ `${BASE}/rules/${MIGRATION_ID}/start`,
+ {},
+ HEADERS
+ );
+ });
+ });
+
+ describe("stopTranslation", () => {
+ it("POSTs to /rules/:migrationId/stop", async () => {
+ await service.stopTranslation(MIGRATION_ID);
+
+ expect(kibanaClient.post).toHaveBeenCalledWith(
+ `${BASE}/rules/${MIGRATION_ID}/stop`,
+ {},
+ HEADERS
+ );
+ });
+ });
+
+ // ── Resources ──────────────────────────────────────────────────────────────
+
+ describe("getResources", () => {
+ it("GETs /resources/:migrationId and returns the array", async () => {
+ kibanaClient.get.mockResolvedValueOnce(dataEnvelope([fakeResource]));
+
+ const result = await service.getResources(MIGRATION_ID);
+
+ expect(kibanaClient.get).toHaveBeenCalledWith(
+ `${BASE}/resources/${MIGRATION_ID}`,
+ HEADERS
+ );
+ expect(result).toEqual([fakeResource]);
+ });
+ });
+
+ describe("upsertResources", () => {
+ it("POSTs resources array to /resources/:migrationId", async () => {
+ await service.upsertResources(MIGRATION_ID, [fakeResource]);
+
+ expect(kibanaClient.post).toHaveBeenCalledWith(
+ `${BASE}/resources/${MIGRATION_ID}`,
+ [fakeResource],
+ HEADERS
+ );
+ });
+ });
+
+ // ── Installation ───────────────────────────────────────────────────────────
+
+ describe("installRules", () => {
+ it("POSTs empty body to /rules/:migrationId/install when no ids given", async () => {
+ kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ installed: 3, failed: 0 }));
+
+ const result = await service.installRules(MIGRATION_ID);
+
+ expect(kibanaClient.post).toHaveBeenCalledWith(
+ `${BASE}/rules/${MIGRATION_ID}/install`,
+ {},
+ HEADERS
+ );
+ expect(result).toEqual({ installed: 3, failed: 0 });
+ });
+
+ it("includes ids in the body when provided", async () => {
+ kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ installed: 1, failed: 0 }));
+
+ await service.installRules(MIGRATION_ID, { ids: ["r1", "r2"] });
+
+ const [, body] = kibanaClient.post.mock.calls[0] as [string, Record];
+ expect(body).toEqual({ ids: ["r1", "r2"] });
+ });
+ });
+
+ // ── Stats ──────────────────────────────────────────────────────────────────
+
+ describe("getStats", () => {
+ it("GETs /rules/:migrationId/stats and returns the stats", async () => {
+ const stats = { id: MIGRATION_ID, status: "ready" as const, rules: fakeMigration.rules };
+ kibanaClient.get.mockResolvedValueOnce(dataEnvelope(stats));
+
+ const result = await service.getStats(MIGRATION_ID);
+
+ expect(kibanaClient.get).toHaveBeenCalledWith(
+ `${BASE}/rules/${MIGRATION_ID}/stats`,
+ HEADERS
+ );
+ expect(result).toEqual(stats);
+ });
+ });
+
+ // ── MigrationApiError ──────────────────────────────────────────────────────
+
+ describe("MigrationApiError", () => {
+ it("wraps non-2xx with status parsed from Kibana error format", async () => {
+ const path = `${BASE}/rules/${MIGRATION_ID}`;
+ kibanaClient.get.mockRejectedValue(
+ new Error("Kibana [test-cluster] 404: migration not found")
+ );
+
+ await expect(service.getMigration(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+ await expect(service.getMigration(MIGRATION_ID)).rejects.toMatchObject({
+ status: 404,
+ path,
+ message: expect.stringContaining(path) as string,
+ });
+ });
+
+ it("sets status 0 when error message has no HTTP status code", async () => {
+ kibanaClient.get.mockRejectedValueOnce(new Error("network timeout"));
+
+ const err = await service.getMigration(MIGRATION_ID).catch((e) => e as MigrationApiError);
+ expect(err).toBeInstanceOf(MigrationApiError);
+ expect(err.status).toBe(0);
+ });
+
+ it("surfaces a MigrationApiError from every mutating method", async () => {
+ const netErr = new Error("Kibana [test-cluster] 503: service unavailable");
+
+ kibanaClient.post.mockRejectedValue(netErr);
+ kibanaClient.put.mockRejectedValue(netErr);
+ kibanaClient.delete.mockRejectedValue(netErr);
+
+ await expect(service.createMigration("x")).rejects.toBeInstanceOf(MigrationApiError);
+ await expect(service.uploadRules(MIGRATION_ID, [])).rejects.toBeInstanceOf(MigrationApiError);
+ await expect(service.startTranslation(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+ await expect(service.stopTranslation(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+ await expect(service.upsertResources(MIGRATION_ID, [])).rejects.toBeInstanceOf(MigrationApiError);
+ await expect(service.installRules(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+ await expect(service.updateTranslatedRule(MIGRATION_ID, RULE_ID, {})).rejects.toBeInstanceOf(MigrationApiError);
+ await expect(service.deleteMigration(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+ });
+ });
+});
diff --git a/src/test/helpers/mockHttpClient.ts b/src/test/helpers/mockHttpClient.ts
index b843524..f640f2c 100644
--- a/src/test/helpers/mockHttpClient.ts
+++ b/src/test/helpers/mockHttpClient.ts
@@ -17,6 +17,7 @@ import type { KibanaClient } from "../../elastic/kibana-client/kibana-client.js"
export interface MockHttpClient {
get: Mock;
post: Mock;
+ put: Mock;
patch: Mock;
delete: Mock;
clusterName: string;
@@ -48,6 +49,7 @@ function makeMock(clusterName: string): MockHttpClient {
return {
get: vi.fn().mockResolvedValue({ data: undefined }),
post: vi.fn().mockResolvedValue({ data: undefined }),
+ put: vi.fn().mockResolvedValue({ data: undefined }),
patch: vi.fn().mockResolvedValue({ data: undefined }),
delete: vi.fn().mockResolvedValue({ data: undefined }),
clusterName,
From 16e4d062d9434c14cb544ff249f641dcc7edaeb7 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:17:08 +0200
Subject: [PATCH 19/42] feat: register migration tools (1 model-facing + 10
app-only)
migrate-rules (model-facing):
_meta.ui.resourceUri = ui://migrate-rules/mcp-app.html
Callback seeds the workbench with a compact migration list so the LLM
gets immediate context.
App-only tools (_meta.ui.visibility: ["app"]):
list-migrations GET all migrations
get-migration GET single migration by ID
get-translated-rules paginated translated rule listing (vendor-gated)
start-translation kick off AI translation (vendor-gated)
stop-translation halt in-progress translation (vendor-gated)
update-translated-rule patch elastic_rule / translation_result / comments (vendor-gated)
get-resources list macros/lookups (vendor-gated)
upsert-resource create/replace single macro or lookup (vendor-gated)
install-rules install translated rules, optional id filter (vendor-gated)
get-stats per-migration translation/installation stats
Vendor gate: SUPPORTED_VENDORS = ["splunk"]. If a vendor param is provided
and not in the list, returns { error: "vendorNotSupported", vendor } without
hitting Kibana. Re-enabling a vendor is a one-line change to the constant.
Also registers the migration workbench HTML via registerAppResource; the view
file is resolved at request time (resolveViewPath("migration")) so the tool
works once the view is built in a subsequent commit.
Co-Authored-By: Claude Sonnet 4.6
---
src/tools/migration.ts | 353 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 353 insertions(+)
create mode 100644 src/tools/migration.ts
diff --git a/src/tools/migration.ts b/src/tools/migration.ts
new file mode 100644
index 0000000..5502bd2
--- /dev/null
+++ b/src/tools/migration.ts
@@ -0,0 +1,353 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import {
+ registerAppTool,
+ registerAppResource,
+ RESOURCE_MIME_TYPE,
+} from "@modelcontextprotocol/ext-apps/server";
+import { z } from "zod";
+import fs from "fs";
+import type { MigrationsService } from "../elastic/service/index.js";
+import { resolveViewPath } from "./view-path.js";
+
+const RESOURCE_URI = "ui://migrate-rules/mcp-app.html";
+
+/**
+ * Vendors for which the Kibana SIEM migrations translator is production-ready.
+ * Re-enabling a vendor is a one-line change to this array once the translator
+ * matures — QRadar and Sentinel-One are the next candidates.
+ */
+const SUPPORTED_VENDORS: readonly string[] = ["splunk"];
+
+export interface MigrationToolDeps {
+ readonly migrationsService: MigrationsService;
+}
+
+/**
+ * Builds the result a vendor-gated, app-only tool returns for a rejected vendor.
+ *
+ * @param vendor - Vendor name echoed back in the payload.
+ * @returns One text content item whose text is the JSON-serialized object
+ *   `{ error: "vendorNotSupported", vendor }`.
+ */
+function vendorNotSupportedResponse(vendor: string) {
+  const payload = JSON.stringify({ error: "vendorNotSupported", vendor });
+  return { content: [{ type: "text" as const, text: payload }] };
+}
+
+/** Type guard: true only when `vendor` was supplied and SUPPORTED_VENDORS does not list it. */
+function isUnsupportedVendor(vendor: string | undefined): vendor is string {
+  return !(vendor === undefined || SUPPORTED_VENDORS.includes(vendor));
+}
+
+export function registerMigrationTools(
+ server: McpServer,
+ deps: MigrationToolDeps
+) {
+ const { migrationsService } = deps;
+
+ // ── Model-facing entry-point ───────────────────────────────────────────────
+
+ registerAppTool(
+ server,
+ "migrate-rules",
+ {
+ title: "Migrate Rules",
+ description:
+ "Migrate detection rules from Splunk (and other SIEMs) to Elastic Security. " +
+ "Opens an interactive migration workbench for uploading, translating, reviewing, " +
+ "and installing rules. Vendor support: Splunk (active), QRadar / Sentinel-One (coming soon).",
+ inputSchema: {},
+ _meta: { ui: { resourceUri: RESOURCE_URI } },
+ },
+ async () => {
+ const migrations = await migrationsService.listMigrations();
+ return {
+ content: [
+ {
+ type: "text" as const,
+ text: JSON.stringify({
+ message: "Opening SIEM migration workbench",
+ migrations: migrations.map(({ id, name, status }) => ({ id, name, status })),
+ }),
+ },
+ ],
+ };
+ }
+ );
+
+ // ── App-only tools ─────────────────────────────────────────────────────────
+
+ registerAppTool(
+ server,
+ "list-migrations",
+ {
+ title: "List Migrations",
+ description: "List all SIEM rule migrations.",
+ inputSchema: {},
+ _meta: { ui: { visibility: ["app"] } },
+ },
+ async () => {
+ const migrations = await migrationsService.listMigrations();
+ return {
+ content: [{ type: "text" as const, text: JSON.stringify(migrations) }],
+ };
+ }
+ );
+
+ registerAppTool(
+ server,
+ "get-migration",
+ {
+ title: "Get Migration",
+ description: "Get details for a specific SIEM migration.",
+ inputSchema: {
+ migrationId: z.string().describe("Migration ID"),
+ },
+ _meta: { ui: { visibility: ["app"] } },
+ },
+ async ({ migrationId }) => {
+ const migration = await migrationsService.getMigration(migrationId);
+ return {
+ content: [{ type: "text" as const, text: JSON.stringify(migration) }],
+ };
+ }
+ );
+
+ registerAppTool(
+ server,
+ "get-translated-rules",
+ {
+ title: "Get Translated Rules",
+ description: "Get translated rules for a SIEM migration.",
+ inputSchema: {
+ migrationId: z.string().describe("Migration ID"),
+ vendor: z
+ .string()
+ .optional()
+ .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+ page: z.number().optional(),
+ perPage: z.number().optional(),
+ filter: z.string().optional(),
+ },
+ _meta: { ui: { visibility: ["app"] } },
+ },
+ async ({ migrationId, vendor, page, perPage, filter }) => {
+ if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+ const result = await migrationsService.getTranslatedRules(migrationId, {
+ page,
+ perPage,
+ filter,
+ });
+ return {
+ content: [{ type: "text" as const, text: JSON.stringify(result) }],
+ };
+ }
+ );
+
+ registerAppTool(
+ server,
+ "start-translation",
+ {
+ title: "Start Translation",
+ description: "Start the AI translation process for a SIEM migration.",
+ inputSchema: {
+ migrationId: z.string().describe("Migration ID"),
+ vendor: z
+ .string()
+ .optional()
+ .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+ },
+ _meta: { ui: { visibility: ["app"] } },
+ },
+ async ({ migrationId, vendor }) => {
+ if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+ await migrationsService.startTranslation(migrationId);
+ return {
+ content: [{ type: "text" as const, text: JSON.stringify({ status: "started" }) }],
+ };
+ }
+ );
+
+ registerAppTool(
+ server,
+ "stop-translation",
+ {
+ title: "Stop Translation",
+ description: "Stop the AI translation process for a SIEM migration.",
+ inputSchema: {
+ migrationId: z.string().describe("Migration ID"),
+ vendor: z
+ .string()
+ .optional()
+ .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+ },
+ _meta: { ui: { visibility: ["app"] } },
+ },
+ async ({ migrationId, vendor }) => {
+ if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+ await migrationsService.stopTranslation(migrationId);
+ return {
+ content: [{ type: "text" as const, text: JSON.stringify({ status: "stopped" }) }],
+ };
+ }
+ );
+
+ // App-only tool "update-translated-rule": applies a partial update to one
+ // translated rule of a migration. Only the fields the caller supplied are
+ // forwarded to the service, under their snake_case names.
+ //
+ // `elasticRule` arrives as a JSON string. It must decode to a JSON object;
+ // malformed JSON, `null`, arrays and scalars are rejected by throwing before
+ // the service is called (the unguarded JSON.parse used to throw a bare
+ // SyntaxError for malformed input and let non-object values through).
+ registerAppTool(
+   server,
+   "update-translated-rule",
+   {
+     title: "Update Translated Rule",
+     description: "Update a translated rule in a SIEM migration (e.g. fix its Elastic rule JSON).",
+     inputSchema: {
+       migrationId: z.string().describe("Migration ID"),
+       ruleId: z.string().describe("Translated rule ID"),
+       vendor: z
+         .string()
+         .optional()
+         .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+       elasticRule: z
+         .string()
+         .optional()
+         .describe("JSON-encoded Elastic rule updates"),
+       translationResult: z
+         .enum(["full", "partial", "untranslatable"])
+         .optional(),
+       comments: z.array(z.string()).optional(),
+     },
+     _meta: { ui: { visibility: ["app"] } },
+   },
+   async ({ migrationId, ruleId, vendor, elasticRule, translationResult, comments }) => {
+     if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+     const updates: Record<string, unknown> = {};
+     if (elasticRule !== undefined) {
+       let parsed: unknown;
+       try {
+         parsed = JSON.parse(elasticRule);
+       } catch (e) {
+         const reason = e instanceof Error ? e.message : String(e);
+         throw new Error(`elasticRule is not valid JSON: ${reason}`);
+       }
+       // A rule patch has to be a key/value object; reject everything else up front.
+       if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
+         throw new Error("elasticRule must be a JSON object");
+       }
+       updates.elastic_rule = parsed;
+     }
+     if (translationResult !== undefined) updates.translation_result = translationResult;
+     if (comments !== undefined) updates.comments = comments;
+     const result = await migrationsService.updateTranslatedRule(migrationId, ruleId, updates);
+     return {
+       content: [{ type: "text" as const, text: JSON.stringify(result) }],
+     };
+   }
+ );
+
+ // App-only tool "get-resources": returns whatever migrationsService.getResources()
+ // yields for the migration, serialized as JSON text. Requests flagged by
+ // isUnsupportedVendor() are answered with vendorNotSupportedResponse() instead.
+ const getResourcesInput = {
+   migrationId: z.string().describe("Migration ID"),
+   vendor: z
+     .string()
+     .optional()
+     .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+ };
+ registerAppTool(
+   server,
+   "get-resources",
+   {
+     title: "Get Resources",
+     description: "Get macro/lookup resources for a SIEM migration.",
+     inputSchema: getResourcesInput,
+     _meta: { ui: { visibility: ["app"] } },
+   },
+   async ({ migrationId, vendor }) => {
+     if (isUnsupportedVendor(vendor)) {
+       return vendorNotSupportedResponse(vendor);
+     }
+     const found = await migrationsService.getResources(migrationId);
+     const payload = JSON.stringify(found);
+     return { content: [{ type: "text" as const, text: payload }] };
+   }
+ );
+
+ // App-only tool "upsert-resource": forwards one macro/lookup resource to
+ // migrationsService.upsertResources(), which takes an array, so the single
+ // resource is wrapped in a one-element list. Responds with a fixed status.
+ const upsertResourceInput = {
+   migrationId: z.string().describe("Migration ID"),
+   vendor: z
+     .string()
+     .optional()
+     .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+   type: z.enum(["macro", "lookup"]).describe("Resource type"),
+   name: z.string().describe("Resource name"),
+   content: z.string().describe("Resource content"),
+ };
+ registerAppTool(
+   server,
+   "upsert-resource",
+   {
+     title: "Upsert Resource",
+     description: "Create or update a macro/lookup resource in a SIEM migration.",
+     inputSchema: upsertResourceInput,
+     _meta: { ui: { visibility: ["app"] } },
+   },
+   async ({ migrationId, vendor, type, name, content }) => {
+     if (isUnsupportedVendor(vendor)) {
+       return vendorNotSupportedResponse(vendor);
+     }
+     const resource = { type, name, content };
+     await migrationsService.upsertResources(migrationId, [resource]);
+     const payload = JSON.stringify({ status: "ok" });
+     return { content: [{ type: "text" as const, text: payload }] };
+   }
+ );
+
+ // App-only tool "install-rules": passes the optional list of rule IDs to
+ // migrationsService.installRules() and returns its result as JSON text.
+ // `ids` is forwarded as-is, including when it is undefined.
+ const installRulesInput = {
+   migrationId: z.string().describe("Migration ID"),
+   vendor: z
+     .string()
+     .optional()
+     .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+   ids: z
+     .array(z.string())
+     .optional()
+     .describe("Specific rule IDs to install. Omit to install all installable rules."),
+ };
+ registerAppTool(
+   server,
+   "install-rules",
+   {
+     title: "Install Rules",
+     description: "Install translated rules from a SIEM migration into Elastic Security.",
+     inputSchema: installRulesInput,
+     _meta: { ui: { visibility: ["app"] } },
+   },
+   async ({ migrationId, vendor, ids }) => {
+     if (isUnsupportedVendor(vendor)) {
+       return vendorNotSupportedResponse(vendor);
+     }
+     const outcome = await migrationsService.installRules(migrationId, { ids });
+     const payload = JSON.stringify(outcome);
+     return { content: [{ type: "text" as const, text: payload }] };
+   }
+ );
+
+ // App-only tool "get-stats": returns migrationsService.getStats() for the
+ // migration as JSON text. Unlike the sibling tools it takes no `vendor`
+ // argument and performs no vendor check.
+ const getStatsInput = {
+   migrationId: z.string().describe("Migration ID"),
+ };
+ registerAppTool(
+   server,
+   "get-stats",
+   {
+     title: "Get Stats",
+     description: "Get translation and installation statistics for a SIEM migration.",
+     inputSchema: getStatsInput,
+     _meta: { ui: { visibility: ["app"] } },
+   },
+   async ({ migrationId }) => {
+     const snapshot = await migrationsService.getStats(migrationId);
+     const payload = JSON.stringify(snapshot);
+     return { content: [{ type: "text" as const, text: payload }] };
+   }
+ );
+
+ // ── App resource (HTML workbench) ──────────────────────────────────────────
+
+ // The view path is resolved once at registration time; the HTML itself is
+ // re-read from disk on every resource request.
+ const viewPath = resolveViewPath("migration");
+ registerAppResource(
+   server,
+   RESOURCE_URI,
+   RESOURCE_URI,
+   { mimeType: RESOURCE_MIME_TYPE },
+   async () => ({
+     contents: [
+       {
+         uri: RESOURCE_URI,
+         mimeType: RESOURCE_MIME_TYPE,
+         text: fs.readFileSync(viewPath, "utf-8"),
+       },
+     ],
+   })
+ );
+}
From 6b9c8bce468a8097f48bf7cc5c12986ced10cbe1 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:21:47 +0200
Subject: [PATCH 20/42] test: add migration tool tests (tool registrations +
vendor gating)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
20 tests covering:
Registration: all 11 tools + HTML resource registered under the correct names
migrate-rules: workbench message + compact migration list returned to LLM
app-only tool happy paths:
list-migrations, get-migration, get-translated-rules (with pagination),
start-translation, stop-translation, update-translated-rule (parses
elasticRule JSON), get-resources, upsert-resource (single-element array),
install-rules (with ids), get-stats
Vendor gating (per gated tool):
- vendor="qradar" / "sentinel-one" / unknown → { error: "vendorNotSupported" }
without calling the service
- vendor absent → proceeds (defaults to Splunk path)
get-stats has no vendor gate — confirmed by calling without vendor
Also adds createMockMigrationsService() to mockServices.ts covering all
14 MigrationsService methods.
Co-Authored-By: Claude Sonnet 4.6
---
src/test/helpers/mockServices.ts | 20 ++
src/tools/migration.test.ts | 408 +++++++++++++++++++++++++++++++
2 files changed, 428 insertions(+)
create mode 100644 src/tools/migration.test.ts
diff --git a/src/test/helpers/mockServices.ts b/src/test/helpers/mockServices.ts
index bb77c48..819e95c 100644
--- a/src/test/helpers/mockServices.ts
+++ b/src/test/helpers/mockServices.ts
@@ -13,6 +13,7 @@ import type { EntityDetailService } from "../../elastic/service/entityDetailServ
import type { EsqlService } from "../../elastic/service/esqlService.js";
import type { IndicesService } from "../../elastic/service/indicesService.js";
import type { InvestigateService } from "../../elastic/service/investigateService.js";
+import type { MigrationsService } from "../../elastic/service/migrationsService.js";
import type { RulesService } from "../../elastic/service/rulesService.js";
import type { SampleDataService } from "../../elastic/service/sampleDataService.js";
@@ -99,6 +100,25 @@ export function createMockRulesService(): RulesService {
]);
}
+/**
+ * Builds a MigrationsService test double by handing the 14 method names below
+ * to the shared mockService() helper.
+ */
+export function createMockMigrationsService(): MigrationsService {
+ return mockService([
+ "createMigration",
+ "listMigrations",
+ "getMigration",
+ "deleteMigration",
+ "uploadRules",
+ "getTranslatedRules",
+ "getTranslatedRule",
+ "updateTranslatedRule",
+ "startTranslation",
+ "stopTranslation",
+ "getResources",
+ "upsertResources",
+ "installRules",
+ "getStats",
+ ]);
+}
+
export function createMockSampleDataService(): SampleDataService {
return mockService([
"generateSampleData",
diff --git a/src/tools/migration.test.ts b/src/tools/migration.test.ts
new file mode 100644
index 0000000..7193075
--- /dev/null
+++ b/src/tools/migration.test.ts
@@ -0,0 +1,408 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import fs from "fs";
+import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+
+import { registerMigrationTools } from "./migration.js";
+import {
+ createMockMcpServer,
+ parseToolText,
+ type MockMcpServer,
+} from "../test/helpers/mockMcpServer.js";
+import { createMockMigrationsService } from "../test/helpers/mockServices.js";
+import type { MigrationsService } from "../elastic/service/index.js";
+
+const RESOURCE_URI = "ui://migrate-rules/mcp-app.html";
+const MIGRATION_ID = "m-1";
+const RULE_ID = "r-1";
+
+/**
+ * Creates a mock MCP server and a mock migrations service, stubs the two `fs`
+ * calls (existsSync → false, readFileSync → "migration"), registers the
+ * migration tools against the mocks, and returns both for assertions.
+ */
+function setup() {
+  const migrationsService = createMockMigrationsService();
+  const server = createMockMcpServer();
+
+  vi.spyOn(fs, "existsSync").mockReturnValue(false);
+  vi.spyOn(fs, "readFileSync").mockReturnValue("migration");
+
+  const mcpServer = server as unknown as McpServer;
+  registerMigrationTools(mcpServer, { migrationsService });
+
+  return { server, migrationsService };
+}
+
+describe("registerMigrationTools", () => {
+ let server: MockMcpServer;
+ let migrationsService: MigrationsService;
+
+ // Every test gets a freshly registered mock server and service from setup().
+ beforeEach(() => {
+   const fresh = setup();
+   server = fresh.server;
+   migrationsService = fresh.migrationsService;
+ });
+
+ // ── Registration ───────────────────────────────────────────────────────────
+
+ // Compares the full set of registered tool names (order-insensitive) and the
+ // single registered resource URI.
+ it("registers all 11 tools and the HTML resource", () => {
+   const expectedTools = [
+     "migrate-rules",
+     "list-migrations",
+     "get-migration",
+     "get-translated-rules",
+     "start-translation",
+     "stop-translation",
+     "update-translated-rule",
+     "get-resources",
+     "upsert-resource",
+     "install-rules",
+     "get-stats",
+   ];
+   const registeredTools = [...server.tools.keys()];
+
+   expect(registeredTools.sort()).toEqual(expectedTools.sort());
+
+   const registeredResources = [...server.resources.keys()];
+   expect(registeredResources).toEqual([RESOURCE_URI]);
+ });
+
+ // ── migrate-rules (model-facing) ───────────────────────────────────────────
+
+ describe("migrate-rules", () => {
+   it("returns a compact migration list for the LLM to see", async () => {
+     // One existing migration is enough to check the list is passed through.
+     const ruleCounts = {
+       total: 10, pending: 5, processing: 0, completed: 5, failed: 0,
+       installable: 5, installed: 0, partially_translated: 0, untranslatable: 0,
+     };
+     vi.mocked(migrationsService.listMigrations).mockResolvedValueOnce([
+       {
+         id: MIGRATION_ID,
+         name: "Splunk prod",
+         status: "ready",
+         created_at: "2026-01-01T00:00:00Z",
+         last_updated_at: "2026-01-01T00:00:00Z",
+         rules: ruleCounts,
+       },
+     ]);
+
+     const response = await server.tool("migrate-rules").callback({});
+     const body = parseToolText<{ message: string; migrations: unknown[] }>(response);
+
+     expect(body.message).toContain("workbench");
+     expect(body.migrations).toHaveLength(1);
+     expect(body.migrations[0]).toMatchObject({ id: MIGRATION_ID, name: "Splunk prod" });
+   });
+ });
+
+ // ── list-migrations ────────────────────────────────────────────────────────
+
+ describe("list-migrations", () => {
+   it("delegates to migrationsService.listMigrations and returns the array", async () => {
+     vi.mocked(migrationsService.listMigrations).mockResolvedValueOnce([]);
+
+     const response = await server.tool("list-migrations").callback({});
+     const body = parseToolText(response);
+
+     // Exactly one service call, and the empty list comes back unchanged.
+     expect(migrationsService.listMigrations).toHaveBeenCalledTimes(1);
+     expect(body).toEqual([]);
+   });
+ });
+
+ // ── get-migration ──────────────────────────────────────────────────────────
+
+ describe("get-migration", () => {
+   it("calls getMigration with the provided ID", async () => {
+     // The returned shape is irrelevant here; only the forwarded ID is asserted.
+     const emptyCounts = {
+       total: 0, pending: 0, processing: 0, completed: 0, failed: 0,
+       installable: 0, installed: 0, partially_translated: 0, untranslatable: 0,
+     };
+     vi.mocked(migrationsService.getMigration).mockResolvedValueOnce({
+       id: MIGRATION_ID,
+       name: "test",
+       status: "ready",
+       created_at: "",
+       last_updated_at: "",
+       rules: emptyCounts,
+     });
+
+     const args = { migrationId: MIGRATION_ID };
+     await server.tool("get-migration").callback(args);
+
+     expect(migrationsService.getMigration).toHaveBeenCalledWith(MIGRATION_ID);
+   });
+ });
+
+ // ── get-translated-rules ───────────────────────────────────────────────────
+
+ describe("get-translated-rules", () => {
+   it("forwards pagination params to getTranslatedRules", async () => {
+     vi.mocked(migrationsService.getTranslatedRules).mockResolvedValueOnce({
+       data: [],
+       total: 0,
+     });
+
+     const args = {
+       migrationId: MIGRATION_ID,
+       vendor: "splunk",
+       page: 2,
+       perPage: 50,
+       filter: "status:completed",
+     };
+     await server.tool("get-translated-rules").callback(args);
+
+     // The vendor is consumed by the gate; only paging/filter reach the service.
+     const expectedOptions = { page: 2, perPage: 50, filter: "status:completed" };
+     expect(migrationsService.getTranslatedRules).toHaveBeenCalledWith(
+       MIGRATION_ID,
+       expectedOptions
+     );
+   });
+
+   it("returns vendorNotSupported for a non-Splunk vendor", async () => {
+     const args = { migrationId: MIGRATION_ID, vendor: "qradar" };
+     const response = await server.tool("get-translated-rules").callback(args);
+     const body = parseToolText<{ error: string; vendor: string }>(response);
+
+     expect(body).toEqual({ error: "vendorNotSupported", vendor: "qradar" });
+     expect(migrationsService.getTranslatedRules).not.toHaveBeenCalled();
+   });
+ });
+
+ // ── start-translation ──────────────────────────────────────────────────────
+
+ describe("start-translation", () => {
+   it("calls startTranslation and returns { status: 'started' }", async () => {
+     vi.mocked(migrationsService.startTranslation).mockResolvedValueOnce(undefined);
+
+     const args = { migrationId: MIGRATION_ID, vendor: "splunk" };
+     const response = await server.tool("start-translation").callback(args);
+     const body = parseToolText<{ status: string }>(response);
+
+     expect(migrationsService.startTranslation).toHaveBeenCalledWith(MIGRATION_ID);
+     expect(body.status).toBe("started");
+   });
+
+   it("returns vendorNotSupported for sentinel-one", async () => {
+     const args = { migrationId: MIGRATION_ID, vendor: "sentinel-one" };
+     const response = await server.tool("start-translation").callback(args);
+     const body = parseToolText<{ error: string; vendor: string }>(response);
+
+     // The gate answers on its own; the service must stay untouched.
+     expect(body).toEqual({ error: "vendorNotSupported", vendor: "sentinel-one" });
+     expect(migrationsService.startTranslation).not.toHaveBeenCalled();
+   });
+ });
+
+ // ── stop-translation ───────────────────────────────────────────────────────
+
+ describe("stop-translation", () => {
+   it("calls stopTranslation and returns { status: 'stopped' }", async () => {
+     vi.mocked(migrationsService.stopTranslation).mockResolvedValueOnce(undefined);
+
+     const args = { migrationId: MIGRATION_ID, vendor: "splunk" };
+     const response = await server.tool("stop-translation").callback(args);
+     const body = parseToolText<{ status: string }>(response);
+
+     expect(migrationsService.stopTranslation).toHaveBeenCalledWith(MIGRATION_ID);
+     expect(body.status).toBe("stopped");
+   });
+
+   it("returns vendorNotSupported for an unknown vendor", async () => {
+     // A vendor name that is not in any catalogue is gated like the known unsupported ones.
+     const args = { migrationId: MIGRATION_ID, vendor: "unknown-siem" };
+     const response = await server.tool("stop-translation").callback(args);
+     const body = parseToolText<{ error: string }>(response);
+
+     expect(body.error).toBe("vendorNotSupported");
+     expect(migrationsService.stopTranslation).not.toHaveBeenCalled();
+   });
+ });
+
+ // ── update-translated-rule ─────────────────────────────────────────────────
+
+ describe("update-translated-rule", () => {
+   it("parses elasticRule JSON and passes updates to service", async () => {
+     vi.mocked(migrationsService.updateTranslatedRule).mockResolvedValueOnce({
+       id: RULE_ID,
+       migration_id: MIGRATION_ID,
+       status: "completed",
+       translation_result: "partial",
+       original_rule: {},
+     });
+
+     // The tool receives the rule as a JSON string and must hand the service the decoded object.
+     const ruleObject = { name: "Fixed rule", type: "query" };
+     const args = {
+       migrationId: MIGRATION_ID,
+       ruleId: RULE_ID,
+       vendor: "splunk",
+       elasticRule: JSON.stringify(ruleObject),
+       translationResult: "partial",
+     };
+     await server.tool("update-translated-rule").callback(args);
+
+     const expectedUpdates = expect.objectContaining({
+       elastic_rule: ruleObject,
+       translation_result: "partial",
+     });
+     expect(migrationsService.updateTranslatedRule).toHaveBeenCalledWith(
+       MIGRATION_ID,
+       RULE_ID,
+       expectedUpdates
+     );
+   });
+
+   it("returns vendorNotSupported without calling service", async () => {
+     const args = { migrationId: MIGRATION_ID, ruleId: RULE_ID, vendor: "qradar" };
+     const response = await server.tool("update-translated-rule").callback(args);
+     const body = parseToolText<{ error: string }>(response);
+
+     expect(body.error).toBe("vendorNotSupported");
+     expect(migrationsService.updateTranslatedRule).not.toHaveBeenCalled();
+   });
+ });
+
+ // ── get-resources ──────────────────────────────────────────────────────────
+
+ describe("get-resources", () => {
+   it("calls getResources with migrationId", async () => {
+     vi.mocked(migrationsService.getResources).mockResolvedValueOnce([
+       { type: "macro", name: "my_macro", content: "| where true" },
+     ]);
+
+     const out = parseToolText(
+       await server.tool("get-resources").callback({
+         migrationId: MIGRATION_ID,
+         vendor: "splunk",
+       })
+     );
+
+     expect(migrationsService.getResources).toHaveBeenCalledWith(MIGRATION_ID);
+     expect(out).toHaveLength(1);
+   });
+
+   it("returns vendorNotSupported for non-Splunk", async () => {
+     const out = parseToolText<{ error: string }>(
+       await server.tool("get-resources").callback({
+         migrationId: MIGRATION_ID,
+         vendor: "qradar",
+       })
+     );
+
+     expect(out.error).toBe("vendorNotSupported");
+     // Same guarantee the other gated tools assert: the gate short-circuits
+     // before the service is reached.
+     expect(migrationsService.getResources).not.toHaveBeenCalled();
+   });
+ });
+
+ // ── upsert-resource ────────────────────────────────────────────────────────
+
+ describe("upsert-resource", () => {
+   it("calls upsertResources with a single-element array", async () => {
+     vi.mocked(migrationsService.upsertResources).mockResolvedValueOnce(undefined);
+
+     const args = {
+       migrationId: MIGRATION_ID,
+       vendor: "splunk",
+       type: "macro",
+       name: "splunk_macro",
+       content: "| eval x=1",
+     };
+     await server.tool("upsert-resource").callback(args);
+
+     // The tool takes one resource; the service API takes a list.
+     const expectedBatch = [{ type: "macro", name: "splunk_macro", content: "| eval x=1" }];
+     expect(migrationsService.upsertResources).toHaveBeenCalledWith(
+       MIGRATION_ID,
+       expectedBatch
+     );
+   });
+
+   it("returns vendorNotSupported for non-Splunk", async () => {
+     const args = {
+       migrationId: MIGRATION_ID,
+       vendor: "sentinel-one",
+       type: "macro",
+       name: "m",
+       content: "",
+     };
+     const response = await server.tool("upsert-resource").callback(args);
+     const body = parseToolText<{ error: string }>(response);
+
+     expect(body.error).toBe("vendorNotSupported");
+     expect(migrationsService.upsertResources).not.toHaveBeenCalled();
+   });
+ });
+
+ // ── install-rules ──────────────────────────────────────────────────────────
+
+ describe("install-rules", () => {
+   it("passes ids array to installRules", async () => {
+     vi.mocked(migrationsService.installRules).mockResolvedValueOnce({
+       installed: 2,
+       failed: 0,
+     });
+
+     const args = {
+       migrationId: MIGRATION_ID,
+       vendor: "splunk",
+       ids: ["r-1", "r-2"],
+     };
+     const response = await server.tool("install-rules").callback(args);
+     const body = parseToolText<{ installed: number; failed: number }>(response);
+
+     // The IDs travel inside an options object, and the service result is echoed back.
+     expect(migrationsService.installRules).toHaveBeenCalledWith(
+       MIGRATION_ID,
+       { ids: ["r-1", "r-2"] }
+     );
+     expect(body).toEqual({ installed: 2, failed: 0 });
+   });
+
+   it("returns vendorNotSupported for non-Splunk", async () => {
+     const args = { migrationId: MIGRATION_ID, vendor: "qradar" };
+     const response = await server.tool("install-rules").callback(args);
+     const body = parseToolText<{ error: string }>(response);
+
+     expect(body.error).toBe("vendorNotSupported");
+     expect(migrationsService.installRules).not.toHaveBeenCalled();
+   });
+ });
+
+ // ── get-stats ──────────────────────────────────────────────────────────────
+
+ describe("get-stats", () => {
+   it("calls getStats and returns the result (no vendor gate)", async () => {
+     const expectedStats = {
+       id: MIGRATION_ID,
+       status: "ready" as const,
+       rules: {
+         total: 5, pending: 5, processing: 0, completed: 0, failed: 0,
+         installable: 0, installed: 0, partially_translated: 0, untranslatable: 0,
+       },
+     };
+     vi.mocked(migrationsService.getStats).mockResolvedValueOnce(expectedStats);
+
+     // No `vendor` argument is supplied: this tool is expected to work without one.
+     const args = { migrationId: MIGRATION_ID };
+     const response = await server.tool("get-stats").callback(args);
+     const body = parseToolText(response);
+
+     expect(migrationsService.getStats).toHaveBeenCalledWith(MIGRATION_ID);
+     expect(body).toEqual(expectedStats);
+   });
+ });
+
+ // ── Vendor gate: undefined vendor is allowed ───────────────────────────────
+
+ // An omitted vendor must pass the gate and reach the service.
+ it("proceeds when vendor parameter is absent (defaults to Splunk path)", async () => {
+   vi.mocked(migrationsService.startTranslation).mockResolvedValueOnce(undefined);
+
+   const args = { migrationId: MIGRATION_ID };
+   const response = await server.tool("start-translation").callback(args);
+   const body = parseToolText<{ status: string }>(response);
+
+   expect(body.status).toBe("started");
+   expect(migrationsService.startTranslation).toHaveBeenCalled();
+ });
+});
From 1c3177978c7f43c42581f71146e184b25373aff2 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:28:10 +0200
Subject: [PATCH 21/42] feat: add migration workbench view with WorkbenchState
machine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
src/views/migration/App.tsx — full state machine:
WorkbenchState discriminated union (8 stages):
vendor-select → user picks vendor → creates migration
upload → paste Splunk rules JSON → upload + start translation
translating → polls get-stats every 3s → advances on completion
review → lists translated rules with status badges + fix actions
fix-rule-drawer → slide-over editor for single rule JSON + result enum
fix-resources-drawer → slide-over for macro/lookup create/update
install → confirmation step before calling install-rules
done → success summary with installed/failed counts
Vendor gate (5-LOC client check):
SUPPORTED_VENDORS = ["splunk"]
VENDOR_CATALOGUE entries not in SUPPORTED_VENDORS render as disabled
with "Coming soon" badge — re-enabling a vendor is a one-line change.
MCP integration:
All data via app.callServerTool() through the 10 app-only tools.
translating stage schedules a 3-second poll loop that stops and
transitions to review when stats.rules.processing === 0 and
stats.status is no longer "running".
Supporting files:
mcp-app.html — minimal HTML shell (title: "SIEM Migration")
mcp-app.tsx — standard React 18 createRoot mount
styles.css — vendor-grid, upload-area, progress-bar, rule status
badges, drawer layout
Co-Authored-By: Claude Sonnet 4.6
---
src/views/migration/App.tsx | 873 +++++++++++++++++++++++++++++++
src/views/migration/mcp-app.html | 12 +
src/views/migration/mcp-app.tsx | 12 +
src/views/migration/styles.css | 165 ++++++
4 files changed, 1062 insertions(+)
create mode 100644 src/views/migration/App.tsx
create mode 100644 src/views/migration/mcp-app.html
create mode 100644 src/views/migration/mcp-app.tsx
create mode 100644 src/views/migration/styles.css
diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
new file mode 100644
index 0000000..8add033
--- /dev/null
+++ b/src/views/migration/App.tsx
@@ -0,0 +1,873 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import React, { useState, useCallback, useEffect, useRef } from "react";
+import type { App as McpApp } from "@modelcontextprotocol/ext-apps";
+import { extractCallResult } from "../../shared/extract-tool-text";
+import {
+ AppHeader,
+ AppShell,
+ BackButton,
+ EmptyState,
+ KpiStrip,
+ KpiTile,
+ LoadingState,
+} from "../../shared/components";
+import { useFullscreen } from "../../shared/hooks/useFullscreen";
+import { useMcpApp } from "../../shared/hooks/useMcpApp";
+import "./styles.css";
+
+// ---------------------------------------------------------------------------
+// Local domain types (shapes returned by the app-only migration tools)
+// ---------------------------------------------------------------------------
+
+/**
+ * Payload the workbench expects from the "get-stats" tool: the migration id,
+ * its status string, and per-state rule counters. The poll loop reads
+ * `status` and `rules.processing` to decide when translation has finished.
+ */
+interface MigrationStats {
+ id: string;
+ status: string;
+ rules: {
+ total: number;
+ pending: number;
+ processing: number;
+ completed: number;
+ failed: number;
+ installable: number;
+ installed: number;
+ partially_translated: number;
+ untranslatable: number;
+ };
+}
+
+/**
+ * One rule as returned by "get-translated-rules" / "update-translated-rule":
+ * the original vendor rule plus, when present, its Elastic translation,
+ * the translation verdict, and free-text comments.
+ */
+interface TranslatedRule {
+ id: string;
+ status: string;
+ translation_result?: "full" | "partial" | "untranslatable";
+ original_rule: Record;
+ elastic_rule?: Record;
+ comments?: string[];
+}
+
+/** A macro or lookup attached to a migration; same fields the "upsert-resource" tool accepts. */
+interface MigrationResource {
+ type: "macro" | "lookup";
+ name: string;
+ content: string;
+}
+
+/** Counters read from the "install-rules" response and shown in the done stage. */
+interface InstallResult {
+ installed: number;
+ failed: number;
+}
+
+// ---------------------------------------------------------------------------
+// WorkbenchState discriminated union
+//
+// Each stage carries exactly the data it needs and no more. Transitions
+// mostly move forward through the pipeline; the two drawer stages return to
+// review, and reset() returns to vendor-select — no implicit shared state.
+// ---------------------------------------------------------------------------
+
+/**
+ * State of the migration workbench, discriminated by `stage`. Every variant
+ * lists the fields that stage's handlers and renderer read.
+ */
+export type WorkbenchState =
+ // Initial stage: nothing chosen yet.
+ | {
+ stage: "vendor-select";
+ }
+ // A migration has been created for the chosen vendor; waiting for rules.
+ | {
+ stage: "upload";
+ vendor: string;
+ migrationId: string;
+ }
+ // Translation is being polled; `stats` is null until a stats call succeeds.
+ | {
+ stage: "translating";
+ vendor: string;
+ migrationId: string;
+ stats: MigrationStats | null;
+ }
+ // Translated rules and resources are loaded for inspection.
+ | {
+ stage: "review";
+ vendor: string;
+ migrationId: string;
+ translations: TranslatedRule[];
+ resources: MigrationResource[];
+ }
+ // Review data plus the single rule currently being edited.
+ | {
+ stage: "fix-rule-drawer";
+ vendor: string;
+ migrationId: string;
+ translations: TranslatedRule[];
+ resources: MigrationResource[];
+ selectedRule: TranslatedRule;
+ }
+ // Review data while a macro/lookup is being created or updated.
+ | {
+ stage: "fix-resources-drawer";
+ vendor: string;
+ migrationId: string;
+ translations: TranslatedRule[];
+ resources: MigrationResource[];
+ }
+ // Install confirmation; note that `resources` is not carried into this stage.
+ | {
+ stage: "install";
+ vendor: string;
+ migrationId: string;
+ translations: TranslatedRule[];
+ }
+ // Final summary; only the install counters remain.
+ | {
+ stage: "done";
+ installed: number;
+ failed: number;
+ };
+
+// ---------------------------------------------------------------------------
+// Vendor catalogue — re-enabling a vendor is a one-line change here
+// ---------------------------------------------------------------------------
+
+// Vendor ids the UI lets the user pick. Typed as readonly string[] so that
+// `.includes()` accepts any catalogue id.
+const SUPPORTED_VENDORS: readonly string[] = ["splunk"];
+
+// Every vendor shown in the picker, supported or not, with its display label.
+const VENDOR_CATALOGUE = [
+ { id: "splunk", label: "Splunk" },
+ { id: "qradar", label: "IBM QRadar" },
+ { id: "sentinel-one", label: "Sentinel One" },
+] as const;
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/**
+ * Calls a server tool through the MCP app and JSON-decodes its text result.
+ *
+ * Never rejects: a thrown call, an empty text result, or text that is not
+ * valid JSON all yield `null` (failures are logged to the console with the
+ * tool name). The decoded value is cast, not validated, so callers must check
+ * the shape themselves — including tool-level error payloads, which decode
+ * like any other JSON.
+ */
+async function callTool(
+ app: McpApp,
+ name: string,
+ args: Record
+): Promise {
+ try {
+ const result = await app.callServerTool({ name, arguments: args });
+ const text = extractCallResult(result);
+ if (!text) return null;
+ return JSON.parse(text) as T;
+ } catch (e) {
+ console.error(`[migration] ${name} failed:`, e);
+ return null;
+ }
+}
+
+// ---------------------------------------------------------------------------
+// App
+// ---------------------------------------------------------------------------
+
+export function App() {
+ // Workbench stage plus two cross-stage UI flags: a busy indicator and the last error message.
+ const [state, setState] = useState({ stage: "vendor-select" });
+ const [loading, setLoading] = useState(false);
+ const [error, setError] = useState(null);
+
+ // For the translating stage: poll stats until translation completes
+ // Holds the handle of the pending poll timeout, or null when none is scheduled.
+ const pollTimerRef = useRef | null>(null);
+
+ // Cancels the pending poll timeout, if any. It cannot cancel a poll whose
+ // timeout has already fired and is awaiting its tool call.
+ const clearPoll = useCallback(() => {
+ if (pollTimerRef.current !== null) {
+ clearTimeout(pollTimerRef.current);
+ pollTimerRef.current = null;
+ }
+ }, []);
+
+ // Cleanup only: cancel any pending poll when the component unmounts.
+ useEffect(() => () => clearPoll(), [clearPoll]);
+
+ const { connected, getApp } = useMcpApp({
+ name: "migration",
+ version: "1.0.0",
+ onConnect: (_app, _gotResult) => {
+ // No initial data load needed — the workbench starts at vendor-select.
+ },
+ });
+
+ const fullscreen = useFullscreen(getApp);
+
+ // ── Stage transitions ──────────────────────────────────────────────────────
+
+ // vendor-select → upload: asks the server to create a migration named after
+ // today's date, then carries the returned id forward together with the vendor.
+ // A missing id (callTool returns null on any failure) is shown as an error.
+ // NOTE(review): "create-migration" is not among the 11 tool names asserted by
+ // the registration test in src/tools/migration.test.ts — confirm it is
+ // registered server-side. The chosen vendor is also not sent with this call.
+ const selectVendor = useCallback(
+   async (vendor: string) => {
+     const app = getApp();
+     if (!app) return;
+     setLoading(true);
+     setError(null);
+     try {
+       const today = new Date().toISOString().slice(0, 10);
+       const created = await callTool<{ migration_id: string }>(app, "create-migration", {
+         name: `Migration ${today}`,
+       });
+       const migrationId = created?.migration_id;
+       if (!migrationId) throw new Error("Failed to create migration");
+       setState({ stage: "upload", vendor, migrationId });
+     } catch (err) {
+       const message = err instanceof Error ? err.message : String(err);
+       setError(message);
+     } finally {
+       setLoading(false);
+     }
+   },
+   [getApp]
+ );
+
+ // upload → translating: parses the pasted rules, uploads them, starts the
+ // translation, fetches an initial stats snapshot and arms the poll loop.
+ //
+ // Two inputs are validated that used to pass silently:
+ //  - the pasted text must decode to a JSON array (it was only cast before);
+ //  - "start-translation" must answer { status: "started" }. callTool() turns
+ //    every failure into null and tool-level errors decode as ordinary JSON, so
+ //    without this check the UI would sit in the translating stage forever.
+ // NOTE(review): the "upload-rules" response shape is not visible from here,
+ // so its result is still not inspected — confirm and add a check.
+ //
+ // `schedulePoll` is declared after this callback and therefore cannot be
+ // listed in the dependency array (that would read it before initialization);
+ // it is only ever invoked after the first render has completed.
+ const uploadRules = useCallback(
+   async (rulesJson: string) => {
+     const app = getApp();
+     if (!app || state.stage !== "upload") return;
+     const { vendor, migrationId } = state;
+     setLoading(true);
+     setError(null);
+     try {
+       const rules: unknown = JSON.parse(rulesJson);
+       if (!Array.isArray(rules)) throw new Error("Rules must be a JSON array");
+       await callTool(app, "upload-rules", { migrationId, vendor, rules });
+       const started = await callTool<{ status?: string }>(app, "start-translation", {
+         migrationId,
+         vendor,
+       });
+       if (started?.status !== "started") throw new Error("Failed to start translation");
+       const stats = await callTool<MigrationStats>(app, "get-stats", { migrationId });
+       setState({ stage: "translating", vendor, migrationId, stats: stats ?? null });
+       schedulePoll(app, vendor, migrationId);
+     } catch (e) {
+       setError(e instanceof Error ? e.message : String(e));
+     } finally {
+       setLoading(false);
+     }
+   },
+   [getApp, state]
+ );
+
+ // Arms one 3-second poll of "get-stats". While translation is still running
+ // (or the stats call failed) the poll re-arms itself; once no rule is
+ // processing and the status is no longer "running" it loads the translated
+ // rules and resources and moves the workbench to the review stage.
+ //
+ // Cancellation: clearPoll() can only cancel a timeout that has not fired yet.
+ // To also stop a poll that is already awaiting its tool call, each run
+ // remembers its own timer handle and gives up if pollTimerRef no longer holds
+ // it (reset, unmount, or a newer schedulePoll replaced it). The final
+ // transition is additionally guarded so it can only replace the translating
+ // stage of the same migration, never a state the user has since moved to.
+ const schedulePoll = useCallback(
+   (app: McpApp, vendor: string, migrationId: string) => {
+     clearPoll();
+     const timer = setTimeout(async () => {
+       const stats = await callTool<MigrationStats>(app, "get-stats", { migrationId });
+       // Cancelled or superseded while the request was in flight.
+       if (pollTimerRef.current !== timer) return;
+
+       setState((prev) => {
+         if (prev.stage !== "translating") return prev;
+         return { ...prev, stats: stats ?? prev.stats };
+       });
+
+       const finished = stats !== null && stats.rules.processing === 0 && stats.status !== "running";
+       if (!finished) {
+         schedulePoll(app, vendor, migrationId);
+         return;
+       }
+
+       // Translation finished — load translated rules and resources, move to review
+       pollTimerRef.current = null;
+       const translationsRes = await callTool<{
+         data: TranslatedRule[];
+       }>(app, "get-translated-rules", { migrationId, vendor, perPage: 500 });
+       const resources =
+         (await callTool<MigrationResource[]>(app, "get-resources", {
+           migrationId,
+           vendor,
+         })) ?? [];
+       setState((prev) => {
+         if (prev.stage !== "translating" || prev.migrationId !== migrationId) return prev;
+         return {
+           stage: "review",
+           vendor,
+           migrationId,
+           translations: translationsRes?.data ?? [],
+           resources,
+         };
+       });
+     }, 3000);
+     pollTimerRef.current = timer;
+   },
+   [clearPoll]
+ );
+
+ // review → fix-rule-drawer: keeps all review data and records which rule is
+ // being edited. Ignored in any other stage.
+ const openRuleDrawer = useCallback((rule: TranslatedRule) => {
+   setState((prev) =>
+     prev.stage === "review"
+       ? { ...prev, stage: "fix-rule-drawer", selectedRule: rule }
+       : prev
+   );
+ }, []);
+
+ // fix-rule-drawer → review: sends the edited Elastic rule JSON and verdict to
+ // "update-translated-rule" and swaps the returned rule into the list.
+ //
+ // callTool() reports every failure as null and decodes tool-level error
+ // payloads like any other JSON, so the response is only accepted when it
+ // carries a string `id`. Otherwise the drawer stays open and an error is
+ // shown; a failed save used to close the drawer silently, and an error
+ // payload was spliced into the list as if it were a rule.
+ const saveRuleFix = useCallback(
+   async (elasticRuleJson: string, translationResult: "full" | "partial" | "untranslatable") => {
+     const app = getApp();
+     if (!app || state.stage !== "fix-rule-drawer") return;
+     const { vendor, migrationId, translations, resources, selectedRule } = state;
+     setLoading(true);
+     setError(null);
+     try {
+       const updated = await callTool<TranslatedRule>(
+         app,
+         "update-translated-rule",
+         { migrationId, ruleId: selectedRule.id, vendor, elasticRule: elasticRuleJson, translationResult }
+       );
+       if (!updated || typeof updated.id !== "string") {
+         throw new Error("Failed to update rule");
+       }
+       setState({
+         stage: "review",
+         vendor,
+         migrationId,
+         resources,
+         translations: translations.map((t) => (t.id === selectedRule.id ? updated : t)),
+       });
+     } catch (e) {
+       setError(e instanceof Error ? e.message : String(e));
+     } finally {
+       setLoading(false);
+     }
+   },
+   [getApp, state]
+ );
+
+ // review → fix-resources-drawer: same data, different stage tag. Ignored in
+ // any other stage.
+ const openResourcesDrawer = useCallback(() => {
+   setState((prev) =>
+     prev.stage === "review" ? { ...prev, stage: "fix-resources-drawer" } : prev
+   );
+ }, []);
+
+ // fix-resources-drawer → review: upserts one macro/lookup, then reloads the
+ // resource list.
+ //
+ // The upsert must answer { status: "ok" } (what the "upsert-resource" tool
+ // returns on success); anything else keeps the drawer open with an error
+ // instead of pretending the save worked. If the follow-up reload fails or
+ // returns something that is not an array, the previously loaded list is kept
+ // rather than being replaced by an empty one.
+ const saveResources = useCallback(
+   async (resource: MigrationResource) => {
+     const app = getApp();
+     if (!app || state.stage !== "fix-resources-drawer") return;
+     const { vendor, migrationId, translations, resources: previous } = state;
+     setLoading(true);
+     setError(null);
+     try {
+       const saved = await callTool<{ status?: string }>(app, "upsert-resource", {
+         migrationId,
+         vendor,
+         ...resource,
+       });
+       if (saved?.status !== "ok") throw new Error("Failed to save resource");
+       const refreshed = await callTool<MigrationResource[]>(app, "get-resources", {
+         migrationId,
+         vendor,
+       });
+       const resources = Array.isArray(refreshed) ? refreshed : previous;
+       setState({ stage: "review", vendor, migrationId, translations, resources });
+     } catch (e) {
+       setError(e instanceof Error ? e.message : String(e));
+     } finally {
+       setLoading(false);
+     }
+   },
+   [getApp, state]
+ );
+
+ // Either drawer → review, discarding unsaved edits. Only the two drawer
+ // stages are handled; in every other stage (including install) this is a
+ // no-op. The review state is rebuilt from the four fields it declares, so the
+ // rule drawer's `selectedRule` no longer leaks into it and no type assertion
+ // is needed.
+ const closeDrawer = useCallback(() => {
+   setState((prev) => {
+     if (prev.stage !== "fix-rule-drawer" && prev.stage !== "fix-resources-drawer") {
+       return prev;
+     }
+     const { vendor, migrationId, translations, resources } = prev;
+     return { stage: "review", vendor, migrationId, translations, resources };
+   });
+ }, []);
+
+ // review → install: carries over vendor, migration id and translations;
+ // resources are dropped. Ignored in any other stage.
+ const startInstall = useCallback(() => {
+   setState((prev) => {
+     if (prev.stage !== "review") return prev;
+     const { vendor, migrationId, translations } = prev;
+     return { stage: "install", vendor, migrationId, translations };
+   });
+ }, []);
+
+ // install → done: calls "install-rules" without an id list and shows the
+ // returned counters.
+ //
+ // callTool() reports every failure as null and decodes tool-level error
+ // payloads like any other JSON, so a response without a numeric `installed`
+ // is treated as a failed install: the stage is kept and an error is shown,
+ // instead of a "done" summary claiming 0 installed / 0 failed.
+ const confirmInstall = useCallback(async () => {
+   const app = getApp();
+   if (!app || state.stage !== "install") return;
+   const { vendor, migrationId } = state;
+   setLoading(true);
+   setError(null);
+   try {
+     const result = await callTool<InstallResult>(app, "install-rules", { migrationId, vendor });
+     if (!result || typeof result.installed !== "number") {
+       throw new Error("Failed to install rules");
+     }
+     setState({ stage: "done", installed: result.installed, failed: result.failed ?? 0 });
+   } catch (e) {
+     setError(e instanceof Error ? e.message : String(e));
+   } finally {
+     setLoading(false);
+   }
+ }, [getApp, state]);
+
+ // Any stage → vendor-select: cancels the pending poll, clears the error
+ // banner and starts the workbench over.
+ const reset = useCallback(() => {
+   clearPoll();
+   const initial: WorkbenchState = { stage: "vendor-select" };
+   setState(initial);
+   setError(null);
+ }, [clearPoll]);
+
+ // ── Render ─────────────────────────────────────────────────────────────────
+
+ // AppHeader expects { isFullscreen, onToggle } — useFullscreen returns { isFullscreen, toggle }
+ const fullscreenProp = { isFullscreen: fullscreen.isFullscreen, onToggle: fullscreen.toggle };
+
+ // Until the MCP connection is up, render only a connecting placeholder.
+ if (!connected) {
+ return (
+
+
+ Connecting to Elastic Security…
+
+ );
+ }
+
+ // Connected: header, optional dismissible error, then either the busy
+ // indicator (while `loading`) or the current stage's view.
+ return (
+
+
+ ) : undefined
+ }
+ />
+
+ {error && (
+
+ {error}
+ setError(null)}>
+ Dismiss
+
+
+ )}
+
+ {loading && Working… }
+
+ {!loading && renderStage(state, {
+ selectVendor,
+ uploadRules,
+ openRuleDrawer,
+ saveRuleFix,
+ openResourcesDrawer,
+ saveResources,
+ closeDrawer,
+ startInstall,
+ confirmInstall,
+ reset,
+ })}
+
+ );
+}
+
+// ---------------------------------------------------------------------------
+// Per-stage renderers (extracted to keep App() readable)
+// ---------------------------------------------------------------------------
+
+interface StageHandlers {
+ selectVendor: (vendor: string) => void;
+ uploadRules: (json: string) => void;
+ openRuleDrawer: (rule: TranslatedRule) => void;
+ saveRuleFix: (json: string, result: "full" | "partial" | "untranslatable") => void;
+ openResourcesDrawer: () => void;
+ saveResources: (resource: MigrationResource) => void;
+ closeDrawer: () => void;
+ startInstall: () => void;
+ confirmInstall: () => void;
+ reset: () => void;
+}
+
+function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode {
+ switch (state.stage) {
+ case "vendor-select":
+ return ;
+
+ case "upload":
+ return ;
+
+ case "translating":
+ return ;
+
+ case "review":
+ return (
+
+ );
+
+ case "fix-rule-drawer":
+ return (
+ <>
+
+
+ >
+ );
+
+ case "fix-resources-drawer":
+ return (
+ <>
+
+
+ >
+ );
+
+ case "install":
+ return (
+ t.translation_result !== "untranslatable").length}
+ onConfirm={h.confirmInstall}
+ onBack={h.closeDrawer}
+ />
+ );
+
+ case "done":
+ return ;
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Stage components
+// ---------------------------------------------------------------------------
+
+function VendorSelect({ onSelect }: { onSelect: (vendor: string) => void }) {
+ return (
+
+
Select your source SIEM
+
+ Choose the platform you are migrating detection rules from.
+
+
+ {VENDOR_CATALOGUE.map(({ id, label }) => {
+ const supported = SUPPORTED_VENDORS.includes(id);
+ return (
+ supported && onSelect(id)}
+ >
+ {label}
+ {!supported && Coming soon }
+
+ );
+ })}
+
+
+ );
+}
+
+function Upload({ vendor, onUpload }: { vendor: string; onUpload: (json: string) => void }) {
+ const [text, setText] = useState("");
+ return (
+
+
Upload {vendor} rules
+
+ Paste your exported {vendor} rules as a JSON array, then start translation.
+
+
+
+
onUpload(text)}
+ >
+ Upload & start translation
+
+
+ );
+}
+
+function Translating({ stats }: { stats: MigrationStats | null }) {
+ const rules = stats?.rules;
+ const pct = rules && rules.total > 0 ? Math.round(((rules.total - rules.pending) / rules.total) * 100) : 0;
+ return (
+
+
Translating rules…
+
+ The AI translator is converting your rules to Elastic detection rule format. This may take a few minutes.
+
+ {rules && (
+ <>
+
+
+
+
+
+
+
+
{pct}% complete
+ >
+ )}
+ {!rules &&
Waiting for translation to start… }
+
+ );
+}
+
+function Review({
+ translations,
+ resources,
+ onOpenRule,
+ onOpenResources,
+ onInstall,
+ dimmed,
+}: {
+ translations: TranslatedRule[];
+ resources: MigrationResource[];
+ onOpenRule: (rule: TranslatedRule) => void;
+ onOpenResources: () => void;
+ onInstall: () => void;
+ dimmed?: boolean;
+}) {
+ const installable = translations.filter(
+ (t) => t.translation_result && t.translation_result !== "untranslatable"
+ ).length;
+ const needsFix = translations.filter((t) => t.translation_result === "partial").length;
+
+ return (
+
+
+
Review translated rules
+
+ {resources.length > 0 && (
+
+ Fix resources ({resources.length})
+
+ )}
+
+ Install {installable} rules
+
+
+
+
+ {needsFix > 0 && (
+
+ {needsFix} rule{needsFix !== 1 ? "s" : ""} need manual review before installation.
+
+ )}
+
+ {translations.length === 0 ? (
+
No translated rules found.
+ ) : (
+
+ {translations.map((rule) => (
+ onOpenRule(rule)} />
+ ))}
+
+ )}
+
+ );
+}
+
+function RuleRow({ rule, onFix }: { rule: TranslatedRule; onFix: () => void }) {
+ const name =
+ (rule.elastic_rule?.name as string | undefined) ??
+ (rule.original_rule?.title as string | undefined) ??
+ rule.id;
+ return (
+
+
+
+ {name}
+
+ {(rule.translation_result === "partial" || !rule.elastic_rule) && (
+
+ Fix
+
+ )}
+
+ );
+}
+
+function TranslationBadge({ result }: { result?: string }) {
+ const cls = `migration-rule-status-badge migration-rule-status-badge--${result ?? "pending"}`;
+ const label = result ?? "pending";
+ return {label} ;
+}
+
+function RuleDrawer({
+ rule,
+ onSave,
+ onClose,
+}: {
+ rule: TranslatedRule;
+ onSave: (json: string, result: "full" | "partial" | "untranslatable") => void;
+ onClose: () => void;
+}) {
+ const [json, setJson] = useState(() =>
+ JSON.stringify(rule.elastic_rule ?? {}, null, 2)
+ );
+ const [result, setResult] = useState<"full" | "partial" | "untranslatable">(
+ rule.translation_result ?? "partial"
+ );
+
+ return (
+
+
+
Fix translated rule
+
+ ✕
+
+
+
+
+ Edit the Elastic rule JSON and select the translation quality.
+
+
+
+
+ Cancel
+
+ onSave(json, result)}
+ >
+ Save
+
+
+
+ );
+}
+
+function ResourcesDrawer({
+ resources,
+ onSave,
+ onClose,
+}: {
+ resources: MigrationResource[];
+ onSave: (resource: MigrationResource) => void;
+ onClose: () => void;
+}) {
+ const [name, setName] = useState("");
+ const [type, setType] = useState<"macro" | "lookup">("macro");
+ const [content, setContent] = useState("");
+
+ return (
+
+
+
Manage resources
+
+ ✕
+
+
+
+ {resources.length > 0 && (
+
+
Existing resources
+ {resources.map((r) => (
+
+ {r.type}
+ {r.name}
+
+ ))}
+
+ )}
+
Add / update resource
+
+ setType(e.target.value as typeof type)}
+ >
+ Macro
+ Lookup
+
+ setName(e.target.value)}
+ />
+
+
+
+
+ Close
+
+ onSave({ type, name: name.trim(), content })}
+ >
+ Save resource
+
+
+
+ );
+}
+
+function Install({
+ count,
+ onConfirm,
+ onBack,
+}: {
+ count: number;
+ onConfirm: () => void;
+ onBack: () => void;
+}) {
+ return (
+
+
Install {count} rules
+
+ The translated rules will be installed as disabled detection rules in Elastic Security.
+ You can enable them after reviewing their configuration.
+
+
+
+ Back to review
+
+
+ Confirm install
+
+
+
+ );
+}
+
+function Done({
+ installed,
+ failed,
+ onReset,
+}: {
+ installed: number;
+ failed: number;
+ onReset: () => void;
+}) {
+ return (
+
+
✓
+
Migration complete
+
0 ? 2 : 1}>
+
+ {failed > 0 && }
+
+
+ Rules have been installed as disabled. Navigate to Detection Rules to enable and tune them.
+
+
+ Start another migration
+
+
+ );
+}
diff --git a/src/views/migration/mcp-app.html b/src/views/migration/mcp-app.html
new file mode 100644
index 0000000..69fe301
--- /dev/null
+++ b/src/views/migration/mcp-app.html
@@ -0,0 +1,12 @@
+
+
+
+
+
+ SIEM Migration
+
+
+
+
+
+
diff --git a/src/views/migration/mcp-app.tsx b/src/views/migration/mcp-app.tsx
new file mode 100644
index 0000000..7251dbf
--- /dev/null
+++ b/src/views/migration/mcp-app.tsx
@@ -0,0 +1,12 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import React from "react";
+import { createRoot } from "react-dom/client";
+import { App } from "./App";
+
+createRoot(document.getElementById("root")!).render( );
diff --git a/src/views/migration/styles.css b/src/views/migration/styles.css
new file mode 100644
index 0000000..2e3cbc4
--- /dev/null
+++ b/src/views/migration/styles.css
@@ -0,0 +1,165 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/* Migration workbench — view-specific overrides */
+
+.migration-vendor-grid {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
+ gap: 12px;
+ margin-top: 24px;
+}
+
+.migration-vendor-card {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ gap: 8px;
+ padding: 20px 16px;
+ border: 1px solid var(--border-color, #d4d4d4);
+ border-radius: 8px;
+ background: var(--surface-color, #fff);
+ cursor: pointer;
+ transition: border-color 0.15s, box-shadow 0.15s;
+}
+
+.migration-vendor-card:hover:not(.migration-vendor-card--disabled) {
+ border-color: var(--accent-color, #0077cc);
+ box-shadow: 0 0 0 2px var(--accent-color-alpha, rgba(0, 119, 204, 0.15));
+}
+
+.migration-vendor-card--disabled {
+ opacity: 0.5;
+ cursor: default;
+}
+
+.migration-vendor-label {
+ font-size: 14px;
+ font-weight: 500;
+}
+
+.migration-vendor-badge {
+ font-size: 11px;
+ color: var(--text-muted, #737373);
+}
+
+.migration-upload-area {
+ border: 2px dashed var(--border-color, #d4d4d4);
+ border-radius: 8px;
+ padding: 40px;
+ text-align: center;
+ margin: 16px 0;
+ transition: border-color 0.15s;
+}
+
+.migration-upload-area:hover {
+ border-color: var(--accent-color, #0077cc);
+}
+
+.migration-progress-bar-track {
+ height: 6px;
+ background: var(--surface-subtle, #f0f0f0);
+ border-radius: 3px;
+ overflow: hidden;
+ margin: 8px 0;
+}
+
+.migration-progress-bar-fill {
+ height: 100%;
+ background: var(--accent-color, #0077cc);
+ border-radius: 3px;
+ transition: width 0.4s ease;
+}
+
+.migration-rule-status-badge {
+ display: inline-flex;
+ align-items: center;
+ gap: 4px;
+ padding: 2px 8px;
+ border-radius: 12px;
+ font-size: 11px;
+ font-weight: 500;
+ text-transform: capitalize;
+}
+
+.migration-rule-status-badge--full {
+ background: #d1fae5;
+ color: #065f46;
+}
+
+.migration-rule-status-badge--partial {
+ background: #fef3c7;
+ color: #92400e;
+}
+
+.migration-rule-status-badge--untranslatable {
+ background: #fee2e2;
+ color: #991b1b;
+}
+
+.migration-rule-status-badge--pending {
+ background: #f0f0f0;
+ color: #525252;
+}
+
+.migration-drawer {
+ position: fixed;
+ right: 0;
+ top: 0;
+ bottom: 0;
+ width: 520px;
+ max-width: 100vw;
+ background: var(--surface-color, #fff);
+ border-left: 1px solid var(--border-color, #d4d4d4);
+ box-shadow: -4px 0 16px rgba(0, 0, 0, 0.08);
+ display: flex;
+ flex-direction: column;
+ z-index: 100;
+}
+
+.migration-drawer-header {
+ display: flex;
+ align-items: center;
+ justify-content: space-between;
+ padding: 16px 20px;
+ border-bottom: 1px solid var(--border-color, #d4d4d4);
+}
+
+.migration-drawer-body {
+ flex: 1;
+ overflow: auto;
+ padding: 20px;
+}
+
+.migration-drawer-footer {
+ padding: 16px 20px;
+ border-top: 1px solid var(--border-color, #d4d4d4);
+ display: flex;
+ gap: 8px;
+ justify-content: flex-end;
+}
+
+.migration-rule-json-editor {
+ width: 100%;
+ font-family: "Fira Code", "Cascadia Code", monospace;
+ font-size: 12px;
+ line-height: 1.5;
+ border: 1px solid var(--border-color, #d4d4d4);
+ border-radius: 4px;
+ padding: 8px;
+ resize: vertical;
+ min-height: 240px;
+ background: var(--surface-subtle, #fafafa);
+}
+
+.migration-resource-row {
+ display: flex;
+ gap: 8px;
+ align-items: flex-start;
+ padding: 8px 0;
+ border-bottom: 1px solid var(--border-color, #e5e5e5);
+}
From be15e3404dcba43b0db3bd51da032263540092e1 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:31:41 +0200
Subject: [PATCH 22/42] feat: tighten vendor-select gate to use opacity-50
cursor-not-allowed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Replaces the custom migration-vendor-card--disabled CSS class with the
spec-required Tailwind utilities (opacity-50 + cursor-not-allowed) so the
disabled state is expressed as two atomic utility classes rather than a
bespoke rule, and removes the now-unused CSS block from styles.css.
The client-side gate remains ≤5 LOC:
const active = SUPPORTED_VENDORS.includes(id); // 1 LOC check
disabled={!active} // 1 LOC DOM attr
onClick={() => active && onSelect(id)} // 1 LOC guard
Re-enabling a vendor is still a one-line change to SUPPORTED_VENDORS.
Co-Authored-By: Claude Sonnet 4.6
---
src/views/migration/App.tsx | 12 +++++++-----
src/views/migration/styles.css | 4 ----
2 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index 8add033..ffd952c 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -519,16 +519,18 @@ function VendorSelect({ onSelect }: { onSelect: (vendor: string) => void }) {
{VENDOR_CATALOGUE.map(({ id, label }) => {
- const supported = SUPPORTED_VENDORS.includes(id);
+ // ≤5-LOC client-side gate: only Splunk is production-ready.
+ // Add a vendor to SUPPORTED_VENDORS to re-enable it.
+ const active = SUPPORTED_VENDORS.includes(id);
return (
supported && onSelect(id)}
+ className={`migration-vendor-card${active ? "" : " opacity-50 cursor-not-allowed"}`}
+ disabled={!active}
+ onClick={() => active && onSelect(id)}
>
{label}
- {!supported && Coming soon }
+ {!active && Coming soon }
);
})}
diff --git a/src/views/migration/styles.css b/src/views/migration/styles.css
index 2e3cbc4..e9ac84e 100644
--- a/src/views/migration/styles.css
+++ b/src/views/migration/styles.css
@@ -32,10 +32,6 @@
box-shadow: 0 0 0 2px var(--accent-color-alpha, rgba(0, 119, 204, 0.15));
}
-.migration-vendor-card--disabled {
- opacity: 0.5;
- cursor: default;
-}
.migration-vendor-label {
font-size: 14px;
From b109d6f45b65676e98a549df2bbf6b74826780fe Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:33:53 +0200
Subject: [PATCH 23/42] feat: implement upload step with file input,
drag-and-drop, and start-translation call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Upload component now offers three input paths:
1. File picker — hidden file input (accepting .json) wired to a
visible "Choose file…" button; FileReader populates the textarea
2. Drag-and-drop — drop zone tracks dragOver state for visual feedback
(border-blue-400 bg-blue-50) and reads the dropped file via FileReader
3. Paste — textarea remains for direct JSON pasting
"Upload & start translation" button stays disabled until text is non-empty.
Clicking it calls onUpload(text) which runs the chain in App:
upload-rules → start-translation → get-stats → translating stage
Co-Authored-By: Claude Sonnet 4.6
---
src/views/migration/App.tsx | 52 ++++++++++++++++++++++++++++++++++---
1 file changed, 48 insertions(+), 4 deletions(-)
diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index ffd952c..020c6a0 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -541,22 +541,66 @@ function VendorSelect({ onSelect }: { onSelect: (vendor: string) => void }) {
function Upload({ vendor, onUpload }: { vendor: string; onUpload: (json: string) => void }) {
const [text, setText] = useState("");
+ const [dragOver, setDragOver] = useState(false);
+ const fileInputRef = React.useRef(null);
+
+ const readFile = (file: File) => {
+ const reader = new FileReader();
+ reader.onload = (e) => setText((e.target?.result as string | null) ?? "");
+ reader.readAsText(file);
+ };
+
+ const handleDrop = (e: React.DragEvent) => {
+ e.preventDefault();
+ setDragOver(false);
+ const file = e.dataTransfer.files[0];
+ if (file) readFile(file);
+ };
+
return (
Upload {vendor} rules
- Paste your exported {vendor} rules as a JSON array, then start translation.
+ Drop a JSON export file, use the file picker, or paste the rules array directly.
-
+
+ {/* Hidden file input wired to the drop zone button */}
+
{
+ const file = e.target.files?.[0];
+ if (file) readFile(file);
+ e.target.value = "";
+ }}
+ />
+
+
{ e.preventDefault(); setDragOver(true); }}
+ onDragLeave={() => setDragOver(false)}
+ onDrop={handleDrop}
+ >
+
fileInputRef.current?.click()}
+ >
+ Choose file…
+
+
or drop a .json file here, or paste below
+
onUpload(text)}
>
From 8edcbbe79dddcfde37fc4be2db80955c1df83d62 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:37:58 +0200
Subject: [PATCH 24/42] feat: translating step now polls get-migration instead
of get-stats
schedulePoll replaces get-stats with get-migration so progress tracking
uses Kibana's authoritative lifecycle status ("ready" | "running" |
"finished" | "error") rather than the derived stats endpoint.
Completion condition changed from:
stats.rules.processing === 0 && stats.status !== "running"
to:
migration.status === "finished" || migration.status === "error"
This is both more precise (avoids a brief window where processing can
be 0 mid-run) and aligns with the Kibana status contract.
MigrationStats type gains the narrowed status union and an optional name
field so the same shape works for both get-migration and get-stats
responses without a separate type.
Translating component gains an error-state branch: when status is "error"
the heading says "Translation encountered an error" and the progress bar
is hidden, letting the workbench advance to review with whatever partial
results Kibana returned.
Co-Authored-By: Claude Sonnet 4.6
---
src/views/migration/App.tsx | 51 +++++++++++++++++++++++--------------
1 file changed, 32 insertions(+), 19 deletions(-)
diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index 020c6a0..04ccc8b 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -27,7 +27,9 @@ import "./styles.css";
interface MigrationStats {
id: string;
- status: string;
+ name?: string;
+ /** Lifecycle status returned by get-migration. */
+ status: "ready" | "running" | "finished" | "error" | string;
rules: {
total: number;
pending: number;
@@ -230,22 +232,21 @@ export function App() {
(app: McpApp, vendor: string, migrationId: string) => {
clearPoll();
pollTimerRef.current = setTimeout(async () => {
- const stats = await callTool(app, "get-stats", { migrationId });
+ // Use get-migration (not get-stats) so we get the strongly-typed status
+ // field ("ready" | "running" | "finished" | "error") alongside the rule counts.
+ const migration = await callTool(app, "get-migration", { migrationId });
setState((prev) => {
if (prev.stage !== "translating") return prev;
- return { ...prev, stats: stats ?? prev.stats };
+ return { ...prev, stats: migration ?? prev.stats };
});
- if (stats && stats.rules.processing === 0 && stats.status !== "running") {
- // Translation finished — load translated rules and resources, move to review
+ // Translation is complete when Kibana sets status to "finished" or "error".
+ if (migration && (migration.status === "finished" || migration.status === "error")) {
void (async () => {
- const translationsRes = await callTool<{
- data: TranslatedRule[];
- }>(app, "get-translated-rules", { migrationId, vendor, perPage: 500 });
+ const translationsRes = await callTool<{ data: TranslatedRule[] }>(
+ app, "get-translated-rules", { migrationId, vendor, perPage: 500 }
+ );
const resources =
- (await callTool(app, "get-resources", {
- migrationId,
- vendor,
- })) ?? [];
+ (await callTool(app, "get-resources", { migrationId, vendor })) ?? [];
setState({
stage: "review",
vendor,
@@ -612,12 +613,20 @@ function Upload({ vendor, onUpload }: { vendor: string; onUpload: (json: string)
function Translating({ stats }: { stats: MigrationStats | null }) {
const rules = stats?.rules;
- const pct = rules && rules.total > 0 ? Math.round(((rules.total - rules.pending) / rules.total) * 100) : 0;
+ const done = stats?.rules.total ?? 0;
+ const pending = rules?.pending ?? 0;
+ const pct = done > 0 ? Math.round(((done - pending) / done) * 100) : 0;
+ const isError = stats?.status === "error";
+
return (
-
Translating rules…
+
+ {isError ? "Translation encountered an error" : "Translating rules…"}
+
- The AI translator is converting your rules to Elastic detection rule format. This may take a few minutes.
+ {isError
+ ? "Some rules could not be translated. Loading results…"
+ : "The AI translator is converting your rules to Elastic detection rule format. This may take a few minutes."}
{rules && (
<>
@@ -627,10 +636,14 @@ function Translating({ stats }: { stats: MigrationStats | null }) {
-
-
{pct}% complete
+ {!isError && (
+ <>
+
+
{pct}% complete
+ >
+ )}
>
)}
{!rules &&
Waiting for translation to start… }
From db5e4c3ec02d5904b4f681c26abdfe0e083135d2 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:43:45 +0200
Subject: [PATCH 25/42] feat: review step renders three-column diff (SPL |
generated | editable Monaco)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Review step now expands any rule row inline to show RuleDiff — a three-column
panel that renders the full diff/fix UX without leaving the review list:
Left — Original SPL (plain read-only code block): shows rule.original_rule.search,
then rule.original_rule.spl, or falls back to full original_rule JSON if both are absent.
Middle — Generated Elastic rule JSON (read-only Monaco, language=json):
shows the rule.elastic_rule output from the AI translator.
Right — User-editable version (Monaco, language=json): seeded from the
generated JSON, editable by the reviewer, saved via update-translated-rule.
Footer bar: translation-result enum selector + Cancel / Save buttons.
Clicking a rule row toggles the inline diff; clicking again or Cancel collapses.
A "Drawer" button remains for partial/untranslatable rules that need the full
slide-over editor.
saveRuleInline callback in App handles update-translated-rule from the review
state directly, bypassing the fix-rule-drawer state transition.
monaco-environment.ts added (mirrors threat-hunt) so the inlined bundle can
resolve the editor worker without fetching external chunks.
Co-Authored-By: Claude Sonnet 4.6
---
src/views/migration/App.tsx | 212 +++++++++++++++++++++-
src/views/migration/mcp-app.tsx | 1 +
src/views/migration/monaco-environment.ts | 26 +++
src/views/migration/styles.css | 59 ++++++
4 files changed, 289 insertions(+), 9 deletions(-)
create mode 100644 src/views/migration/monaco-environment.ts
diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index 04ccc8b..3177370 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -5,7 +5,9 @@
* 2.0.
*/
-import React, { useState, useCallback, useEffect, useRef } from "react";
+import React, { useState, useCallback, useEffect, useMemo, useRef } from "react";
+import Editor from "@monaco-editor/react";
+import type { editor } from "monaco-editor";
import type { App as McpApp } from "@modelcontextprotocol/ext-apps";
import { extractCallResult } from "../../shared/extract-tool-text";
import {
@@ -301,6 +303,41 @@ export function App() {
[getApp, state]
);
+ const saveRuleInline = useCallback(
+ async (
+ ruleId: string,
+ elasticRuleJson: string,
+ translationResult: "full" | "partial" | "untranslatable"
+ ) => {
+ const app = getApp();
+ if (!app || state.stage !== "review") return;
+ const { vendor, migrationId, translations, resources } = state;
+ setLoading(true);
+ setError(null);
+ try {
+ const updated = await callTool(app, "update-translated-rule", {
+ migrationId,
+ ruleId,
+ vendor,
+ elasticRule: elasticRuleJson,
+ translationResult,
+ });
+ setState({
+ stage: "review",
+ vendor,
+ migrationId,
+ resources,
+ translations: translations.map((t) => (t.id === ruleId ? (updated ?? t) : t)),
+ });
+ } catch (e) {
+ setError(e instanceof Error ? e.message : String(e));
+ } finally {
+ setLoading(false);
+ }
+ },
+ [getApp, state]
+ );
+
const openResourcesDrawer = useCallback(() => {
setState((prev) => {
if (prev.stage !== "review") return prev;
@@ -413,6 +450,7 @@ export function App() {
uploadRules,
openRuleDrawer,
saveRuleFix,
+ saveRuleInline,
openResourcesDrawer,
saveResources,
closeDrawer,
@@ -433,6 +471,7 @@ interface StageHandlers {
uploadRules: (json: string) => void;
openRuleDrawer: (rule: TranslatedRule) => void;
saveRuleFix: (json: string, result: "full" | "partial" | "untranslatable") => void;
+ saveRuleInline: (id: string, json: string, result: "full" | "partial" | "untranslatable") => void;
openResourcesDrawer: () => void;
saveResources: (resource: MigrationResource) => void;
closeDrawer: () => void;
@@ -458,6 +497,7 @@ function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode {
translations={state.translations}
resources={state.resources}
onOpenRule={h.openRuleDrawer}
+ onSaveRule={h.saveRuleInline}
onOpenResources={h.openResourcesDrawer}
onInstall={h.startInstall}
/>
@@ -470,6 +510,7 @@ function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode {
translations={state.translations}
resources={state.resources}
onOpenRule={h.openRuleDrawer}
+ onSaveRule={h.saveRuleInline}
onOpenResources={h.openResourcesDrawer}
onInstall={h.startInstall}
dimmed
@@ -485,6 +526,7 @@ function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode {
translations={state.translations}
resources={state.resources}
onOpenRule={h.openRuleDrawer}
+ onSaveRule={h.saveRuleInline}
onOpenResources={h.openResourcesDrawer}
onInstall={h.startInstall}
dimmed
@@ -655,6 +697,7 @@ function Review({
translations,
resources,
onOpenRule,
+ onSaveRule,
onOpenResources,
onInstall,
dimmed,
@@ -662,15 +705,21 @@ function Review({
translations: TranslatedRule[];
resources: MigrationResource[];
onOpenRule: (rule: TranslatedRule) => void;
+ onSaveRule: (id: string, json: string, result: "full" | "partial" | "untranslatable") => void;
onOpenResources: () => void;
onInstall: () => void;
dimmed?: boolean;
}) {
+ const [expandedId, setExpandedId] = useState(null);
+
const installable = translations.filter(
(t) => t.translation_result && t.translation_result !== "untranslatable"
).length;
const needsFix = translations.filter((t) => t.translation_result === "partial").length;
+ const toggleExpand = (id: string) =>
+ setExpandedId((prev) => (prev === id ? null : id));
+
return (
@@ -705,7 +754,24 @@ function Review({
) : (
{translations.map((rule) => (
-
onOpenRule(rule)} />
+
+ toggleExpand(rule.id)}
+ onOpenDrawer={() => onOpenRule(rule)}
+ />
+ {expandedId === rule.id && (
+ {
+ onSaveRule(rule.id, json, result);
+ setExpandedId(null);
+ }}
+ onCancel={() => setExpandedId(null)}
+ />
+ )}
+
))}
)}
@@ -713,22 +779,150 @@ function Review({
);
}
-function RuleRow({ rule, onFix }: { rule: TranslatedRule; onFix: () => void }) {
+function RuleRow({
+ rule,
+ expanded,
+ onToggle,
+ onOpenDrawer,
+}: {
+ rule: TranslatedRule;
+ expanded: boolean;
+ onToggle: () => void;
+ onOpenDrawer: () => void;
+}) {
const name =
(rule.elastic_rule?.name as string | undefined) ??
(rule.original_rule?.title as string | undefined) ??
rule.id;
return (
-
+
{name}
- {(rule.translation_result === "partial" || !rule.elastic_rule) && (
-
- Fix
-
- )}
+
e.stopPropagation()}>
+ {(rule.translation_result === "partial" || !rule.elastic_rule) && (
+ { e.stopPropagation(); onOpenDrawer(); }}
+ >
+ Drawer
+
+ )}
+ {expanded ? "▲" : "▼"}
+
+
+ );
+}
+
+// ---------------------------------------------------------------------------
+// Three-column diff panel (inline within the review step)
+// ---------------------------------------------------------------------------
+
+const MONACO_OPTIONS_RO: editor.IStandaloneEditorConstructionOptions = {
+ readOnly: true,
+ minimap: { enabled: false },
+ scrollBeyondLastLine: false,
+ lineNumbers: "off",
+ glyphMargin: false,
+ folding: false,
+ renderLineHighlight: "none",
+ wordWrap: "on",
+ automaticLayout: true,
+ fontSize: 12,
+};
+
+const MONACO_OPTIONS_EDIT: editor.IStandaloneEditorConstructionOptions = {
+ ...MONACO_OPTIONS_RO,
+ readOnly: false,
+ lineNumbers: "on",
+};
+
+function RuleDiff({
+ rule,
+ onSave,
+ onCancel,
+}: {
+ rule: TranslatedRule;
+ onSave: (json: string, result: "full" | "partial" | "untranslatable") => void;
+ onCancel: () => void;
+}) {
+ const [editedJson, setEditedJson] = useState(() =>
+ JSON.stringify(rule.elastic_rule ?? {}, null, 2)
+ );
+ const [result, setResult] = useState<"full" | "partial" | "untranslatable">(
+ rule.translation_result ?? "partial"
+ );
+
+ const originalSpl = useMemo(() => {
+ const r = rule.original_rule;
+ return (r.search as string | undefined) ?? (r.spl as string | undefined) ??
+ JSON.stringify(r, null, 2);
+ }, [rule.original_rule]);
+
+ const generatedJson = useMemo(
+ () => JSON.stringify(rule.elastic_rule ?? {}, null, 2),
+ [rule.elastic_rule]
+ );
+
+ return (
+
+
+ {/* Left: original SPL (read-only code block) */}
+
+
Original SPL
+
{originalSpl}
+
+
+ {/* Middle: generated Elastic rule JSON (read-only Monaco) */}
+
+
Generated (read-only)
+
+
+
+ {/* Right: user-editable Elastic rule JSON (Monaco) */}
+
+
Edit
+
setEditedJson(v ?? "")}
+ />
+
+
+
+
+
setResult(e.target.value as typeof result)}
+ >
+ Full — production-ready
+ Partial — needs tuning
+ Untranslatable — skip
+
+
+
+ Cancel
+
+ onSave(editedJson, result)}
+ >
+ Save
+
+
+
);
}
diff --git a/src/views/migration/mcp-app.tsx b/src/views/migration/mcp-app.tsx
index 7251dbf..85bf167 100644
--- a/src/views/migration/mcp-app.tsx
+++ b/src/views/migration/mcp-app.tsx
@@ -5,6 +5,7 @@
* 2.0.
*/
+import "./monaco-environment";
import React from "react";
import { createRoot } from "react-dom/client";
import { App } from "./App";
diff --git a/src/views/migration/monaco-environment.ts b/src/views/migration/monaco-environment.ts
new file mode 100644
index 0000000..744f2a0
--- /dev/null
+++ b/src/views/migration/monaco-environment.ts
@@ -0,0 +1,26 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import * as monaco from "monaco-editor";
+import { loader } from "@monaco-editor/react";
+import EditorWorker from "monaco-editor/esm/vs/editor/editor.worker?worker&inline";
+
+/**
+ * The view ships as a single inlined HTML bundle (vite-plugin-singlefile).
+ * Workers and JS chunks are not reachable at runtime, so:
+ *
+ * - `?worker&inline` base64-inlines the editor worker into the bundle.
+ * - `loader.config({ monaco })` makes @monaco-editor/react use the
+ * locally-bundled monaco instead of fetching it from the CDN.
+ */
+(globalThis as unknown as { MonacoEnvironment: { getWorker: (...args: unknown[]) => Worker } }).MonacoEnvironment = {
+ getWorker() {
+ return new EditorWorker();
+ },
+};
+
+loader.config({ monaco });
diff --git a/src/views/migration/styles.css b/src/views/migration/styles.css
index e9ac84e..40d0884 100644
--- a/src/views/migration/styles.css
+++ b/src/views/migration/styles.css
@@ -159,3 +159,62 @@
padding: 8px 0;
border-bottom: 1px solid var(--border-color, #e5e5e5);
}
+
+/* Three-column diff panel */
+
+.migration-diff-panel {
+ background: var(--surface-subtle, #fafafa);
+}
+
+.migration-diff-columns {
+ display: grid;
+ grid-template-columns: 1fr 1fr 1fr;
+ min-height: 320px;
+}
+
+.migration-diff-col {
+ display: flex;
+ flex-direction: column;
+ border-right: 1px solid var(--border-color, #e5e5e5);
+ overflow: hidden;
+}
+
+.migration-diff-col:last-child {
+ border-right: none;
+}
+
+.migration-diff-col-header {
+ padding: 6px 10px;
+ font-size: 11px;
+ font-weight: 600;
+ text-transform: uppercase;
+ letter-spacing: 0.04em;
+ color: var(--text-muted, #737373);
+ background: var(--surface-color, #fff);
+ border-bottom: 1px solid var(--border-color, #e5e5e5);
+ flex-shrink: 0;
+}
+
+.migration-diff-spl {
+ flex: 1;
+ margin: 0;
+ padding: 8px 10px;
+ font-family: "Fira Code", "Cascadia Code", monospace;
+ font-size: 11px;
+ line-height: 1.6;
+ white-space: pre-wrap;
+ word-break: break-all;
+ overflow: auto;
+ background: var(--surface-subtle, #fafafa);
+ color: var(--text-color, #171717);
+}
+
+.migration-diff-footer {
+ display: flex;
+ align-items: center;
+ justify-content: space-between;
+ padding: 10px 14px;
+ border-top: 1px solid var(--border-color, #e5e5e5);
+ background: var(--surface-color, #fff);
+ gap: 8px;
+}
From c99801a0f06351f2d034deff6da5768880921769 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski
Date: Fri, 15 May 2026 12:50:52 +0200
Subject: [PATCH 26/42] feat: per-rule drawer with ElasticRulePartial form and
Re-validate button
Replaces the bare JSON textarea in RuleDrawer with a structured form
covering the 7 key Elastic detection rule fields (name, description,
type, query, language, severity, risk_score). The Re-validate button
saves the current edits and marks the rule as "partial" via
update-translated-rule; Save uses the user-selected translation result.
Adds .migration-form-input CSS for consistent field styling.
Co-Authored-By: Claude Sonnet 4.6
---
src/views/migration/App.tsx | 194 ++++++++++++++++++++++++++++++---
src/views/migration/styles.css | 19 ++++
2 files changed, 196 insertions(+), 17 deletions(-)
diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index 3177370..ac4c855 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -933,6 +933,136 @@ function TranslationBadge({ result }: { result?: string }) {
return {label} ;
}
+// ---------------------------------------------------------------------------
+// ElasticRulePartial — key fields of an Elastic detection rule
+// ---------------------------------------------------------------------------
+
+/**
+ * Form state for the rule drawer: the seven Elastic detection rule fields the
+ * structured form covers, plus an index signature so any other keys of the
+ * source rule object can be carried along.
+ */
+interface ElasticRulePartial {
+ name: string;
+ description: string;
+ // The form offers: query, eql, esql, threshold, machine_learning, new_terms.
+ type: string;
+ query: string;
+ // The form offers: kuery, eql, esql, lucene.
+ language: string;
+ // The form offers: low, medium, high, critical.
+ severity: string;
+ // Edits made through the form are clamped to the 0-100 range.
+ risk_score: number;
+ // Pass-through for rule keys that are not modelled above.
+ [key: string]: unknown;
+}
+
+/**
+ * Builds the drawer's form state from a raw Elastic rule object.
+ *
+ * Every key of `raw` is carried over first, then the seven form fields are
+ * written on top with type-checked values, so the form never receives `null`,
+ * `undefined` or a wrongly-typed value for a field it renders.
+ *
+ * @param raw - Parsed rule object; may be empty or only partially filled.
+ * @returns Form state with all seven fields populated plus the extra keys of `raw`.
+ */
+function fromRuleJson(raw: Record<string, unknown>): ElasticRulePartial {
+  // Accept a value only when it really is a string; otherwise use the default.
+  const text = (value: unknown, fallback: string): string =>
+    typeof value === "string" ? value : fallback;
+
+  return {
+    // Spread FIRST: when spread last, a null or mistyped value present in `raw`
+    // overwrote the normalised field computed for the same key.
+    ...raw,
+    name: text(raw.name, ""),
+    description: text(raw.description, ""),
+    type: text(raw.type, "query"),
+    query: text(raw.query, ""),
+    language: text(raw.language, "kuery"),
+    severity: text(raw.severity, "medium"),
+    risk_score: typeof raw.risk_score === "number" ? raw.risk_score : 50,
+  };
+}
+
+/**
+ * Form for the editable fields of an Elastic detection rule.
+ *
+ * The component holds no state of its own: each visible change handler calls
+ * `onChange` with a single-key patch (`name`, `type`, `language`, `severity`
+ * or `risk_score`) and leaves merging to the caller. The `risk_score` entry is
+ * converted with `Number()` and clamped to the 0-100 range before it is reported.
+ *
+ * NOTE(review): the JSX element tags are not visible in this copy of the patch;
+ * the `description` and `query` controls and the bound `value` props presumably
+ * sit on the lines that appear blank here. Confirm against the real file.
+ * NOTE(review): a non-numeric `risk_score` entry would leave the clamp as `NaN`
+ * (Math.min/Math.max propagate NaN). Confirm the input element rules this out.
+ *
+ * @param fields - Current form values.
+ * @param onChange - Receives a partial update containing only the edited field.
+ */
+function ElasticRuleForm({
+ fields,
+ onChange,
+}: {
+ fields: ElasticRulePartial;
+ onChange: (patch: Partial) => void;
+}) {
+ return (
+
+
+ onChange({ name: e.target.value })}
+ />
+
+
+
+
+
+ onChange({ type: e.target.value })}
+ >
+ {["query", "eql", "esql", "threshold", "machine_learning", "new_terms"].map(
+ (t) => {t}
+ )}
+
+
+
+ onChange({ language: e.target.value })}
+ >
+ {["kuery", "eql", "esql", "lucene"].map(
+ (l) => {l}
+ )}
+
+
+
+
+
+
+
+ onChange({ severity: e.target.value })}
+ >
+ {["low", "medium", "high", "critical"].map(
+ (s) => {s}
+ )}
+
+
+
+ onChange({ risk_score: Math.min(100, Math.max(0, Number(e.target.value))) })}
+ />
+
+
+
+ );
+}
+
+/**
+ * Layout helper that renders `label` followed by the form control(s) passed as
+ * `children`.
+ *
+ * NOTE(review): the wrapping element tags are not visible in this copy of the
+ * patch; the optional `className` is presumably applied to the outer element.
+ * Confirm against the real file.
+ *
+ * @param label - Text shown for the row.
+ * @param className - Optional extra class name.
+ * @param children - The control(s) rendered after the label.
+ */
+function FormRow({
+ label,
+ className,
+ children,
+}: {
+ label: string;
+ className?: string;
+ children: React.ReactNode;
+}) {
+ return (
+
+ {label}
+ {children}
+
+ );
+}
+
+// ---------------------------------------------------------------------------
+// RuleDrawer — slide-over with ElasticRulePartial form
+// ---------------------------------------------------------------------------
+
function RuleDrawer({
rule,
onSave,
@@ -942,34 +1072,55 @@ function RuleDrawer({
onSave: (json: string, result: "full" | "partial" | "untranslatable") => void;
onClose: () => void;
}) {
- const [json, setJson] = useState(() =>
- JSON.stringify(rule.elastic_rule ?? {}, null, 2)
- );
+ const rawRule = rule.elastic_rule ?? {};
+ const [fields, setFields] = useState(() => fromRuleJson(rawRule));
const [result, setResult] = useState<"full" | "partial" | "untranslatable">(
rule.translation_result ?? "partial"
);
+ const [revalidating, setRevalidating] = useState(false);
+
+ const patch = (update: Partial) =>
+ setFields((prev) => ({ ...prev, ...update }));
+
+ const toJson = () => JSON.stringify({ ...rawRule, ...fields }, null, 2);
+
+ const handleRevalidate = async () => {
+ setRevalidating(true);
+ try {
+ // Save the current edits; caller persists via update-translated-rule
+ // and can determine a new translation result from the API response.
+ onSave(toJson(), "partial");
+ } finally {
+ setRevalidating(false);
+ }
+ };
+
+ const ruleName =
+ fields.name ||
+ (rule.original_rule?.title as string | undefined) ||
+ rule.id;
return (
-
Fix translated rule
-
+
+
{ruleName}
+
+
+
✕
+
-
- Edit the Elastic rule JSON and select the translation quality.
-
-