From 5bb73b5e5cb9f82b2ba4a17bc9c09d2485babeac Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 10:55:42 +0200
Subject: [PATCH 01/42] evals: add types.ts with Dataset, Example, EvalResult
 and related types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces the canonical TypeScript type definitions for the eval pipeline:
- `ToolCall` / `Trajectory` — MCP host loop output primitives
- `ExpectedBehavior` — optional `tools`, `criteria`, `skill` fields (evaluators
  return `'N/A'` when a field they need is absent)
- `Example` / `Dataset` — test-case and collection shapes
- `EvaluatorResult` / `EvalResult` — per-evaluator and per-example results
- `Evaluator` — async-compatible function contract all evaluator modules satisfy

Also adds `evals/**/*` to tsconfig.json includes so tsc covers eval files.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/types.ts | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 tsconfig.json  |  2 +-
 2 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 evals/types.ts
diff --git a/evals/types.ts b/evals/types.ts
new file mode 100644
index 0000000..4722075
--- /dev/null
+++ b/evals/types.ts
@@ -0,0 +1,75 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/** A single tool invocation captured during an MCP host loop run. */
+export interface ToolCall {
+  tool: string;
+  args: Record<string, unknown>;
+  result?: unknown;
+}
+
+/** Ordered sequence of tool calls produced by one eval run. */
+export type Trajectory = ToolCall[];
+
+/**
+ * What a passing run should look like.
+ * `tools` and `criteria` are both optional — evaluators that depend on them
+ * return `'N/A'` when the field is absent, so a dataset can omit whichever
+ * dimension is irrelevant for a given example.
+ */
+export interface ExpectedBehavior {
+  /** Ordered list of tool names the host should call. Used by trajectory / tool-selection evaluators. */
+  tools?: string[];
+  /** Natural-language assertions checked by the criteria (LLM-as-judge) evaluator. */
+  criteria?: string[];
+  /** Skill ID that should be activated. Used by the skill-activation evaluator. */
+  skill?: string;
+}
+
+/** One test case inside a dataset. */
+export interface Example {
+  /** Stable identifier — used as a key in result tables and CI summaries. */
+  id: string;
+  /** The user message sent to the LLM host at the start of the simulation. */
+  input: string;
+  expected: ExpectedBehavior;
+}
+
+/** A named collection of examples that can be loaded by the runner. */
+export interface Dataset {
+  name: string;
+  examples: Example[];
+}
+
+/**
+ * Output of a single evaluator for one example.
+ * `score` is a value in [0, 1] when the evaluator ran, or `'N/A'` when the
+ * evaluator skipped (e.g. `expected.tools` was absent for trajectory evaluator).
+ */
+export interface EvaluatorResult {
+  score: number | 'N/A';
+  /** Human-readable explanation of the score, required when score is numeric. */
+  reason?: string;
+}
+
+/** Aggregate result for one example after all evaluators have run. */
+export interface EvalResult {
+  exampleId: string;
+  input: string;
+  trajectory: Trajectory;
+  /** Keys are evaluator names (e.g. `'skill-activation'`, `'trajectory'`). */
+  evaluators: Record<string, EvaluatorResult>;
+}
+
+/**
+ * Contract every evaluator module must satisfy.
+ * Async to accommodate LLM-as-judge evaluators that call an LLM provider.
+ */
+export type Evaluator = (
+  trajectory: Trajectory,
+  expected: ExpectedBehavior
+) => EvaluatorResult | Promise<EvaluatorResult>;
diff --git a/tsconfig.json b/tsconfig.json
index 23b7968..5dc2901 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -14,6 +14,6 @@
     "lib": ["ES2022", "DOM", "DOM.Iterable"],
     "types": ["vitest/globals", "@testing-library/jest-dom"]
   },
-  "include": ["src/**/*", "main.ts", "vite.config.ts", "vitest.config.ts", "scripts/**/*"],
+  "include": ["src/**/*", "evals/**/*", "main.ts", "vite.config.ts", "vitest.config.ts", "scripts/**/*"],
   "exclude": ["node_modules", "dist"]
 }

From 06d830cee537349bf9be1b5f24188a83b9480668 Mon Sep 17 00:00:00 2001
From: patryks-treadmill <ao@patryks-treadmill.local>
Date: Fri, 15 May 2026 10:55:59 +0200
Subject: [PATCH 02/42] 
 ao(create-evals-types-ts-with-typescript-definitions--0): Create
 `evals/types.ts` with TypeScript definitions for `Dataset`, `Exam

Auto-committed by patryks-treadmill orchestrator.
plan=automatic-migration-mcp-app job=64319163-2da8-44b5-b087-3dee6e9e4c14 attempt=1
---
 package-lock.json | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 08e9dde..d34696e 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -5750,14 +5750,6 @@
         "node": ">= 18"
       }
     },
-    "node_modules/monaco-promql": {
-      "version": "1.8.0",
-      "resolved": "https://registry.npmjs.org/monaco-promql/-/monaco-promql-1.8.0.tgz",
-      "integrity": "sha512-XdgRojBzEe/rKtrJaHbSfoMFOMD5TXymDHIitTngmBT6XEjtAirnA7Rb2YJAO1SZrJfgvAo4LFCzJ71fH7+WOw==",
-      "license": "MIT",
-      "optional": true,
-      "peer": true
-    },
     "node_modules/ms": {
       "version": "2.1.3",
       "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",

From b3ad86e8c0ee4455c46237788006f9ec0b6a3ed6 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:02:10 +0200
Subject: [PATCH 03/42] evals: add runner.ts orchestrator, runMcpHostLoop stub,
 and eval vitest config

runner.ts exports `runDataset(dataset, evaluators, options?)` which:
- Wraps all examples in `describe.skipIf(!process.env.RUN_LLM_EVALS)` so
  regular `npm test` never makes LLM calls or requires API keys
- Creates one `it` per example: runs runMcpHostLoop, scores via evaluators,
  asserts numeric scores >= passingScore (default 0.5)
- Emits a Markdown table summary via afterAll for CI job summaries

runMcpHostLoop.ts is a typed stub (throws); full InMemoryTransport
implementation comes in the next commit.

evals/vitest.config.ts runs in node environment with 120 s timeout,
scoped to evals/**/*.{test,spec,eval}.ts and *.dataset.ts patterns.

Also:
- Adds `test:evals` script to package.json (cross-env RUN_LLM_EVALS=1)
- Adds evals/**/*.ts to eslint.config.js file patterns so eval files
  are linted and license-header-checked

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 eslint.config.js        |   1 +
 evals/runMcpHostLoop.ts |  19 +++++++
 evals/runner.ts         | 112 ++++++++++++++++++++++++++++++++++++++++
 evals/vitest.config.ts  |  24 +++++++++
 package.json            |   1 +
 5 files changed, 157 insertions(+)
 create mode 100644 evals/runMcpHostLoop.ts
 create mode 100644 evals/runner.ts
 create mode 100644 evals/vitest.config.ts

diff --git a/eslint.config.js b/eslint.config.js
index 382ca72..cde436c 100644
--- a/eslint.config.js
+++ b/eslint.config.js
@@ -16,6 +16,7 @@ export default tseslint.config(
     files: [
       'src/**/*.ts',
       'src/**/*.tsx',
+      'evals/**/*.ts',
       '*.ts',
       'scripts/**/*.js',
       '*.mjs',
diff --git a/evals/runMcpHostLoop.ts b/evals/runMcpHostLoop.ts
new file mode 100644
index 0000000..be80a6d
--- /dev/null
+++ b/evals/runMcpHostLoop.ts
@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Trajectory } from "./types.js";
+
+/**
+ * Simulates one MCP host loop turn using the SDK's InMemoryTransport and
+ * returns the ordered sequence of tool calls the LLM made.
+ *
+ * Full implementation lands in the next commit; this stub satisfies the
+ * import so runner.ts type-checks now.
+ */
+export async function runMcpHostLoop(_input: string): Promise<Trajectory> {
+  throw new Error("runMcpHostLoop is not yet implemented");
+}
diff --git a/evals/runner.ts b/evals/runner.ts
new file mode 100644
index 0000000..50035ca
--- /dev/null
+++ b/evals/runner.ts
@@ -0,0 +1,112 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { describe, it, expect, afterAll } from "vitest";
+import type { Dataset, EvalResult, EvaluatorResult, Evaluator } from "./types.js";
+import { runMcpHostLoop } from "./runMcpHostLoop.js";
+
+export interface RunnerOptions {
+  /** Minimum numeric score [0–1] for a test to pass. Defaults to 0.5. */
+  passingScore?: number;
+}
+
+/**
+ * Registers a Vitest suite for every example in `dataset`.
+ *
+ * The entire suite is skipped unless `RUN_LLM_EVALS=1` is set in the
+ * environment, so regular `npm test` incurs zero LLM cost.
+ *
+ * Each example becomes one `it` that:
+ *   1. Runs the in-process MCP host loop to collect a trajectory.
+ *   2. Passes the trajectory to every evaluator.
+ *   3. Asserts that numeric scores meet `passingScore`.
+ *
+ * After all examples complete, a Markdown summary is written to stdout so
+ * the GitHub Actions job summary (>> $GITHUB_STEP_SUMMARY) can capture it.
+ */
+export function runDataset(
+  dataset: Dataset,
+  evaluators: Record<string, Evaluator>,
+  options: RunnerOptions = {}
+): void {
+  const { passingScore = 0.5 } = options;
+
+  describe.skipIf(!process.env.RUN_LLM_EVALS)(dataset.name, () => {
+    const results: EvalResult[] = [];
+
+    for (const example of dataset.examples) {
+      it(example.id, async () => {
+        const trajectory = await runMcpHostLoop(example.input);
+
+        const evalResults: Record<string, EvaluatorResult> = {};
+        for (const [name, evaluator] of Object.entries(evaluators)) {
+          evalResults[name] = await evaluator(trajectory, example.expected);
+        }
+
+        const result: EvalResult = {
+          exampleId: example.id,
+          input: example.input,
+          trajectory,
+          evaluators: evalResults,
+        };
+        results.push(result);
+
+        for (const [name, evalResult] of Object.entries(evalResults)) {
+          if (evalResult.score !== "N/A") {
+            expect(
+              evalResult.score,
+              `[${name}] score ${evalResult.score.toFixed(2)} < ${passingScore}` +
+                (evalResult.reason ? `: ${evalResult.reason}` : "")
+            ).toBeGreaterThanOrEqual(passingScore);
+          }
+        }
+      });
+    }
+
+    afterAll(() => {
+      process.stdout.write(buildMarkdownSummary(dataset.name, results) + "\n");
+    });
+  });
+}
+
+function buildMarkdownSummary(datasetName: string, results: EvalResult[]): string {
+  if (results.length === 0) {
+    return `## Eval results: ${datasetName}\n\n_No examples ran._\n`;
+  }
+
+  const evaluatorNames = Array.from(
+    new Set(results.flatMap((r) => Object.keys(r.evaluators)))
+  );
+
+  const headers = ["id", "input", ...evaluatorNames];
+  const separator = headers.map(() => "---");
+
+  const rows = results.map((r) => {
+    const scoreCells = evaluatorNames.map((name) => {
+      const e = r.evaluators[name];
+      if (!e) return "—";
+      if (e.score === "N/A") return "N/A";
+      return `${(e.score * 100).toFixed(0)}%`;
+    });
+    return [r.exampleId, truncate(r.input, 60), ...scoreCells];
+  });
+
+  const lines = [
+    `## Eval results: ${datasetName}`,
+    "",
+    `| ${headers.join(" | ")} |`,
+    `| ${separator.join(" | ")} |`,
+    ...rows.map((row) => `| ${row.join(" | ")} |`),
+    "",
+  ];
+
+  return lines.join("\n");
+}
+
+function truncate(s: string, maxLen: number): string {
+  return s.length <= maxLen ? s : `${s.slice(0, maxLen - 1)}…`;
+}
diff --git a/evals/vitest.config.ts b/evals/vitest.config.ts
new file mode 100644
index 0000000..7d2f4d5
--- /dev/null
+++ b/evals/vitest.config.ts
@@ -0,0 +1,24 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { defineConfig } from "vitest/config";
+
+/**
+ * Vitest config for LLM eval suites.  Intentionally separate from the main
+ * vitest.config.ts so `npm test` never picks up eval files (and thus never
+ * makes LLM calls or requires API keys in a regular dev/CI run).
+ *
+ * Run via: npm run test:evals
+ */
+export default defineConfig({
+  test: {
+    environment: "node",
+    globals: true,
+    include: ["evals/**/*.{test,spec,eval}.ts", "evals/datasets/**/*.dataset.ts"],
+    testTimeout: 120_000,
+  },
+});
diff --git a/package.json b/package.json
index 983e3ca..043ee51 100644
--- a/package.json
+++ b/package.json
@@ -47,6 +47,7 @@
     "test": "vitest",
     "test:run": "vitest run",
     "test:coverage": "vitest run --coverage",
+    "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run --config evals/vitest.config.ts",
     "prepublishOnly": "npm run build",
     "prepare": "husky",
     "version": "node -e \"const m=JSON.parse(require('fs').readFileSync('manifest.json','utf8'));m.version=require('./package.json').version;require('fs').writeFileSync('manifest.json',JSON.stringify(m,null,2)+'\\n')\" && git add manifest.json"

From 21b3030ea8ada62dc5efc067e617293c41ba5112 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:21:13 +0200
Subject: [PATCH 04/42] evals: implement runMcpHostLoop with InMemoryTransport
 and LLM provider types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

runMcpHostLoop wires an MCP Client to the server via InMemoryTransport
(in-process, no network), lists available tools, and drives a loop of up
to MAX_TURNS=8 turns:
  LLM → tool calls → client.callTool() → result fed back → repeat

Options allow callers to inject a pre-built McpServer (for mocked-service
datasets) or a custom LlmProvider (for deterministic tests). Both default
to the real implementations when omitted.

evals/llm/types.ts introduces the LlmProvider interface and LlmMessage
discriminated union (OpenAI-style, compatible with LiteLLM proxies).

evals/llm/index.ts exposes createDefaultLlmProvider(), which auto-selects
by env var (ANTHROPIC_API_KEY first, then OPENAI_API_KEY); the concrete
adapters (anthropic.ts / openai.ts) land in the next commit — this stub
surfaces a clear error until they do.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/llm/index.ts      |  38 ++++++++++++
 evals/llm/types.ts      |  54 +++++++++++++++++
 evals/runMcpHostLoop.ts | 130 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 215 insertions(+), 7 deletions(-)
 create mode 100644 evals/llm/index.ts
 create mode 100644 evals/llm/types.ts

diff --git a/evals/llm/index.ts b/evals/llm/index.ts
new file mode 100644
index 0000000..d3c254b
--- /dev/null
+++ b/evals/llm/index.ts
@@ -0,0 +1,38 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { LlmProvider } from "./types.js";
+
+/**
+ * Returns the default LLM provider by inspecting environment variables.
+ *
+ * Priority: ANTHROPIC_API_KEY → Anthropic (claude-haiku-4-5)
+ *           OPENAI_API_KEY    → OpenAI / LiteLLM proxy (gpt-4o-mini)
+ *
+ * The concrete adapters (evals/llm/anthropic.ts, evals/llm/openai.ts) are
+ * implemented in the next commit; this stub ensures runMcpHostLoop.ts
+ * type-checks now and surfaces a clear error at runtime when evals are run
+ * before the adapters land.
+ */
+export function createDefaultLlmProvider(): LlmProvider {
+  if (process.env.ANTHROPIC_API_KEY) {
+    throw new Error(
+      "Anthropic LLM adapter not yet implemented (evals/llm/anthropic.ts). " +
+        "It will land in the next commit."
+    );
+  }
+  if (process.env.OPENAI_API_KEY) {
+    throw new Error(
+      "OpenAI LLM adapter not yet implemented (evals/llm/openai.ts). " +
+        "It will land in the next commit."
+    );
+  }
+  throw new Error(
+    "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " +
+      "before running evals (npm run test:evals)."
+  );
+}
diff --git a/evals/llm/types.ts b/evals/llm/types.ts
new file mode 100644
index 0000000..b5fef9b
--- /dev/null
+++ b/evals/llm/types.ts
@@ -0,0 +1,54 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/** A single tool the LLM may call, described in JSON Schema. */
+export interface LlmToolDefinition {
+  name: string;
+  description: string;
+  /** JSON Schema object describing the tool's input parameters. */
+  parameters: Record<string, unknown>;
+}
+
+/** One tool invocation requested by the LLM in an assistant turn. */
+export interface LlmToolCallRequest {
+  id: string;
+  type: "function";
+  function: {
+    name: string;
+    /** JSON-encoded argument object. */
+    arguments: string;
+  };
+}
+
+/**
+ * Discriminated union covering every role that can appear in a chat thread.
+ * Shaped after the OpenAI chat messages API so a single interface works for
+ * both the OpenAI and Anthropic adapters (and any LiteLLM proxy in between).
+ */
+export type LlmMessage =
+  | { role: "user"; content: string }
+  | {
+      role: "assistant";
+      content: string | null;
+      tool_calls?: LlmToolCallRequest[];
+    }
+  | { role: "tool"; content: string; tool_call_id: string };
+
+/** Narrowed assistant message — what LlmProvider.chat() must return. */
+export type AssistantMessage = Extract<LlmMessage, { role: "assistant" }>;
+
+/**
+ * Minimal provider contract every LLM adapter must satisfy.
+ * The interface is intentionally thin: give it a message history + tool
+ * catalogue, get back the next assistant turn (possibly with tool calls).
+ */
+export interface LlmProvider {
+  chat(
+    messages: LlmMessage[],
+    tools: LlmToolDefinition[]
+  ): Promise<AssistantMessage>;
+}
diff --git a/evals/runMcpHostLoop.ts b/evals/runMcpHostLoop.ts
index be80a6d..d6a732c 100644
--- a/evals/runMcpHostLoop.ts
+++ b/evals/runMcpHostLoop.ts
@@ -5,15 +5,131 @@
  * 2.0.
  */
 
-import type { Trajectory } from "./types.js";
+import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js";
+import { Client } from "@modelcontextprotocol/sdk/client/index.js";
+import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { createServer } from "../src/server.js";
+import type { Trajectory, ToolCall } from "./types.js";
+import type { LlmProvider, LlmMessage } from "./llm/types.js";
+import { createDefaultLlmProvider } from "./llm/index.js";
+
+/** Maximum LLM → tool-call turns before halting to prevent runaway evals. */
+const MAX_TURNS = 8;
+
+export interface HostLoopOptions {
+  /**
+   * Pre-built MCP server to test against.
+   *
+   * Pass a server constructed with mocked services for dataset-level evals
+   * that don't need a live cluster. Omit to use `createServer()`, which reads
+   * CLUSTERS_JSON / CLUSTERS_FILE and requires a real Elastic cluster.
+   *
+   * Each call to `runMcpHostLoop` should receive a **fresh** server instance;
+   * reusing a connected server across calls is not supported.
+   */
+  server?: McpServer;
+  /**
+   * LLM provider used to simulate the MCP host making tool-call decisions.
+   * Defaults to auto-selecting from ANTHROPIC_API_KEY / OPENAI_API_KEY.
+   */
+  llm?: LlmProvider;
+  /**
+   * Maximum number of LLM→tool-call turns per run.
+   * Defaults to MAX_TURNS (8).
+   */
+  maxTurns?: number;
+}
 
 /**
- * Simulates one MCP host loop turn using the SDK's InMemoryTransport and
- * returns the ordered sequence of tool calls the LLM made.
+ * Simulates one MCP host loop run entirely in-process.
+ *
+ * Architecture:
+ *   LLM ↔ Client ↔─InMemoryTransport─↔ McpServer ↔ (ES / Kibana clients)
  *
- * Full implementation lands in the next commit; this stub satisfies the
- * import so runner.ts type-checks now.
+ * The function:
+ *   1. Wires a fresh Client to the server via InMemoryTransport.
+ *   2. Lists available MCP tools and hands them to the LLM as tool definitions.
+ *   3. Loops up to `maxTurns` times:
+ *        a. Asks the LLM for the next assistant turn.
+ *        b. If the LLM emits tool calls, executes each via client.callTool().
+ *        c. Records every call in the trajectory.
+ *        d. Feeds results back into the message history.
+ *        e. Breaks when the LLM emits no tool calls (task complete).
+ *   4. Closes the client and returns the trajectory.
  */
-export async function runMcpHostLoop(_input: string): Promise<Trajectory> {
-  throw new Error("runMcpHostLoop is not yet implemented");
+export async function runMcpHostLoop(
+  input: string,
+  { server, llm, maxTurns = MAX_TURNS }: HostLoopOptions = {}
+): Promise<Trajectory> {
+  const resolvedServer = server ?? createServer();
+  const resolvedLlm = llm ?? createDefaultLlmProvider();
+
+  const [clientTransport, serverTransport] = InMemoryTransport.createLinkedPair();
+  await resolvedServer.connect(serverTransport);
+
+  const client = new Client({ name: "eval-host", version: "1.0.0" });
+  await client.connect(clientTransport);
+
+  try {
+    const { tools: mcpTools } = await client.listTools();
+    const toolDefs = mcpTools.map((t) => ({
+      name: t.name,
+      description: t.description ?? "",
+      parameters: t.inputSchema as Record<string, unknown>,
+    }));
+
+    const messages: LlmMessage[] = [{ role: "user", content: input }];
+    const trajectory: Trajectory = [];
+
+    for (let turn = 0; turn < maxTurns; turn++) {
+      const response = await resolvedLlm.chat(messages, toolDefs);
+      messages.push(response);
+
+      if (!response.tool_calls || response.tool_calls.length === 0) {
+        // LLM chose not to call a tool — simulation complete.
+        break;
+      }
+
+      for (const toolCall of response.tool_calls) {
+        const toolName = toolCall.function.name;
+        let toolArgs: Record<string, unknown>;
+        try {
+          toolArgs = JSON.parse(toolCall.function.arguments) as Record<
+            string,
+            unknown
+          >;
+        } catch {
+          // Malformed JSON from the LLM; record the call with empty args
+          // so the trajectory evaluator can detect the failure.
+          toolArgs = {};
+        }
+
+        const result = await client.callTool({
+          name: toolName,
+          arguments: toolArgs,
+        });
+
+        const record: ToolCall = {
+          tool: toolName,
+          args: toolArgs,
+          result: result.content,
+        };
+        trajectory.push(record);
+
+        // Feed the tool result back so the LLM can reason about it.
+        messages.push({
+          role: "tool",
+          content: JSON.stringify(result.content),
+          tool_call_id: toolCall.id,
+        });
+      }
+    }
+
+    return trajectory;
+  } finally {
+    // Closing the client also closes clientTransport, which triggers
+    // serverTransport.onclose() — the InMemoryTransport linked pair
+    // tears down cleanly without needing an explicit server.close().
+    await client.close();
+  }
 }

From 066f7cfdc27c1d8c3669ab772b44ffbde195bcd9 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:28:17 +0200
Subject: [PATCH 05/42] evals: add OpenAiProvider with LiteLLM proxy support
 and wire default provider
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OpenAiProvider (evals/llm/openai.ts):
- Implements LlmProvider.chat() via the openai SDK (gpt-4o-mini default)
- Accepts baseURL to point at a LiteLLM proxy for any compatible provider
- Maps LlmMessage ↔ ChatCompletionMessageParam in both directions; narrows
  ChatCompletionMessageToolCall to FunctionToolCall before accessing .function
- Strips tools argument when the list is empty (avoids API errors)

evals/llm/index.ts:
- createDefaultLlmProvider() now returns a real OpenAiProvider when
  OPENAI_API_KEY is set; picks up LITELLM_BASE_URL automatically
- Preserves the ANTHROPIC_API_KEY branch with a clear "coming soon" error
  until evals/llm/anthropic.ts lands

Adds openai@^6.37.0 as a devDependency (npm install --save-dev openai).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/llm/index.ts  |  25 +++++----
 evals/llm/openai.ts | 128 ++++++++++++++++++++++++++++++++++++++++++++
 package-lock.json   |  23 ++++++++
 package.json        |   1 +
 4 files changed, 164 insertions(+), 13 deletions(-)
 create mode 100644 evals/llm/openai.ts

diff --git a/evals/llm/index.ts b/evals/llm/index.ts
index d3c254b..d58aeda 100644
--- a/evals/llm/index.ts
+++ b/evals/llm/index.ts
@@ -6,33 +6,32 @@
  */
 
 import type { LlmProvider } from "./types.js";
+import { OpenAiProvider } from "./openai.js";
 
 /**
  * Returns the default LLM provider by inspecting environment variables.
  *
- * Priority: ANTHROPIC_API_KEY → Anthropic (claude-haiku-4-5)
- *           OPENAI_API_KEY    → OpenAI / LiteLLM proxy (gpt-4o-mini)
+ * Priority order:
+ *   1. ANTHROPIC_API_KEY → Anthropic adapter (claude-haiku-4-5) — coming soon
+ *   2. OPENAI_API_KEY    → OpenAI / LiteLLM proxy (gpt-4o-mini)
  *
- * The concrete adapters (evals/llm/anthropic.ts, evals/llm/openai.ts) are
- * implemented in the next commit; this stub ensures runMcpHostLoop.ts
- * type-checks now and surfaces a clear error at runtime when evals are run
- * before the adapters land.
+ * Set LITELLM_BASE_URL alongside OPENAI_API_KEY to route through a LiteLLM
+ * proxy, e.g. to use Claude via the OpenAI-compatible endpoint.
  */
 export function createDefaultLlmProvider(): LlmProvider {
   if (process.env.ANTHROPIC_API_KEY) {
     throw new Error(
       "Anthropic LLM adapter not yet implemented (evals/llm/anthropic.ts). " +
-        "It will land in the next commit."
+        "Use OPENAI_API_KEY instead, or wait for the Anthropic adapter."
     );
   }
   if (process.env.OPENAI_API_KEY) {
-    throw new Error(
-      "OpenAI LLM adapter not yet implemented (evals/llm/openai.ts). " +
-        "It will land in the next commit."
-    );
+    return new OpenAiProvider({
+      baseURL: process.env.LITELLM_BASE_URL,
+    });
   }
   throw new Error(
-    "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " +
-      "before running evals (npm run test:evals)."
+    "No LLM provider configured. Set OPENAI_API_KEY (or ANTHROPIC_API_KEY " +
+      "once the Anthropic adapter lands) before running evals (npm run test:evals)."
   );
 }
diff --git a/evals/llm/openai.ts b/evals/llm/openai.ts
new file mode 100644
index 0000000..a6dd59b
--- /dev/null
+++ b/evals/llm/openai.ts
@@ -0,0 +1,128 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import OpenAI from "openai";
+import type {
+  AssistantMessage,
+  LlmMessage,
+  LlmProvider,
+  LlmToolDefinition,
+} from "./types.js";
+
+const DEFAULT_MODEL = "gpt-4o-mini";
+
+export interface OpenAiProviderOptions {
+  /** Chat model to use. Defaults to gpt-4o-mini. */
+  model?: string;
+  /**
+   * Override the API base URL. Point this at a LiteLLM proxy to route calls
+   * through any provider the proxy supports without changing client code.
+   */
+  baseURL?: string;
+  /**
+   * API key. Defaults to the OPENAI_API_KEY environment variable, which is
+   * the standard OpenAI SDK default.
+   */
+  apiKey?: string;
+}
+
+export class OpenAiProvider implements LlmProvider {
+  private readonly client: OpenAI;
+  private readonly model: string;
+
+  constructor({
+    model = DEFAULT_MODEL,
+    baseURL,
+    apiKey,
+  }: OpenAiProviderOptions = {}) {
+    this.model = model;
+    this.client = new OpenAI({
+      ...(apiKey !== undefined ? { apiKey } : {}),
+      ...(baseURL !== undefined ? { baseURL } : {}),
+    });
+  }
+
+  async chat(
+    messages: LlmMessage[],
+    tools: LlmToolDefinition[]
+  ): Promise<AssistantMessage> {
+    const response = await this.client.chat.completions.create({
+      model: this.model,
+      messages: messages.map(toOaiMessage),
+      ...(tools.length > 0 ? { tools: tools.map(toOaiTool) } : {}),
+    });
+
+    const choice = response.choices[0];
+    if (!choice) {
+      throw new Error("OpenAI returned no choices");
+    }
+
+    const msg = choice.message;
+    return {
+      role: "assistant",
+      content: msg.content ?? null,
+      ...(msg.tool_calls
+        ? {
+            tool_calls: msg.tool_calls
+              .filter(
+                (tc): tc is OpenAI.ChatCompletionMessageFunctionToolCall =>
+                  tc.type === "function"
+              )
+              .map((tc) => ({
+                id: tc.id,
+                type: "function" as const,
+                function: {
+                  name: tc.function.name,
+                  arguments: tc.function.arguments,
+                },
+              })),
+          }
+        : {}),
+    };
+  }
+}
+
+function toOaiMessage(msg: LlmMessage): OpenAI.ChatCompletionMessageParam {
+  switch (msg.role) {
+    case "user":
+      return { role: "user", content: msg.content };
+    case "assistant":
+      return {
+        role: "assistant",
+        content: msg.content,
+        ...(msg.tool_calls
+          ? {
+              tool_calls: msg.tool_calls.map((tc) => ({
+                id: tc.id,
+                type: "function" as const,
+                function: {
+                  name: tc.function.name,
+                  arguments: tc.function.arguments,
+                },
+              })),
+            }
+          : {}),
+      };
+    case "tool":
+      return {
+        role: "tool",
+        content: msg.content,
+        tool_call_id: msg.tool_call_id,
+      };
+  }
+}
+
+function toOaiTool(tool: LlmToolDefinition): OpenAI.ChatCompletionTool {
+  return {
+    type: "function",
+    function: {
+      name: tool.name,
+      description: tool.description,
+      parameters: tool.parameters,
+    },
+  };
+}
diff --git a/package-lock.json b/package-lock.json
index d34696e..2207e24 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -54,6 +54,7 @@
         "husky": "^9.1.7",
         "jsdom": "^29.1.1",
         "lint-staged": "^16.4.0",
+        "openai": "^6.37.0",
         "tailwindcss": "^4.2.2",
         "tsx": "^4.21.0",
         "typescript": "^6.0.2",
@@ -5860,6 +5861,28 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/openai": {
+      "version": "6.37.0",
+      "resolved": "https://registry.npmjs.org/openai/-/openai-6.37.0.tgz",
+      "integrity": "sha512-0H5dEGFmmLv6KSd0W1w2nyL8WsLkX6yoLeQpU+dZAOuGcany5qkYQMmj35ZrKgb6yiyYqpUzFOpR8mZQkgqeEQ==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "bin": {
+        "openai": "bin/cli"
+      },
+      "peerDependencies": {
+        "ws": "^8.18.0",
+        "zod": "^3.25 || ^4.0"
+      },
+      "peerDependenciesMeta": {
+        "ws": {
+          "optional": true
+        },
+        "zod": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/optionator": {
       "version": "0.9.4",
       "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
diff --git a/package.json b/package.json
index 043ee51..e328e65 100644
--- a/package.json
+++ b/package.json
@@ -103,6 +103,7 @@
     "husky": "^9.1.7",
     "jsdom": "^29.1.1",
     "lint-staged": "^16.4.0",
+    "openai": "^6.37.0",
     "tailwindcss": "^4.2.2",
     "tsx": "^4.21.0",
     "typescript": "^6.0.2",

From 9f47372445c80abf2295e64ac332f13c3453e30e Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:33:59 +0200
Subject: [PATCH 06/42] evals: add AnthropicProvider and wire it as the default
 when ANTHROPIC_API_KEY is set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AnthropicProvider (evals/llm/anthropic.ts):
- Implements LlmProvider.chat() via @anthropic-ai/sdk (claude-haiku-4-5-20251001)
- toAnthropicMessages() handles the structural gap between OpenAI-style messages
  and Anthropic's API: no `tool` role exists; tool results go as `user` messages
  with `tool_result` content blocks; consecutive tool results are merged into a
  single user turn to avoid adjacent-user-turn API errors
- Tool input is round-tripped JSON.parse (from LlmToolCallRequest.arguments) →
  object for the request, then JSON.stringify back for the response to maintain
  the OpenAI-compatible LlmToolCallRequest shape
- input_schema is cast from LlmToolDefinition.parameters (already JSON Schema)

evals/llm/index.ts:
- createDefaultLlmProvider() now returns AnthropicProvider when ANTHROPIC_API_KEY
  is set (priority 1), falls back to OpenAiProvider for OPENAI_API_KEY (priority 2)

Adds @anthropic-ai/sdk@^0.96.0 as a devDependency.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/llm/anthropic.ts | 150 +++++++++++++++++++++++++++++++++++++++++
 evals/llm/index.ts     |  16 ++---
 package-lock.json      |  69 +++++++++++++++++++
 package.json           |   1 +
 4 files changed, 227 insertions(+), 9 deletions(-)
 create mode 100644 evals/llm/anthropic.ts

diff --git a/evals/llm/anthropic.ts b/evals/llm/anthropic.ts
new file mode 100644
index 0000000..70e9adc
--- /dev/null
+++ b/evals/llm/anthropic.ts
@@ -0,0 +1,150 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import Anthropic from "@anthropic-ai/sdk";
+import type {
+  AssistantMessage,
+  LlmMessage,
+  LlmProvider,
+  LlmToolDefinition,
+} from "./types.js";
+
+const DEFAULT_MODEL = "claude-haiku-4-5-20251001";
+
+/** Max tokens to request from the Anthropic API per turn. */
+const MAX_TOKENS = 4096;
+
+export interface AnthropicProviderOptions {
+  /** Chat model to use. Defaults to claude-haiku-4-5-20251001. */
+  model?: string;
+  /**
+   * API key. Defaults to the ANTHROPIC_API_KEY environment variable, which is
+   * the standard Anthropic SDK default.
+   */
+  apiKey?: string;
+}
+
+export class AnthropicProvider implements LlmProvider {
+  private readonly client: Anthropic;
+  private readonly model: string;
+
+  constructor({
+    model = DEFAULT_MODEL,
+    apiKey,
+  }: AnthropicProviderOptions = {}) {
+    this.model = model;
+    this.client = new Anthropic({
+      ...(apiKey !== undefined ? { apiKey } : {}),
+    });
+  }
+
+  async chat(
+    messages: LlmMessage[],
+    tools: LlmToolDefinition[]
+  ): Promise<AssistantMessage> {
+    const response = await this.client.messages.create({
+      model: this.model,
+      max_tokens: MAX_TOKENS,
+      messages: toAnthropicMessages(messages),
+      ...(tools.length > 0 ? { tools: tools.map(toAnthropicTool) } : {}),
+    });
+
+    const textBlocks = response.content.filter(
+      (c): c is Anthropic.TextBlock => c.type === "text"
+    );
+    const toolUseBlocks = response.content.filter(
+      (c): c is Anthropic.ToolUseBlock => c.type === "tool_use"
+    );
+
+    return {
+      role: "assistant",
+      content: textBlocks.map((b) => b.text).join("") || null,
+      ...(toolUseBlocks.length > 0
+        ? {
+            tool_calls: toolUseBlocks.map((tu) => ({
+              id: tu.id,
+              type: "function" as const,
+              function: {
+                name: tu.name,
+                // Anthropic returns a parsed object; re-encode to match the
+                // OpenAI-style LlmToolCallRequest.function.arguments shape.
+                arguments: JSON.stringify(tu.input),
+              },
+            })),
+          }
+        : {}),
+    };
+  }
+}
+
+/**
+ * Converts OpenAI-style LlmMessage[] to Anthropic MessageParam[].
+ *
+ * Structural differences from OpenAI:
+ *   - Anthropic has no `tool` role. Tool results go as `user` messages with
+ *     `tool_result` content blocks.
+ *   - Consecutive tool-result messages are merged into a single user message
+ *     so the API never receives two adjacent user turns.
+ *   - Assistant content is an array of TextBlockParam / ToolUseBlockParam.
+ */
+function toAnthropicMessages(
+  messages: LlmMessage[]
+): Anthropic.MessageParam[] {
+  const result: Anthropic.MessageParam[] = [];
+
+  for (const msg of messages) {
+    if (msg.role === "user") {
+      result.push({ role: "user", content: msg.content });
+    } else if (msg.role === "assistant") {
+      const content: Anthropic.ContentBlockParam[] = [];
+      if (msg.content) {
+        content.push({ type: "text", text: msg.content });
+      }
+      for (const tc of msg.tool_calls ?? []) {
+        let input: unknown;
+        try {
+          input = JSON.parse(tc.function.arguments);
+        } catch {
+          input = {};
+        }
+        content.push({ type: "tool_use", id: tc.id, name: tc.function.name, input });
+      }
+      result.push({ role: "assistant", content });
+    } else {
+      // msg.role === "tool"
+      const block: Anthropic.ToolResultBlockParam = {
+        type: "tool_result",
+        tool_use_id: msg.tool_call_id,
+        content: msg.content,
+      };
+
+      // Merge into the preceding user message when it already holds
+      // tool_result blocks — the Anthropic API rejects two adjacent user turns.
+      const prev = result[result.length - 1];
+      if (
+        prev?.role === "user" &&
+        Array.isArray(prev.content) &&
+        (prev.content as Anthropic.ContentBlockParam[])[0]?.type ===
+          "tool_result"
+      ) {
+        (prev.content as Anthropic.ContentBlockParam[]).push(block);
+      } else {
+        result.push({ role: "user", content: [block] });
+      }
+    }
+  }
+
+  return result;
+}
+
+function toAnthropicTool(tool: LlmToolDefinition): Anthropic.Tool {
+  return {
+    name: tool.name,
+    description: tool.description,
+    input_schema: tool.parameters as Anthropic.Tool.InputSchema,
+  };
+}
diff --git a/evals/llm/index.ts b/evals/llm/index.ts
index d58aeda..5698ff5 100644
--- a/evals/llm/index.ts
+++ b/evals/llm/index.ts
@@ -6,24 +6,22 @@
  */
 
 import type { LlmProvider } from "./types.js";
+import { AnthropicProvider } from "./anthropic.js";
 import { OpenAiProvider } from "./openai.js";
 
 /**
  * Returns the default LLM provider by inspecting environment variables.
  *
  * Priority order:
- *   1. ANTHROPIC_API_KEY → Anthropic adapter (claude-haiku-4-5) — coming soon
- *   2. OPENAI_API_KEY    → OpenAI / LiteLLM proxy (gpt-4o-mini)
+ *   1. ANTHROPIC_API_KEY → AnthropicProvider (claude-haiku-4-5-20251001)
+ *   2. OPENAI_API_KEY    → OpenAiProvider / LiteLLM proxy (gpt-4o-mini)
  *
  * Set LITELLM_BASE_URL alongside OPENAI_API_KEY to route through a LiteLLM
- * proxy, e.g. to use Claude via the OpenAI-compatible endpoint.
+ * proxy, e.g. to reach Claude via the OpenAI-compatible endpoint.
  */
 export function createDefaultLlmProvider(): LlmProvider {
   if (process.env.ANTHROPIC_API_KEY) {
-    throw new Error(
-      "Anthropic LLM adapter not yet implemented (evals/llm/anthropic.ts). " +
-        "Use OPENAI_API_KEY instead, or wait for the Anthropic adapter."
-    );
+    return new AnthropicProvider();
   }
   if (process.env.OPENAI_API_KEY) {
     return new OpenAiProvider({
@@ -31,7 +29,7 @@ export function createDefaultLlmProvider(): LlmProvider {
     });
   }
   throw new Error(
-    "No LLM provider configured. Set OPENAI_API_KEY (or ANTHROPIC_API_KEY " +
-      "once the Anthropic adapter lands) before running evals (npm run test:evals)."
+    "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " +
+      "before running evals (npm run test:evals)."
   );
 }
diff --git a/package-lock.json b/package-lock.json
index 2207e24..156fc31 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -33,6 +33,7 @@
         "elastic-security-mcp-app": "dist/main.js"
       },
       "devDependencies": {
+        "@anthropic-ai/sdk": "^0.96.0",
         "@tailwindcss/vite": "^4.2.2",
         "@testing-library/jest-dom": "^6.9.1",
         "@testing-library/react": "^16.3.2",
@@ -74,6 +75,28 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/@anthropic-ai/sdk": {
+      "version": "0.96.0",
+      "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.96.0.tgz",
+      "integrity": "sha512-KlCsODtTyb17bLUVCSDC2HtSvAbJf60sEiPEax9dInF+aDF92vS4TZJ5XD7YCQXNb1/5icYaw8Y7wMjPlIV9Zg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "json-schema-to-ts": "^3.1.1",
+        "standardwebhooks": "^1.0.0"
+      },
+      "bin": {
+        "anthropic-ai-sdk": "bin/cli"
+      },
+      "peerDependencies": {
+        "zod": "^3.25.0 || ^4.0.0"
+      },
+      "peerDependenciesMeta": {
+        "zod": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/@asamuzakjp/css-color": {
       "version": "5.1.11",
       "resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-5.1.11.tgz",
@@ -1860,6 +1883,13 @@
       ],
       "peer": true
     },
+    "node_modules/@stablelib/base64": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/@stablelib/base64/-/base64-1.0.1.tgz",
+      "integrity": "sha512-1bnPQqSxSuc3Ii6MhBysoWCg58j97aUjuCSZrGSmDxNqtytIi0k8utUenAwTZN4V5mXXYGsVUI9zeBqy+jBOSQ==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/@standard-schema/spec": {
       "version": "1.1.0",
       "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz",
@@ -4240,6 +4270,13 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/fast-sha256": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/fast-sha256/-/fast-sha256-1.3.0.tgz",
+      "integrity": "sha512-n11RGP/lrWEFI/bWdygLxhI+pVeo1ZYIVwvvPkW7azl/rOy+F3HYRZ2K5zeE9mmkhQppyv9sQFx0JM9UabnpPQ==",
+      "dev": true,
+      "license": "Unlicense"
+    },
     "node_modules/fast-uri": {
       "version": "3.1.0",
       "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
@@ -4945,6 +4982,20 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/json-schema-to-ts": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/json-schema-to-ts/-/json-schema-to-ts-3.1.1.tgz",
+      "integrity": "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/runtime": "^7.18.3",
+        "ts-algebra": "^2.0.0"
+      },
+      "engines": {
+        "node": ">=16"
+      }
+    },
     "node_modules/json-schema-traverse": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
@@ -6647,6 +6698,17 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/standardwebhooks": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/standardwebhooks/-/standardwebhooks-1.0.0.tgz",
+      "integrity": "sha512-BbHGOQK9olHPMvQNHWul6MYlrRTAOKn03rOe4A8O3CLWhNf4YHBqq2HJKKC+sfqpxiBY52pNeesD6jIiLDz8jg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@stablelib/base64": "^1.0.0",
+        "fast-sha256": "^1.3.0"
+      }
+    },
     "node_modules/state-local": {
       "version": "1.0.7",
       "resolved": "https://registry.npmjs.org/state-local/-/state-local-1.0.7.tgz",
@@ -6886,6 +6948,13 @@
         "tree-kill": "cli.js"
       }
     },
+    "node_modules/ts-algebra": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz",
+      "integrity": "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/ts-api-utils": {
       "version": "2.5.0",
       "resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.5.0.tgz",
diff --git a/package.json b/package.json
index e328e65..83ef515 100644
--- a/package.json
+++ b/package.json
@@ -82,6 +82,7 @@
     "react-dom": "^19.2.4"
   },
   "devDependencies": {
+    "@anthropic-ai/sdk": "^0.96.0",
     "@tailwindcss/vite": "^4.2.2",
     "@testing-library/jest-dom": "^6.9.1",
     "@testing-library/react": "^16.3.2",

From ab6ac677b250128f3bef38046668cafcf3e36c8b Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:36:50 +0200
Subject: [PATCH 07/42] evals: add --reporter=verbose to test:evals script

Makes per-example test names visible in CI output and in the GitHub Actions
job summary, which is where the Markdown eval table lands.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index 83ef515..2308b39 100644
--- a/package.json
+++ b/package.json
@@ -47,7 +47,7 @@
     "test": "vitest",
     "test:run": "vitest run",
     "test:coverage": "vitest run --coverage",
-    "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run --config evals/vitest.config.ts",
+    "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run --config evals/vitest.config.ts --reporter=verbose",
     "prepublishOnly": "npm run build",
     "prepare": "husky",
     "version": "node -e \"const m=JSON.parse(require('fs').readFileSync('manifest.json','utf8'));m.version=require('./package.json').version;require('fs').writeFileSync('manifest.json',JSON.stringify(m,null,2)+'\\n')\" && git add manifest.json"

From 9c7c1ddab13cf4b99a52f7b927fec43ec95e4593 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:37:44 +0200
Subject: [PATCH 08/42] evals: add skill-activation evaluator (binary score)

Returns 1 if the trajectory contains at least one call to the skill's
entry-point tool (expected.skill), 0 if not, or 'N/A' when expected.skill
is absent so datasets that don't test skill routing can omit the field.

The failure reason includes the full tool-name list from the trajectory to
make CI output actionable without re-running the eval.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/evaluators/skill-activation.ts | 37 ++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 evals/evaluators/skill-activation.ts

diff --git a/evals/evaluators/skill-activation.ts b/evals/evaluators/skill-activation.ts
new file mode 100644
index 0000000..b7deb8d
--- /dev/null
+++ b/evals/evaluators/skill-activation.ts
@@ -0,0 +1,37 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+
+/**
+ * Binary evaluator: did the LLM call the skill's entry-point tool?
+ *
+ * Each MCP skill has a single model-facing entry-point tool (e.g. `migrate-rules`
+ * for the automatic-migration skill, `manage-rules` for detection-rule-management).
+ * `expected.skill` holds that tool name. The evaluator checks whether the
+ * trajectory contains at least one call to that tool.
+ *
+ * Returns `'N/A'` when `expected.skill` is absent so datasets that don't
+ * care about skill routing can omit the field without failing the run.
+ */
+export const skillActivation: Evaluator = (
+  trajectory: Trajectory,
+  expected: ExpectedBehavior
+): EvaluatorResult => {
+  if (!expected.skill) {
+    return { score: "N/A" };
+  }
+
+  const activated = trajectory.some((tc) => tc.tool === expected.skill);
+
+  return {
+    score: activated ? 1 : 0,
+    reason: activated
+      ? `Tool "${expected.skill}" was called`
+      : `Tool "${expected.skill}" was never called (trajectory: [${trajectory.map((t) => t.tool).join(", ") || "empty"}])`,
+  };
+};

From 7849ed50d4158205f861e068adf4451620f3cb68 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:39:33 +0200
Subject: [PATCH 09/42] evals: add negative-activation evaluator for distractor
 examples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Binary complement of skill-activation: returns 1 when the skill's
entry-point tool (expected.skill) is absent from the trajectory (correct —
LLM was not falsely triggered), 0 when the tool appears (false positive).

Returns 'N/A' when expected.skill is absent, matching the skill-activation
convention so both evaluators behave consistently on examples that don't
declare a skill.

CI gate intent: datasets should require 100% on this evaluator for distractor
examples — any false positive means the skill's SKILL.md is over-triggering
on unrelated queries in production.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/evaluators/negative-activation.ts | 46 +++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 evals/evaluators/negative-activation.ts

diff --git a/evals/evaluators/negative-activation.ts b/evals/evaluators/negative-activation.ts
new file mode 100644
index 0000000..e08d315
--- /dev/null
+++ b/evals/evaluators/negative-activation.ts
@@ -0,0 +1,46 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+
+/**
+ * Binary evaluator for distractor examples: did the LLM correctly avoid
+ * calling the skill's entry-point tool?
+ *
+ * This is the complement of `skillActivation`. Use it on examples where the
+ * user query should NOT trigger the skill — e.g. a migration skill dataset
+ * includes unrelated queries (case management, threat hunting) to confirm the
+ * LLM doesn't call `migrate-rules` for everything.
+ *
+ * Score semantics (binary):
+ *   1 — skill tool absent from trajectory (correct — not distracted)
+ *   0 — skill tool present in trajectory (false positive — skill over-triggered)
+ *
+ * Returns `'N/A'` when `expected.skill` is absent, consistent with how
+ * `skillActivation` handles missing skill declarations.
+ *
+ * CI gate: datasets should require 100% on this evaluator for distractor
+ * examples — a false positive means the skill's SKILL.md is too aggressive
+ * and will fire on unrelated queries in production.
+ */
+export const negativeActivation: Evaluator = (
+  trajectory: Trajectory,
+  expected: ExpectedBehavior
+): EvaluatorResult => {
+  if (!expected.skill) {
+    return { score: "N/A" };
+  }
+
+  const falsePositive = trajectory.some((tc) => tc.tool === expected.skill);
+
+  return {
+    score: falsePositive ? 0 : 1,
+    reason: falsePositive
+      ? `Tool "${expected.skill}" was called but should not have been (false positive)`
+      : `Tool "${expected.skill}" was correctly absent from the trajectory`,
+  };
+};

From ed6ce7de2aa6877994e22558440fab8ab5d170b7 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:42:05 +0200
Subject: [PATCH 10/42] evals: add tool-selection evaluator (precision/recall
 F1 against expected.tools)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Computes set-based precision, recall, and F1 against expected.tools.
Deduplicates both the trajectory and the expected list — order/repetition
is the trajectory evaluator's job.

Score = F1 ∈ [0, 1]. Returns 'N/A' when expected.tools is absent so
datasets that only test skill routing don't need to declare tool lists.

The reason string includes missed and extra tool names to make CI failures
immediately actionable without re-running the eval.

CI gate intent: ≥0.8 (80%) on positive examples.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/evaluators/tool-selection.ts | 60 ++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 evals/evaluators/tool-selection.ts

diff --git a/evals/evaluators/tool-selection.ts b/evals/evaluators/tool-selection.ts
new file mode 100644
index 0000000..71cf7b1
--- /dev/null
+++ b/evals/evaluators/tool-selection.ts
@@ -0,0 +1,60 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+
+/**
+ * Set-based tool-selection evaluator: how well did the LLM pick the right tools?
+ *
+ * Computes precision, recall, and their harmonic mean (F1) against the
+ * set of tool names in `expected.tools`. Deduplicates both sides — order
+ * and repetition are tested by the trajectory evaluator instead.
+ *
+ *   precision = |called ∩ expected| / |called|   (no spurious calls)
+ *   recall    = |called ∩ expected| / |expected|  (no missed calls)
+ *   score     = F1 = 2·P·R / (P+R)               ∈ [0, 1]
+ *
+ * Returns `'N/A'` when `expected.tools` is absent so datasets that only
+ * care about skill routing don't need to declare tool lists.
+ *
+ * CI gate: datasets should require ≥0.8 (80%) on positive examples.
+ * The failure reason lists missed and extra tools to make debugging fast.
+ */
+export const toolSelection: Evaluator = (
+  trajectory: Trajectory,
+  expected: ExpectedBehavior
+): EvaluatorResult => {
+  if (!expected.tools) {
+    return { score: "N/A" };
+  }
+
+  const expectedSet = new Set(expected.tools);
+  const calledSet = new Set(trajectory.map((tc) => tc.tool));
+
+  if (expectedSet.size === 0 && calledSet.size === 0) {
+    return { score: 1, reason: "No tools expected and none called" };
+  }
+
+  const tp = [...calledSet].filter((t) => expectedSet.has(t)).length;
+  const precision = calledSet.size > 0 ? tp / calledSet.size : 0;
+  const recall = expectedSet.size > 0 ? tp / expectedSet.size : 0;
+  const f1 =
+    precision + recall > 0
+      ? (2 * precision * recall) / (precision + recall)
+      : 0;
+
+  const missed = [...expectedSet].filter((t) => !calledSet.has(t));
+  const extra = [...calledSet].filter((t) => !expectedSet.has(t));
+
+  const parts = [
+    `F1=${f1.toFixed(2)} (precision=${precision.toFixed(2)}, recall=${recall.toFixed(2)})`,
+    ...(missed.length > 0 ? [`missed: [${missed.join(", ")}]`] : []),
+    ...(extra.length > 0 ? [`extra: [${extra.join(", ")}]`] : []),
+  ];
+
+  return { score: f1, reason: parts.join(" | ") };
+};

From 304df8df49bbe587c36176c61c3b963018a4ccde Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:44:10 +0200
Subject: [PATCH 11/42] evals: add trajectory evaluator (LCS-based sequence
 score)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Computes score = lcs(actual, expected) / max(|actual|, |expected|).

Dividing by the max penalises both missing tools (recall gap) and extra
spurious tools (precision gap) in a single metric. Sequence matters here,
unlike tool-selection which is set-based.

Returns 'N/A' when expected.tools is absent — this guard prevents the
evaluator from emitting meaningless 0-scores on examples that declare no
ordered expectation, which would mask real regressions elsewhere.

LCS is O(m·n) time via a flat DP array to avoid nested-array allocation.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/evaluators/trajectory.ts | 79 ++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 evals/evaluators/trajectory.ts

diff --git a/evals/evaluators/trajectory.ts b/evals/evaluators/trajectory.ts
new file mode 100644
index 0000000..4e71ec8
--- /dev/null
+++ b/evals/evaluators/trajectory.ts
@@ -0,0 +1,79 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+
+/**
+ * Sequence-aware evaluator: how closely did the LLM follow the expected tool order?
+ *
+ * Computes the Longest Common Subsequence (LCS) of the actual tool-call
+ * sequence against `expected.tools`, then normalises by the longer of the
+ * two sequences:
+ *
+ *   score = lcs(actual, expected) / max(|actual|, |expected|) ∈ [0, 1]
+ *
+ * Dividing by the max penalises both missing tools (low recall) and extra
+ * spurious tools (low precision) without needing separate P/R components —
+ * those are tool-selection's job.
+ *
+ * Returns `'N/A'` when `expected.tools` is absent so datasets that don't
+ * specify an ordered tool sequence don't fail on this evaluator. This guard
+ * is load-bearing: running LCS against an undefined expectation would produce
+ * meaningless 0-scores that mask real regressions in other evaluators.
+ */
+export const trajectoryScore: Evaluator = (
+  trajectory: Trajectory,
+  expected: ExpectedBehavior
+): EvaluatorResult => {
+  if (!expected.tools) {
+    return { score: "N/A" };
+  }
+
+  const actual = trajectory.map((tc) => tc.tool);
+  const exp = expected.tools;
+
+  if (actual.length === 0 && exp.length === 0) {
+    return { score: 1, reason: "Both actual and expected sequences are empty" };
+  }
+
+  const lcsLen = lcs(actual, exp);
+  const denom = Math.max(actual.length, exp.length);
+  const score = lcsLen / denom;
+
+  return {
+    score,
+    reason:
+      `LCS=${lcsLen} / max(|actual|=${actual.length}, |expected|=${exp.length})` +
+      `=${denom} → score=${score.toFixed(2)}` +
+      (score < 1
+        ? ` | actual=[${actual.join(", ")}] expected=[${exp.join(", ")}]`
+        : ""),
+  };
+};
+
+/**
+ * Classic O(m·n) DP implementation of Longest Common Subsequence length.
+ * Compares elements by identity (===), which is correct for tool name strings.
+ */
+function lcs(a: string[], b: string[]): number {
+  const m = a.length;
+  const n = b.length;
+  // Single flat array instead of Array<Array<number>> avoids inner allocation
+  const dp = new Array<number>((m + 1) * (n + 1)).fill(0);
+  const idx = (i: number, j: number) => i * (n + 1) + j;
+
+  for (let i = 1; i <= m; i++) {
+    for (let j = 1; j <= n; j++) {
+      dp[idx(i, j)] =
+        a[i - 1] === b[j - 1]
+          ? dp[idx(i - 1, j - 1)] + 1
+          : Math.max(dp[idx(i - 1, j)], dp[idx(i, j - 1)]);
+    }
+  }
+
+  return dp[idx(m, n)];
+}

From b838b009740b16bb015dd605d8d1e7a81386642d Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:47:42 +0200
Subject: [PATCH 12/42] evals: add criteria (LLM-as-judge) evaluator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

createCriteriaEvaluator(llm) returns an Evaluator that sends the trajectory
and expected.criteria to a judge LLM with a structured rubric prompt asking
for JSON {score, reasoning}. Returns 'N/A' when expected.criteria is absent.

The factory pattern closes over the LLM provider so datasets can inject
different judges (e.g. a stronger model for criteria, haiku for routing).

Parsing: primary path extracts the first JSON object from the response and
clamps score to [0, 1]. Falls back to a bare-number regex for models that
ignore the JSON instruction, and finally returns score=0 with the raw text
if neither succeeds.

The judge prompt serialises only {tool, args} per call — omitting result
avoids token bloat from large tool outputs while still giving the judge
enough signal to evaluate routing decisions.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/evaluators/criteria.ts | 142 +++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 evals/evaluators/criteria.ts

diff --git a/evals/evaluators/criteria.ts b/evals/evaluators/criteria.ts
new file mode 100644
index 0000000..1994eac
--- /dev/null
+++ b/evals/evaluators/criteria.ts
@@ -0,0 +1,142 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js";
+import type { LlmProvider } from "../llm/types.js";
+
+/**
+ * LLM-as-judge evaluator: asks an LLM to score the trajectory against
+ * the natural-language assertions in `expected.criteria`.
+ *
+ * Returns `'N/A'` when `expected.criteria` is absent or empty so datasets
+ * that rely only on structural evaluators don't incur extra LLM calls.
+ *
+ * Usage:
+ *   import { createCriteriaEvaluator } from "./criteria.js";
+ *   import { createDefaultLlmProvider } from "../llm/index.js";
+ *
+ *   runDataset(dataset, {
+ *     criteria: createCriteriaEvaluator(createDefaultLlmProvider()),
+ *   });
+ *
+ * The factory pattern is necessary because the `Evaluator` type is a plain
+ * function — the LLM provider is closed over rather than passed as an arg.
+ */
+export function createCriteriaEvaluator(llm: LlmProvider): Evaluator {
+  return async (
+    trajectory: Trajectory,
+    expected: ExpectedBehavior
+  ): Promise<EvaluatorResult> => {
+    if (!expected.criteria || expected.criteria.length === 0) {
+      return { score: "N/A" };
+    }
+
+    const prompt = buildJudgePrompt(trajectory, expected.criteria);
+    const response = await llm.chat([{ role: "user", content: prompt }], []);
+    const text = response.content ?? "";
+
+    return parseJudgeResponse(text);
+  };
+}
+
+/**
+ * Builds the rubric prompt sent to the judge LLM.
+ *
+ * Asks for a JSON object with `score` (0–1) and `reasoning` (string) so
+ * parsing is deterministic. The trajectory is serialised as a compact JSON
+ * array of `{tool, args}` pairs — `result` is omitted to avoid token bloat
+ * from large tool outputs.
+ */
+function buildJudgePrompt(trajectory: Trajectory, criteria: string[]): string {
+  const trajectoryStr = JSON.stringify(
+    trajectory.map(({ tool, args }) => ({ tool, args })),
+    null,
+    2
+  );
+
+  const criteriaList = criteria
+    .map((c, i) => `${i + 1}. ${c}`)
+    .join("\n");
+
+  return `You are an impartial evaluator assessing the quality of an AI assistant's tool-calling behaviour.
+
+## Trajectory (tools the assistant called, in order)
+
+\`\`\`json
+${trajectoryStr}
+\`\`\`
+
+## Evaluation criteria
+
+${criteriaList}
+
+## Task
+
+Score how well the trajectory satisfies ALL of the criteria above on a scale from 0.0 to 1.0:
+- 1.0  All criteria fully satisfied
+- 0.75 Most criteria satisfied with minor gaps
+- 0.5  About half the criteria satisfied
+- 0.25 Most criteria unmet with only minor satisfaction
+- 0.0  No criteria satisfied at all
+
+Respond with a single JSON object — no markdown fences, no extra text:
+{"score": <number 0.0–1.0>, "reasoning": "<concise explanation referencing specific criteria>"}`;
+}
+
+/**
+ * Parses the judge LLM's response into an EvaluatorResult.
+ *
+ * Tries JSON.parse first. Falls back to a regex that extracts a bare number
+ * from the text in case the model wraps the response in prose.
+ */
+function parseJudgeResponse(text: string): EvaluatorResult {
+  const trimmed = text.trim();
+
+  // Primary: extract the first {...} object in the response
+  const jsonMatch = trimmed.match(/\{[\s\S]*\}/);
+  if (jsonMatch) {
+    try {
+      const parsed = JSON.parse(jsonMatch[0]) as unknown;
+      if (
+        typeof parsed === "object" &&
+        parsed !== null &&
+        "score" in parsed &&
+        typeof (parsed as Record<string, unknown>).score === "number"
+      ) {
+        const { score, reasoning } = parsed as {
+          score: number;
+          reasoning?: unknown;
+        };
+        const clampedScore = Math.min(1, Math.max(0, score));
+        return {
+          score: clampedScore,
+          reason:
+            typeof reasoning === "string"
+              ? reasoning
+              : `raw judge response: ${trimmed}`,
+        };
+      }
+    } catch {
+      // fall through to regex fallback
+    }
+  }
+
+  // Fallback: look for a bare decimal / integer in [0, 1]
+  const numMatch = trimmed.match(/\b(1(?:\.0+)?|0(?:\.\d+)?)\b/);
+  if (numMatch) {
+    const score = parseFloat(numMatch[1]);
+    return {
+      score,
+      reason: `score parsed from prose; raw response: ${trimmed.slice(0, 200)}`,
+    };
+  }
+
+  return {
+    score: 0,
+    reason: `judge response could not be parsed; raw response: ${trimmed.slice(0, 200)}`,
+  };
+}

From 60eebb35d26dc3286303457249643769530b6c35 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:50:52 +0200
Subject: [PATCH 13/42] evals: add detection-rule-management dataset (4
 positives + 4 distractors)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Proves the eval harness end-to-end against the existing manage-rules skill.

Positives (drm-pos-01..04): natural-language queries about viewing/finding
detection rules — the LLM should call manage-rules. Evaluated with
skill-activation + tool-selection (≥80% gate).

Distractors (drm-neg-01..04): case creation, alert triage, ES|QL hunting,
host investigation — the LLM should NOT call manage-rules. Evaluated with
negative-activation (100% gate — any false positive is a regression).

Two separate runDataset calls wire the correct evaluators and thresholds
to each example group without mixing evaluator semantics across types.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../detection-rule-management.dataset.ts      | 125 ++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 evals/datasets/detection-rule-management.dataset.ts

diff --git a/evals/datasets/detection-rule-management.dataset.ts b/evals/datasets/detection-rule-management.dataset.ts
new file mode 100644
index 0000000..09c6563
--- /dev/null
+++ b/evals/datasets/detection-rule-management.dataset.ts
@@ -0,0 +1,125 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Dataset, Example } from "../types.js";
+import { runDataset } from "../runner.js";
+import { skillActivation } from "../evaluators/skill-activation.js";
+import { negativeActivation } from "../evaluators/negative-activation.js";
+import { toolSelection } from "../evaluators/tool-selection.js";
+
+/**
+ * The model-facing entry-point tool registered by the
+ * detection-rule-management skill (src/tools/detection-rules.ts).
+ */
+const SKILL_TOOL = "manage-rules";
+
+// ---------------------------------------------------------------------------
+// Positive examples — the LLM should call manage-rules
+// ---------------------------------------------------------------------------
+
+const positiveExamples: Example[] = [
+  {
+    id: "drm-pos-01",
+    input: "Show me my noisy rules — which detection rules are generating the most alerts?",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+  {
+    id: "drm-pos-02",
+    input: "List all my currently enabled detection rules",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+  {
+    id: "drm-pos-03",
+    input: "Find high severity detection rules related to PowerShell execution",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+  {
+    id: "drm-pos-04",
+    input: "What detection rules do I have covering initial access tactics?",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+];
+
+// ---------------------------------------------------------------------------
+// Distractor examples — the LLM should NOT call manage-rules
+// ---------------------------------------------------------------------------
+
+const distractorExamples: Example[] = [
+  {
+    id: "drm-neg-01",
+    input: "Create a new case for a ransomware incident I'm currently investigating",
+    expected: {
+      // skill is set so negativeActivation knows which tool to check for absence
+      skill: SKILL_TOOL,
+    },
+  },
+  {
+    id: "drm-neg-02",
+    input: "Show me all critical alerts that fired in the last hour",
+    expected: {
+      skill: SKILL_TOOL,
+    },
+  },
+  {
+    id: "drm-neg-03",
+    input: "Run an ES|QL query to find failed SSH login attempts on my Linux hosts",
+    expected: {
+      skill: SKILL_TOOL,
+    },
+  },
+  {
+    id: "drm-neg-04",
+    input: "A process on host web-01 just spawned cmd.exe — help me investigate",
+    expected: {
+      skill: SKILL_TOOL,
+    },
+  },
+];
+
+// ---------------------------------------------------------------------------
+// Export the full dataset for reference / cross-dataset tooling
+// ---------------------------------------------------------------------------
+
+export const detectionRuleManagementDataset: Dataset = {
+  name: "detection-rule-management",
+  examples: [...positiveExamples, ...distractorExamples],
+};
+
+// ---------------------------------------------------------------------------
+// Vitest eval suites
+// Each runDataset call registers a describe block gated on RUN_LLM_EVALS.
+// Positives and distractors use different evaluators and passing thresholds.
+// ---------------------------------------------------------------------------
+
+runDataset(
+  { name: "detection-rule-management: positives", examples: positiveExamples },
+  {
+    "skill-activation": skillActivation,
+    "tool-selection": toolSelection,
+  },
+  { passingScore: 0.8 }
+);
+
+runDataset(
+  { name: "detection-rule-management: distractors", examples: distractorExamples },
+  {
+    "negative-activation": negativeActivation,
+  },
+  { passingScore: 1.0 } // 100% — any false positive is a regression
+);

From 726b3bd549438db0788ded80109bf83824d1121a Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:54:39 +0200
Subject: [PATCH 14/42] evals: add detection-rule-management.eval.test.ts;
 split dataset from test orchestration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Separates data from test concerns:
- detection-rule-management.dataset.ts now only exports data (positiveExamples,
  distractorExamples, detectionRuleManagementDataset); no runDataset calls
- detection-rule-management.eval.test.ts is the Vitest entry point that
  imports the sub-arrays and calls runDataset with the correct evaluators

Gate layout (unchanged from before):
  positives   — skill-activation + tool-selection, passingScore: 0.8
  distractors — negative-activation,               passingScore: 1.0

The .eval.test.ts suffix matches the include glob in evals/vitest.config.ts
so `npm run test:evals` picks it up without further config changes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../detection-rule-management.dataset.ts      | 30 +----------
 evals/detection-rule-management.eval.test.ts  | 54 +++++++++++++++++++
 2 files changed, 56 insertions(+), 28 deletions(-)
 create mode 100644 evals/detection-rule-management.eval.test.ts

diff --git a/evals/datasets/detection-rule-management.dataset.ts b/evals/datasets/detection-rule-management.dataset.ts
index 09c6563..a1e2a2c 100644
--- a/evals/datasets/detection-rule-management.dataset.ts
+++ b/evals/datasets/detection-rule-management.dataset.ts
@@ -6,10 +6,6 @@
  */
 
 import type { Dataset, Example } from "../types.js";
-import { runDataset } from "../runner.js";
-import { skillActivation } from "../evaluators/skill-activation.js";
-import { negativeActivation } from "../evaluators/negative-activation.js";
-import { toolSelection } from "../evaluators/tool-selection.js";
 
 /**
  * The model-facing entry-point tool registered by the
@@ -21,7 +17,7 @@ const SKILL_TOOL = "manage-rules";
 // Positive examples — the LLM should call manage-rules
 // ---------------------------------------------------------------------------
 
-const positiveExamples: Example[] = [
+export const positiveExamples: Example[] = [
   {
     id: "drm-pos-01",
     input: "Show me my noisy rules — which detection rules are generating the most alerts?",
@@ -60,7 +56,7 @@ const positiveExamples: Example[] = [
 // Distractor examples — the LLM should NOT call manage-rules
 // ---------------------------------------------------------------------------
 
-const distractorExamples: Example[] = [
+export const distractorExamples: Example[] = [
   {
     id: "drm-neg-01",
     input: "Create a new case for a ransomware incident I'm currently investigating",
@@ -101,25 +97,3 @@ export const detectionRuleManagementDataset: Dataset = {
   examples: [...positiveExamples, ...distractorExamples],
 };
 
-// ---------------------------------------------------------------------------
-// Vitest eval suites
-// Each runDataset call registers a describe block gated on RUN_LLM_EVALS.
-// Positives and distractors use different evaluators and passing thresholds.
-// ---------------------------------------------------------------------------
-
-runDataset(
-  { name: "detection-rule-management: positives", examples: positiveExamples },
-  {
-    "skill-activation": skillActivation,
-    "tool-selection": toolSelection,
-  },
-  { passingScore: 0.8 }
-);
-
-runDataset(
-  { name: "detection-rule-management: distractors", examples: distractorExamples },
-  {
-    "negative-activation": negativeActivation,
-  },
-  { passingScore: 1.0 } // 100% — any false positive is a regression
-);
diff --git a/evals/detection-rule-management.eval.test.ts b/evals/detection-rule-management.eval.test.ts
new file mode 100644
index 0000000..ec9cab3
--- /dev/null
+++ b/evals/detection-rule-management.eval.test.ts
@@ -0,0 +1,54 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * End-to-end eval spec for the detection-rule-management skill.
+ *
+ * Proves the eval harness (runner → runMcpHostLoop → evaluators) works
+ * against a real registered skill using the proof dataset. Run via:
+ *
+ *   npm run test:evals
+ *
+ * This suite is skipped in regular `npm test` because runDataset wraps
+ * everything in `describe.skipIf(!process.env.RUN_LLM_EVALS)`.
+ *
+ * Gate summary:
+ *   positives  — skill-activation + tool-selection ≥ 80%
+ *   distractors — negative-activation = 100% (any false positive is a regression)
+ */
+
+import { runDataset } from "./runner.js";
+import {
+  positiveExamples,
+  distractorExamples,
+} from "./datasets/detection-rule-management.dataset.js";
+import { skillActivation } from "./evaluators/skill-activation.js";
+import { negativeActivation } from "./evaluators/negative-activation.js";
+import { toolSelection } from "./evaluators/tool-selection.js";
+
+runDataset(
+  {
+    name: "detection-rule-management: positives",
+    examples: positiveExamples,
+  },
+  {
+    "skill-activation": skillActivation,
+    "tool-selection": toolSelection,
+  },
+  { passingScore: 0.8 }
+);
+
+runDataset(
+  {
+    name: "detection-rule-management: distractors",
+    examples: distractorExamples,
+  },
+  {
+    "negative-activation": negativeActivation,
+  },
+  { passingScore: 1.0 } // 100% — any false positive is a regression
+);

From 77844857b9b62e452690740a39bc4befd9f4b8b3 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 11:57:55 +0200
Subject: [PATCH 15/42] ci: add evals.yml GitHub Actions workflow

Triggers:
  - workflow_dispatch      manual run from Actions UI
  - schedule (0 2 * * *)  nightly at 02:00 UTC
  - pull_request_target    only when 'evals' label is added; gated by label
                           write permission so only maintainers can trigger

Concurrency group 'evals-<ref>' cancels in-progress runs on new pushes,
preventing redundant jobs from burning LLM quota.

The 'Run evals' step sets RUN_LLM_EVALS=1 and passes four secrets:
  EVAL_ANTHROPIC_API_KEY  Claude Haiku (priority)
  EVAL_OPENAI_API_KEY     GPT-4o-mini fallback
  EVAL_LITELLM_BASE_URL   optional LiteLLM proxy base URL
  EVAL_CLUSTERS_JSON      Elastic cluster credentials for the MCP server

Output is captured with tee so it appears in the job log AND in eval-output.txt.
A separate 'Post eval results' step (if: always()) appends '## Eval results'
plus the full output to $GITHUB_STEP_SUMMARY so the rendered Markdown tables
from the runner appear in the Actions job summary.

For pull_request_target the checkout uses the PR head SHA so evals run against
the proposed changes rather than the base branch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/evals.yml | 87 +++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 .github/workflows/evals.yml

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
new file mode 100644
index 0000000..c4b951a
--- /dev/null
+++ b/.github/workflows/evals.yml
@@ -0,0 +1,87 @@
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License
+# 2.0; you may not use this file except in compliance with the Elastic License
+# 2.0.
+
+name: Evals
+
+on:
+  # Manually trigger a run from the Actions UI (useful for ad-hoc evaluation).
+  workflow_dispatch:
+
+  # Nightly run at 02:00 UTC to catch regressions before the work day starts.
+  schedule:
+    - cron: "0 2 * * *"
+
+  # Run when a PR is labeled with `evals`. Labels require write permission, so
+  # this implicitly limits triggering to maintainers — acceptable because
+  # pull_request_target runs with base-repo secrets.
+  pull_request_target:
+    types: [labeled]
+
+# Cancel any in-progress run for the same ref so a fast push doesn't queue up
+# redundant eval jobs that waste LLM quota.
+concurrency:
+  group: evals-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  evals:
+    name: LLM Eval Suite
+    runs-on: ubuntu-latest
+
+    # For pull_request_target, gate strictly on the evals label so the job
+    # doesn't fire for every other label event.
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'schedule' ||
+      (github.event_name == 'pull_request_target' && github.event.label.name == 'evals')
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # For pull_request_target, check out the PR head so the eval runs
+          # against the proposed changes, not the base branch.
+          ref: >-
+            ${{
+              github.event_name == 'pull_request_target'
+                && github.event.pull_request.head.sha
+                || github.sha
+            }}
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: npm
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Run evals
+        env:
+          RUN_LLM_EVALS: "1"
+          # Set ANTHROPIC_API_KEY to use Claude Haiku (preferred); fall back to
+          # OPENAI_API_KEY for GPT-4o-mini. Set EVAL_LITELLM_BASE_URL to route
+          # through a LiteLLM proxy instead of the direct OpenAI endpoint.
+          ANTHROPIC_API_KEY: ${{ secrets.EVAL_ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.EVAL_OPENAI_API_KEY }}
+          LITELLM_BASE_URL: ${{ secrets.EVAL_LITELLM_BASE_URL }}
+          # JSON array describing the Elastic cluster the MCP server targets.
+          # Shape: [{"name":"primary","elasticsearchUrl":"...","kibanaUrl":"...","elasticsearchApiKey":"..."}]
+          CLUSTERS_JSON: ${{ secrets.EVAL_CLUSTERS_JSON }}
+        run: |
+          set -o pipefail
+          npm run test:evals 2>&1 | tee eval-output.txt
+
+      - name: Post eval results to job summary
+        if: always()
+        run: |
+          if [ -f eval-output.txt ]; then
+            echo "## Eval results" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            cat eval-output.txt >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "## Eval results" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo "_No eval output captured._" >> "$GITHUB_STEP_SUMMARY"
+          fi

From ac864b8ad20b5c958979e0a9506f1097c3857390 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:02:27 +0200
Subject: [PATCH 16/42] =?UTF-8?q?docs:=20add=20evals.md=20=E2=80=94=20harn?=
 =?UTF-8?q?ess=20design,=20dataset=20shape,=20evaluator=20catalog,=20CI=20?=
 =?UTF-8?q?gating?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Covers:
- Architecture diagram showing runner → runMcpHostLoop → evaluators pipeline
- Key design choices table (in-process transport, skip-if guard, N/A semantics)
- Dataset shape reference with all three optional expected fields documented
- Positive vs distractor example pattern with runDataset code snippets
- Evaluator catalog: type, score range, N/A condition, and recommended gate for
  all five evaluators (skill-activation, negative-activation, tool-selection,
  trajectory, criteria)
- Step-by-step how-to-add-dataset guide with copy-paste templates
- CI gating: workflow triggers, required secrets table, passing threshold table

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/evals.md | 260 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 docs/evals.md

diff --git a/docs/evals.md b/docs/evals.md
new file mode 100644
index 0000000..486af33
--- /dev/null
+++ b/docs/evals.md
@@ -0,0 +1,260 @@
+# Eval Harness
+
+LLM-powered evals for the Elastic Security MCP app's skill layer. The harness
+tests whether the LLM host activates the right skill, calls the right tools in
+the right order, and does not fire on unrelated queries.
+
+Regular `npm test` never touches this harness — it only runs when
+`RUN_LLM_EVALS=1` is set, so CI stays fast and free of LLM costs.
+
+---
+
+## Architecture
+
+```
+Dataset (examples)
+   │
+   ▼
+runner.ts ─ describe.skipIf(!RUN_LLM_EVALS)(dataset.name, () => {
+   │            for each example:
+   │               trajectory = await runMcpHostLoop(input)
+   │               scores     = await evaluators[*](trajectory, expected)
+   │               assert score >= passingScore
+   │            afterAll: print Markdown table to stdout
+   │         })
+   │
+   ├── runMcpHostLoop(input)
+   │      InMemoryTransport ─ Client ─ McpServer
+   │      LLM provider (Anthropic / OpenAI / LiteLLM)
+   │      loop ≤ MAX_TURNS=8: LLM → tool calls → results → repeat
+   │      returns Trajectory (ordered ToolCall[])
+   │
+   └── Evaluators
+          skill-activation    binary: was skill tool called?
+          negative-activation binary: was skill tool correctly absent?
+          tool-selection      F1 precision/recall against expected.tools
+          trajectory          LCS similarity of actual vs expected sequence
+          criteria            LLM-as-judge against natural-language assertions
+```
+
+### Key design choices
+
+| Decision | Rationale |
+|---|---|
+| In-process via `InMemoryTransport` | No network, no server process — evals run anywhere |
+| `describe.skipIf(!RUN_LLM_EVALS)` | Zero LLM cost in regular `npm test` |
+| `Evaluator` is a plain function | Easy to compose; factory pattern for stateful evaluators (criteria) |
+| `'N/A'` return instead of 0 | Datasets omit irrelevant evaluator dimensions without masking real regressions |
+| LCS for trajectory | Order matters; set-based coverage is tool-selection's job |
+
+---
+
+## Dataset shape
+
+A dataset is a `Dataset` object exported from a `*.dataset.ts` file:
+
+```typescript
+import type { Dataset } from "../types.js";
+
+export const myDataset: Dataset = {
+  name: "my-skill",
+  examples: [
+    {
+      id: "ms-pos-01",                    // stable, unique — appears in CI summaries
+      input: "user message to the LLM",   // the query sent to runMcpHostLoop
+      expected: {
+        skill: "entry-point-tool-name",   // tool the skill SKILL.md instructs the LLM to call
+        tools: ["entry-point-tool-name"], // ordered list for trajectory/tool-selection
+        criteria: [                       // natural-language assertions for LLM-as-judge
+          "The model called the correct entry-point tool",
+        ],
+      },
+    },
+  ],
+};
+```
+
+All three `expected` fields are **optional**:
+
+| Field | Evaluators that use it | Omit when… |
+|---|---|---|
+| `skill` | `skill-activation`, `negative-activation` | Dataset doesn't test skill routing |
+| `tools` | `tool-selection`, `trajectory` | No ordered tool expectation |
+| `criteria` | `criteria` | No LLM-as-judge needed (saves cost) |
+
+Omitting a field causes the evaluator to return `'N/A'` for that example rather than a false 0.
+
+### Positive vs distractor examples
+
+A **positive** example is a query that *should* activate the skill.  
+A **distractor** example is an unrelated query that *should not*.
+
+Use separate `runDataset` calls with different evaluators for each group:
+
+```typescript
+// Positive: skill should fire
+runDataset(
+  { name: "my-skill: positives", examples: positiveExamples },
+  { "skill-activation": skillActivation, "tool-selection": toolSelection },
+  { passingScore: 0.8 }
+);
+
+// Distractor: skill must NOT fire (gate is 100%)
+runDataset(
+  { name: "my-skill: distractors", examples: distractorExamples },
+  { "negative-activation": negativeActivation },
+  { passingScore: 1.0 }
+);
+```
+
+---
+
+## Evaluator catalog
+
+### `skill-activation`
+
+**Type**: binary · **Score**: `1` if `expected.skill` found in trajectory, `0` otherwise  
+**Returns `'N/A'`**: when `expected.skill` is absent  
+**Gate**: ≥ 0.8 on positive examples (use `passingScore: 0.8`)
+
+Tests whether the LLM called the skill's model-facing entry-point tool at
+least once.
+
+### `negative-activation`
+
+**Type**: binary · **Score**: `1` if `expected.skill` is *absent* from trajectory, `0` if present  
+**Returns `'N/A'`**: when `expected.skill` is absent  
+**Gate**: 1.0 on distractor examples (use `passingScore: 1.0`)
+
+Tests that the skill does not over-trigger on unrelated queries. Any false
+positive here means the skill's SKILL.md is too broad.
+
+### `tool-selection`
+
+**Type**: F1 · **Score**: harmonic mean of precision and recall against `expected.tools` (set-based)  
+**Returns `'N/A'`**: when `expected.tools` is absent  
+**Gate**: ≥ 0.8 on positive examples
+
+Tests *which* tools were called, ignoring order. Missed tools lower recall;
+spurious tools lower precision. Failure reason includes `missed: [...]` and
+`extra: [...]`.
+
+### `trajectory`
+
+**Type**: LCS similarity · **Score**: `lcs(actual, expected) / max(|actual|, |expected|)`  
+**Returns `'N/A'`**: when `expected.tools` is absent  
+**Gate**: ≥ 0.7 on positive examples (sequence matching is looser than set matching)
+
+Tests *order*. Dividing by `max` penalises both missing and extra steps.
+Use alongside `tool-selection` for full coverage.
+
+### `criteria`
+
+**Type**: LLM-as-judge · **Score**: `0.0–1.0` parsed from a rubric prompt response  
+**Returns `'N/A'`**: when `expected.criteria` is absent  
+**Gate**: ≥ 0.7
+
+Calls the judge LLM with the trajectory `{tool, args}` pairs and the
+criteria list. Asks for `{"score": <0–1>, "reasoning": "..."}`. Falls back
+to regex number extraction if JSON parse fails. Use for semantic assertions
+that structural evaluators can't express.
+
+**Cost**: one extra LLM call per example. Omit `expected.criteria` to skip.
+
+---
+
+## How to add a dataset
+
+1. **Create the data file** `evals/datasets/<skill-name>.dataset.ts`:
+
+   ```typescript
+   import type { Dataset, Example } from "../types.js";
+
+   const SKILL_TOOL = "my-tool"; // the model-facing entry-point tool
+
+   export const positiveExamples: Example[] = [
+     { id: "ms-pos-01", input: "...", expected: { skill: SKILL_TOOL, tools: [SKILL_TOOL] } },
+     // add ≥ 4 examples
+   ];
+
+   export const distractorExamples: Example[] = [
+     { id: "ms-neg-01", input: "...", expected: { skill: SKILL_TOOL } },
+     // add ≥ 4 examples
+   ];
+
+   export const myDataset: Dataset = {
+     name: "<skill-name>",
+     examples: [...positiveExamples, ...distractorExamples],
+   };
+   ```
+
+2. **Create the eval spec** `evals/<skill-name>.eval.test.ts`:
+
+   ```typescript
+   import { runDataset } from "./runner.js";
+   import { positiveExamples, distractorExamples } from "./datasets/<skill-name>.dataset.js";
+   import { skillActivation } from "./evaluators/skill-activation.js";
+   import { negativeActivation } from "./evaluators/negative-activation.js";
+   import { toolSelection } from "./evaluators/tool-selection.js";
+
+   runDataset(
+     { name: "<skill-name>: positives", examples: positiveExamples },
+     { "skill-activation": skillActivation, "tool-selection": toolSelection },
+     { passingScore: 0.8 }
+   );
+
+   runDataset(
+     { name: "<skill-name>: distractors", examples: distractorExamples },
+     { "negative-activation": negativeActivation },
+     { passingScore: 1.0 }
+   );
+   ```
+
+3. **Run locally**:
+
+   ```bash
+   # Anthropic (preferred)
+   ANTHROPIC_API_KEY=sk-ant-... CLUSTERS_JSON='[{...}]' npm run test:evals
+
+   # OpenAI / LiteLLM proxy
+   OPENAI_API_KEY=sk-... LITELLM_BASE_URL=https://... CLUSTERS_JSON='[{...}]' npm run test:evals
+   ```
+
+4. **Trigger in CI**: open a PR and add the `evals` label (requires write access).
+
+---
+
+## CI gating
+
+### Workflow: `.github/workflows/evals.yml`
+
+| Trigger | When |
+|---|---|
+| `workflow_dispatch` | Manual run from Actions UI |
+| `schedule` | Nightly at 02:00 UTC |
+| `pull_request_target` | When `evals` label is added to a PR |
+
+The concurrency group `evals-<ref>` cancels superseded runs to avoid wasting
+LLM quota on stale pushes.
+
+### Required secrets
+
+| Secret | Purpose |
+|---|---|
+| `EVAL_ANTHROPIC_API_KEY` | Anthropic API key (priority provider) |
+| `EVAL_OPENAI_API_KEY` | OpenAI / LiteLLM API key (fallback) |
+| `EVAL_LITELLM_BASE_URL` | Optional LiteLLM proxy base URL |
+| `EVAL_CLUSTERS_JSON` | Elastic cluster credentials for the MCP server |
+
+### Passing thresholds (recommended defaults)
+
+| Evaluator | Positives | Distractors |
+|---|---|---|
+| `skill-activation` | ≥ 0.8 | — |
+| `negative-activation` | — | = 1.0 |
+| `tool-selection` | ≥ 0.8 | — |
+| `trajectory` | ≥ 0.7 | — |
+| `criteria` | ≥ 0.7 | — |
+
+Results are posted as a Markdown table to the GitHub Actions job summary
+(`$GITHUB_STEP_SUMMARY`) after every run.

From e9a23fa184702423613da968d674ad707fed9459 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:08:54 +0200
Subject: [PATCH 17/42] feat: add MigrationsService wrapping 14
 /internal/siem_migrations/* Kibana routes

Service injects KibanaClient directly (no separate *Client indirection since
these are internal-only Kibana routes with no public API equivalent). The
KibanaClient already supplies x-elastic-internal-origin: Kibana; each method
adds elastic-api-version: 2023-10-31 via MIGRATION_HEADERS per-request.

14 methods, one per route:
  createMigration   POST   /internal/siem_migrations/rules
  listMigrations    GET    /internal/siem_migrations/rules
  getMigration      GET    /internal/siem_migrations/rules/:id
  deleteMigration   DELETE /internal/siem_migrations/rules/:id
  uploadRules       POST   /internal/siem_migrations/rules/:id/rules
  getTranslatedRules GET   /internal/siem_migrations/rules/:id/rules
  getTranslatedRule  GET   /internal/siem_migrations/rules/:id/rules/:ruleId
  updateTranslatedRule PUT /internal/siem_migrations/rules/:id/rules/:ruleId
  startTranslation  POST   /internal/siem_migrations/rules/:id/start
  stopTranslation   POST   /internal/siem_migrations/rules/:id/stop
  getResources      GET    /internal/siem_migrations/resources/:id
  upsertResources   POST   /internal/siem_migrations/resources/:id
  installRules      POST   /internal/siem_migrations/rules/:id/install
  getStats          GET    /internal/siem_migrations/rules/:id/stats

MigrationApiError wraps every non-2xx response with typed status (extracted
from the Kibana client's "Kibana [cluster] STATUS: body" error format) and the
request path so callers can surface actionable error messages.

Domain types: SiemMigration, TranslatedRule, MigrationResource, MigrationStats
and associated option/result interfaces, all barrel-exported from service/index.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/elastic/service/index.ts             |  15 +
 src/elastic/service/migrationsService.ts | 361 +++++++++++++++++++++++
 2 files changed, 376 insertions(+)
 create mode 100644 src/elastic/service/migrationsService.ts

diff --git a/src/elastic/service/index.ts b/src/elastic/service/index.ts
index 38671ee..3c6e574 100644
--- a/src/elastic/service/index.ts
+++ b/src/elastic/service/index.ts
@@ -19,3 +19,18 @@ export type {
   ScenarioRuleDef,
 } from "./sampleDataService.js";
 export { SampleDataService, SCENARIO_NAMES, SCENARIO_RULES } from "./sampleDataService.js";
+export type {
+  SiemMigration,
+  TranslatedRule,
+  MigrationResource,
+  MigrationStats,
+  ListTranslatedRulesOptions,
+  ListTranslatedRulesResult,
+  InstallRulesOptions,
+  InstallRulesResult,
+} from "./migrationsService.js";
+export {
+  MigrationApiError,
+  MigrationsService,
+  SIEM_MIGRATIONS_API_BASE,
+} from "./migrationsService.js";
diff --git a/src/elastic/service/migrationsService.ts b/src/elastic/service/migrationsService.ts
new file mode 100644
index 0000000..ffd0dd4
--- /dev/null
+++ b/src/elastic/service/migrationsService.ts
@@ -0,0 +1,361 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { KibanaClient } from "../kibana-client/index.js";
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+export const SIEM_MIGRATIONS_API_BASE = "/internal/siem_migrations";
+
+/**
+ * Per-request headers required by the Kibana internal SIEM migrations API.
+ * `x-elastic-internal-origin: Kibana` is pre-baked into `KibanaClient`;
+ * only the versioning header needs to be added on each call.
+ */
+const MIGRATION_HEADERS = {
+  "elastic-api-version": "2023-10-31",
+} as const;
+
+// ---------------------------------------------------------------------------
+// Domain types
+// ---------------------------------------------------------------------------
+
+export interface SiemMigration {
+  id: string;
+  name: string;
+  /** Lifecycle status of the migration. */
+  status: "ready" | "running" | "finished" | "error";
+  created_at: string;
+  last_updated_at: string;
+  rules: {
+    total: number;
+    pending: number;
+    processing: number;
+    completed: number;
+    failed: number;
+    installable: number;
+    installed: number;
+    partially_translated: number;
+    untranslatable: number;
+  };
+}
+
+export interface TranslatedRule {
+  id: string;
+  migration_id: string;
+  status: "pending" | "processing" | "completed" | "failed";
+  translation_result?: "full" | "partial" | "untranslatable";
+  elastic_rule?: Record<string, unknown>;
+  original_rule: Record<string, unknown>;
+  comments?: string[];
+}
+
+export interface MigrationResource {
+  type: "macro" | "lookup";
+  name: string;
+  content: string;
+}
+
+export interface MigrationStats {
+  id: string;
+  status: SiemMigration["status"];
+  rules: SiemMigration["rules"];
+}
+
+export interface ListTranslatedRulesOptions {
+  readonly page?: number;
+  readonly perPage?: number;
+  readonly filter?: string;
+}
+
+export interface ListTranslatedRulesResult {
+  data: TranslatedRule[];
+  total: number;
+}
+
+export interface InstallRulesOptions {
+  /** Specific rule IDs to install; omit to install all installable rules. */
+  ids?: string[];
+}
+
+export interface InstallRulesResult {
+  installed: number;
+  failed: number;
+}
+
+// ---------------------------------------------------------------------------
+// Typed error
+// ---------------------------------------------------------------------------
+
+/**
+ * Thrown by every {@link MigrationsService} method on a non-2xx response.
+ *
+ * The Kibana client's response interceptor formats AxiosErrors as
+ * `"Kibana [<cluster>] <status>: <body>"` before they reach here, so
+ * `status` is extracted from that message when available.
+ */
+export class MigrationApiError extends Error {
+  readonly status: number;
+  readonly path: string;
+
+  constructor(path: string, cause: unknown) {
+    const causeMsg = cause instanceof Error ? cause.message : String(cause);
+    // Match the Kibana client error format: "Kibana [name] STATUS: detail"
+    const statusMatch = causeMsg.match(/\b([1-5]\d{2})\b/);
+    const status = statusMatch ? parseInt(statusMatch[1], 10) : 0;
+
+    super(`SIEM Migrations API error on ${path}: ${causeMsg}`);
+    this.name = "MigrationApiError";
+    this.status = status;
+    this.path = path;
+    if (cause instanceof Error) {
+      this.cause = cause;
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Service
+// ---------------------------------------------------------------------------
+
+interface MigrationsServiceOptions {
+  readonly kibanaClient: KibanaClient;
+}
+
+/**
+ * Thin wrapper over the 14 `/internal/siem_migrations/*` Kibana routes.
+ *
+ * Every method adds `elastic-api-version: 2023-10-31`; the underlying
+ * {@link KibanaClient} supplies `x-elastic-internal-origin: Kibana` and
+ * authentication on every request. Non-2xx responses are re-thrown as
+ * {@link MigrationApiError}.
+ */
+export class MigrationsService {
+  private readonly client: KibanaClient;
+
+  constructor(options: MigrationsServiceOptions) {
+    this.client = options.kibanaClient;
+  }
+
+  // ── Migration lifecycle ──────────────────────────────────────────────────
+
+  /** POST /internal/siem_migrations/rules */
+  async createMigration(name: string): Promise<{ migration_id: string }> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules`;
+    try {
+      const { data } = await this.client.post<{ migration_id: string }>(
+        path,
+        { name },
+        { headers: MIGRATION_HEADERS }
+      );
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** GET /internal/siem_migrations/rules */
+  async listMigrations(): Promise<SiemMigration[]> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules`;
+    try {
+      const { data } = await this.client.get<SiemMigration[]>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** GET /internal/siem_migrations/rules/:migrationId */
+  async getMigration(migrationId: string): Promise<SiemMigration> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}`;
+    try {
+      const { data } = await this.client.get<SiemMigration>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** DELETE /internal/siem_migrations/rules/:migrationId */
+  async deleteMigration(migrationId: string): Promise<void> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}`;
+    try {
+      await this.client.delete(path, { headers: MIGRATION_HEADERS });
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Splunk rule upload ───────────────────────────────────────────────────
+
+  /** POST /internal/siem_migrations/rules/:migrationId/rules */
+  async uploadRules(
+    migrationId: string,
+    rules: Record<string, unknown>[]
+  ): Promise<{ total: number }> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules`;
+    try {
+      const { data } = await this.client.post<{ total: number }>(
+        path,
+        rules,
+        { headers: MIGRATION_HEADERS }
+      );
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Translated rules ─────────────────────────────────────────────────────
+
+  /** GET /internal/siem_migrations/rules/:migrationId/rules */
+  async getTranslatedRules(
+    migrationId: string,
+    options: ListTranslatedRulesOptions = {}
+  ): Promise<ListTranslatedRulesResult> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules`;
+    const params: Record<string, string> = {
+      page: String(options.page ?? 1),
+      per_page: String(options.perPage ?? 20),
+    };
+    if (options.filter) params.filter = options.filter;
+
+    try {
+      const { data } = await this.client.get<ListTranslatedRulesResult>(path, {
+        params,
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** GET /internal/siem_migrations/rules/:migrationId/rules/:ruleId */
+  async getTranslatedRule(
+    migrationId: string,
+    ruleId: string
+  ): Promise<TranslatedRule> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules/${ruleId}`;
+    try {
+      const { data } = await this.client.get<TranslatedRule>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** PUT /internal/siem_migrations/rules/:migrationId/rules/:ruleId */
+  async updateTranslatedRule(
+    migrationId: string,
+    ruleId: string,
+    updates: Partial<Pick<TranslatedRule, "elastic_rule" | "translation_result" | "comments">>
+  ): Promise<TranslatedRule> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/rules/${ruleId}`;
+    try {
+      const { data } = await this.client.put<TranslatedRule>(path, updates, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Translation control ──────────────────────────────────────────────────
+
+  /** POST /internal/siem_migrations/rules/:migrationId/start */
+  async startTranslation(migrationId: string): Promise<void> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/start`;
+    try {
+      await this.client.post(path, {}, { headers: MIGRATION_HEADERS });
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** POST /internal/siem_migrations/rules/:migrationId/stop */
+  async stopTranslation(migrationId: string): Promise<void> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/stop`;
+    try {
+      await this.client.post(path, {}, { headers: MIGRATION_HEADERS });
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Resources ────────────────────────────────────────────────────────────
+
+  /** GET /internal/siem_migrations/resources/:migrationId */
+  async getResources(migrationId: string): Promise<MigrationResource[]> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/resources/${migrationId}`;
+    try {
+      const { data } = await this.client.get<MigrationResource[]>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  /** POST /internal/siem_migrations/resources/:migrationId */
+  async upsertResources(
+    migrationId: string,
+    resources: MigrationResource[]
+  ): Promise<void> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/resources/${migrationId}`;
+    try {
+      await this.client.post(path, resources, { headers: MIGRATION_HEADERS });
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Installation ─────────────────────────────────────────────────────────
+
+  /** POST /internal/siem_migrations/rules/:migrationId/install */
+  async installRules(
+    migrationId: string,
+    options: InstallRulesOptions = {}
+  ): Promise<InstallRulesResult> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/install`;
+    try {
+      const { data } = await this.client.post<InstallRulesResult>(
+        path,
+        options.ids ? { ids: options.ids } : {},
+        { headers: MIGRATION_HEADERS }
+      );
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+
+  // ── Stats ────────────────────────────────────────────────────────────────
+
+  /** GET /internal/siem_migrations/rules/:migrationId/stats */
+  async getStats(migrationId: string): Promise<MigrationStats> {
+    const path = `${SIEM_MIGRATIONS_API_BASE}/rules/${migrationId}/stats`;
+    try {
+      const { data } = await this.client.get<MigrationStats>(path, {
+        headers: MIGRATION_HEADERS,
+      });
+      return data;
+    } catch (err) {
+      throw new MigrationApiError(path, err);
+    }
+  }
+}

From 2a4ec7d1ab9eacb5c7ad90b8664cd29295c6b372 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:13:58 +0200
Subject: [PATCH 18/42] test: add MigrationsService tests covering all 14 route
 methods and error handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

19 tests across 14 describe blocks — one per route method plus three
error-handling tests:

  Migration lifecycle:   createMigration, listMigrations, getMigration, deleteMigration
  Rule upload:           uploadRules
  Translated rules:      getTranslatedRules (default+custom pagination), getTranslatedRule, updateTranslatedRule
  Translation control:   startTranslation, stopTranslation
  Resources:             getResources, upsertResources
  Installation:          installRules (no-ids + with-ids)
  Stats:                 getStats
  MigrationApiError:     status parsed from Kibana error format; status=0 fallback;
                         all mutating methods surface MigrationApiError

Also adds `put: vi.fn()` to MockHttpClient / makeMock in mockHttpClient.ts
so MigrationsService.updateTranslatedRule can be exercised.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/elastic/service/migrationsService.test.ts | 329 ++++++++++++++++++
 src/test/helpers/mockHttpClient.ts            |   2 +
 2 files changed, 331 insertions(+)
 create mode 100644 src/elastic/service/migrationsService.test.ts

diff --git a/src/elastic/service/migrationsService.test.ts b/src/elastic/service/migrationsService.test.ts
new file mode 100644
index 0000000..0c184e7
--- /dev/null
+++ b/src/elastic/service/migrationsService.test.ts
@@ -0,0 +1,329 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { describe, it, expect, beforeEach } from "vitest";
+import {
+  MigrationsService,
+  MigrationApiError,
+  SIEM_MIGRATIONS_API_BASE,
+} from "./migrationsService.js";
+import type { KibanaClient } from "../kibana-client/index.js";
+import {
+  createMockKibanaClient,
+  dataEnvelope,
+  type MockHttpClient,
+} from "../../test/helpers/mockHttpClient.js";
+import type { SiemMigration, TranslatedRule, MigrationResource } from "./migrationsService.js";
+
+const BASE = SIEM_MIGRATIONS_API_BASE;
+const HEADERS = { headers: { "elastic-api-version": "2023-10-31" } };
+
+const MIGRATION_ID = "migration-1";
+const RULE_ID = "rule-1";
+
+const fakeMigration: SiemMigration = {
+  id: MIGRATION_ID,
+  name: "test-migration",
+  status: "ready",
+  created_at: "2026-01-01T00:00:00Z",
+  last_updated_at: "2026-01-01T00:00:00Z",
+  rules: {
+    total: 0, pending: 0, processing: 0, completed: 0, failed: 0,
+    installable: 0, installed: 0, partially_translated: 0, untranslatable: 0,
+  },
+};
+
+const fakeRule: TranslatedRule = {
+  id: RULE_ID,
+  migration_id: MIGRATION_ID,
+  status: "completed",
+  translation_result: "full",
+  original_rule: { name: "splunk-rule" },
+};
+
+const fakeResource: MigrationResource = {
+  type: "macro",
+  name: "my_macro",
+  content: "| where true",
+};
+
+describe("MigrationsService", () => {
+  let kibanaClient: KibanaClient & MockHttpClient;
+  let service: MigrationsService;
+
+  beforeEach(() => {
+    kibanaClient = createMockKibanaClient();
+    service = new MigrationsService({ kibanaClient });
+  });
+
+  // ── Migration lifecycle ────────────────────────────────────────────────────
+
+  describe("createMigration", () => {
+    it("POSTs to /rules with the migration name and returns migration_id", async () => {
+      kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ migration_id: MIGRATION_ID }));
+
+      const result = await service.createMigration("My Migration");
+
+      expect(kibanaClient.post).toHaveBeenCalledWith(
+        `${BASE}/rules`,
+        { name: "My Migration" },
+        HEADERS
+      );
+      expect(result).toEqual({ migration_id: MIGRATION_ID });
+    });
+  });
+
+  describe("listMigrations", () => {
+    it("GETs /rules and returns the array", async () => {
+      kibanaClient.get.mockResolvedValueOnce(dataEnvelope([fakeMigration]));
+
+      const result = await service.listMigrations();
+
+      expect(kibanaClient.get).toHaveBeenCalledWith(`${BASE}/rules`, HEADERS);
+      expect(result).toEqual([fakeMigration]);
+    });
+  });
+
+  describe("getMigration", () => {
+    it("GETs /rules/:migrationId and returns the migration", async () => {
+      kibanaClient.get.mockResolvedValueOnce(dataEnvelope(fakeMigration));
+
+      const result = await service.getMigration(MIGRATION_ID);
+
+      expect(kibanaClient.get).toHaveBeenCalledWith(
+        `${BASE}/rules/${MIGRATION_ID}`,
+        HEADERS
+      );
+      expect(result).toEqual(fakeMigration);
+    });
+  });
+
+  describe("deleteMigration", () => {
+    it("DELETEs /rules/:migrationId", async () => {
+      await service.deleteMigration(MIGRATION_ID);
+
+      expect(kibanaClient.delete).toHaveBeenCalledWith(
+        `${BASE}/rules/${MIGRATION_ID}`,
+        HEADERS
+      );
+    });
+  });
+
+  // ── Rule upload ────────────────────────────────────────────────────────────
+
+  describe("uploadRules", () => {
+    it("POSTs rules array to /rules/:migrationId/rules and returns totals", async () => {
+      kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ total: 5 }));
+      const splunkRules = [{ search: "index=main" }, { search: "index=security" }];
+
+      const result = await service.uploadRules(MIGRATION_ID, splunkRules);
+
+      expect(kibanaClient.post).toHaveBeenCalledWith(
+        `${BASE}/rules/${MIGRATION_ID}/rules`,
+        splunkRules,
+        HEADERS
+      );
+      expect(result).toEqual({ total: 5 });
+    });
+  });
+
+  // ── Translated rules ───────────────────────────────────────────────────────
+
+  describe("getTranslatedRules", () => {
+    it("GETs /rules/:migrationId/rules with default pagination", async () => {
+      kibanaClient.get.mockResolvedValueOnce(dataEnvelope({ data: [fakeRule], total: 1 }));
+
+      const result = await service.getTranslatedRules(MIGRATION_ID);
+
+      const [path, config] = kibanaClient.get.mock.calls[0] as [string, Record<string, unknown>];
+      expect(path).toBe(`${BASE}/rules/${MIGRATION_ID}/rules`);
+      expect(config.params).toMatchObject({ page: "1", per_page: "20" });
+      expect(result).toEqual({ data: [fakeRule], total: 1 });
+    });
+
+    it("forwards custom page, perPage and filter params", async () => {
+      kibanaClient.get.mockResolvedValueOnce(dataEnvelope({ data: [], total: 0 }));
+
+      await service.getTranslatedRules(MIGRATION_ID, { page: 2, perPage: 50, filter: "status:completed" });
+
+      const [, config] = kibanaClient.get.mock.calls[0] as [string, Record<string, unknown>];
+      expect(config.params).toEqual({ page: "2", per_page: "50", filter: "status:completed" });
+    });
+  });
+
+  describe("getTranslatedRule", () => {
+    it("GETs /rules/:migrationId/rules/:ruleId", async () => {
+      kibanaClient.get.mockResolvedValueOnce(dataEnvelope(fakeRule));
+
+      const result = await service.getTranslatedRule(MIGRATION_ID, RULE_ID);
+
+      expect(kibanaClient.get).toHaveBeenCalledWith(
+        `${BASE}/rules/${MIGRATION_ID}/rules/${RULE_ID}`,
+        HEADERS
+      );
+      expect(result).toEqual(fakeRule);
+    });
+  });
+
+  describe("updateTranslatedRule", () => {
+    it("PUTs updates to /rules/:migrationId/rules/:ruleId and returns the updated rule", async () => {
+      const updated = { ...fakeRule, translation_result: "partial" as const };
+      kibanaClient.put.mockResolvedValueOnce(dataEnvelope(updated));
+
+      const result = await service.updateTranslatedRule(MIGRATION_ID, RULE_ID, {
+        translation_result: "partial",
+      });
+
+      expect(kibanaClient.put).toHaveBeenCalledWith(
+        `${BASE}/rules/${MIGRATION_ID}/rules/${RULE_ID}`,
+        { translation_result: "partial" },
+        HEADERS
+      );
+      expect(result).toEqual(updated);
+    });
+  });
+
+  // ── Translation control ────────────────────────────────────────────────────
+
+  describe("startTranslation", () => {
+    it("POSTs to /rules/:migrationId/start", async () => {
+      await service.startTranslation(MIGRATION_ID);
+
+      expect(kibanaClient.post).toHaveBeenCalledWith(
+        `${BASE}/rules/${MIGRATION_ID}/start`,
+        {},
+        HEADERS
+      );
+    });
+  });
+
+  describe("stopTranslation", () => {
+    it("POSTs to /rules/:migrationId/stop", async () => {
+      await service.stopTranslation(MIGRATION_ID);
+
+      expect(kibanaClient.post).toHaveBeenCalledWith(
+        `${BASE}/rules/${MIGRATION_ID}/stop`,
+        {},
+        HEADERS
+      );
+    });
+  });
+
+  // ── Resources ──────────────────────────────────────────────────────────────
+
+  describe("getResources", () => {
+    it("GETs /resources/:migrationId and returns the array", async () => {
+      kibanaClient.get.mockResolvedValueOnce(dataEnvelope([fakeResource]));
+
+      const result = await service.getResources(MIGRATION_ID);
+
+      expect(kibanaClient.get).toHaveBeenCalledWith(
+        `${BASE}/resources/${MIGRATION_ID}`,
+        HEADERS
+      );
+      expect(result).toEqual([fakeResource]);
+    });
+  });
+
+  describe("upsertResources", () => {
+    it("POSTs resources array to /resources/:migrationId", async () => {
+      await service.upsertResources(MIGRATION_ID, [fakeResource]);
+
+      expect(kibanaClient.post).toHaveBeenCalledWith(
+        `${BASE}/resources/${MIGRATION_ID}`,
+        [fakeResource],
+        HEADERS
+      );
+    });
+  });
+
+  // ── Installation ───────────────────────────────────────────────────────────
+
+  describe("installRules", () => {
+    it("POSTs empty body to /rules/:migrationId/install when no ids given", async () => {
+      kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ installed: 3, failed: 0 }));
+
+      const result = await service.installRules(MIGRATION_ID);
+
+      expect(kibanaClient.post).toHaveBeenCalledWith(
+        `${BASE}/rules/${MIGRATION_ID}/install`,
+        {},
+        HEADERS
+      );
+      expect(result).toEqual({ installed: 3, failed: 0 });
+    });
+
+    it("includes ids in the body when provided", async () => {
+      kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ installed: 1, failed: 0 }));
+
+      await service.installRules(MIGRATION_ID, { ids: ["r1", "r2"] });
+
+      const [, body] = kibanaClient.post.mock.calls[0] as [string, Record<string, unknown>];
+      expect(body).toEqual({ ids: ["r1", "r2"] });
+    });
+  });
+
+  // ── Stats ──────────────────────────────────────────────────────────────────
+
+  describe("getStats", () => {
+    it("GETs /rules/:migrationId/stats and returns the stats", async () => {
+      const stats = { id: MIGRATION_ID, status: "ready" as const, rules: fakeMigration.rules };
+      kibanaClient.get.mockResolvedValueOnce(dataEnvelope(stats));
+
+      const result = await service.getStats(MIGRATION_ID);
+
+      expect(kibanaClient.get).toHaveBeenCalledWith(
+        `${BASE}/rules/${MIGRATION_ID}/stats`,
+        HEADERS
+      );
+      expect(result).toEqual(stats);
+    });
+  });
+
+  // ── MigrationApiError ──────────────────────────────────────────────────────
+
+  describe("MigrationApiError", () => {
+    it("wraps non-2xx with status parsed from Kibana error format", async () => {
+      const path = `${BASE}/rules/${MIGRATION_ID}`;
+      kibanaClient.get.mockRejectedValue(
+        new Error("Kibana [test-cluster] 404: migration not found")
+      );
+
+      await expect(service.getMigration(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+      await expect(service.getMigration(MIGRATION_ID)).rejects.toMatchObject({
+        status: 404,
+        path,
+        message: expect.stringContaining(path) as string,
+      });
+    });
+
+    it("sets status 0 when error message has no HTTP status code", async () => {
+      kibanaClient.get.mockRejectedValueOnce(new Error("network timeout"));
+
+      const err = await service.getMigration(MIGRATION_ID).catch((e) => e as MigrationApiError);
+      expect(err).toBeInstanceOf(MigrationApiError);
+      expect(err.status).toBe(0);
+    });
+
+    it("surfaces a MigrationApiError from every mutating method", async () => {
+      const netErr = new Error("Kibana [test-cluster] 503: service unavailable");
+
+      kibanaClient.post.mockRejectedValue(netErr);
+      kibanaClient.put.mockRejectedValue(netErr);
+      kibanaClient.delete.mockRejectedValue(netErr);
+
+      await expect(service.createMigration("x")).rejects.toBeInstanceOf(MigrationApiError);
+      await expect(service.uploadRules(MIGRATION_ID, [])).rejects.toBeInstanceOf(MigrationApiError);
+      await expect(service.startTranslation(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+      await expect(service.stopTranslation(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+      await expect(service.upsertResources(MIGRATION_ID, [])).rejects.toBeInstanceOf(MigrationApiError);
+      await expect(service.installRules(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+      await expect(service.updateTranslatedRule(MIGRATION_ID, RULE_ID, {})).rejects.toBeInstanceOf(MigrationApiError);
+      await expect(service.deleteMigration(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError);
+    });
+  });
+});
diff --git a/src/test/helpers/mockHttpClient.ts b/src/test/helpers/mockHttpClient.ts
index b843524..f640f2c 100644
--- a/src/test/helpers/mockHttpClient.ts
+++ b/src/test/helpers/mockHttpClient.ts
@@ -17,6 +17,7 @@ import type { KibanaClient } from "../../elastic/kibana-client/kibana-client.js"
 export interface MockHttpClient {
   get: Mock;
   post: Mock;
+  put: Mock;
   patch: Mock;
   delete: Mock;
   clusterName: string;
@@ -48,6 +49,7 @@ function makeMock(clusterName: string): MockHttpClient {
   return {
     get: vi.fn().mockResolvedValue({ data: undefined }),
     post: vi.fn().mockResolvedValue({ data: undefined }),
+    put: vi.fn().mockResolvedValue({ data: undefined }),
     patch: vi.fn().mockResolvedValue({ data: undefined }),
     delete: vi.fn().mockResolvedValue({ data: undefined }),
     clusterName,

From 16e4d062d9434c14cb544ff249f641dcc7edaeb7 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:17:08 +0200
Subject: [PATCH 19/42] feat: register migration tools (1 model-facing + 10
 app-only)

migrate-rules (model-facing):
  _meta.ui.resourceUri = ui://migrate-rules/mcp-app.html
  Callback seeds the workbench with a compact migration list so the LLM
  gets immediate context.

App-only tools (_meta.ui.visibility: ["app"]):
  list-migrations    GET  all migrations
  get-migration      GET  single migration by ID
  get-translated-rules  paginated translated rule listing (vendor-gated)
  start-translation  kick off AI translation (vendor-gated)
  stop-translation   halt in-progress translation (vendor-gated)
  update-translated-rule  patch elastic_rule / translation_result / comments (vendor-gated)
  get-resources      list macros/lookups (vendor-gated)
  upsert-resource    create/replace single macro or lookup (vendor-gated)
  install-rules      install translated rules, optional id filter (vendor-gated)
  get-stats          per-migration translation/installation stats

Vendor gate: SUPPORTED_VENDORS = ["splunk"]. If a vendor param is provided
and not in the list, returns { error: "vendorNotSupported", vendor } without
hitting Kibana. Re-enabling a vendor is a one-line change to the constant.

Also registers the migration workbench HTML via registerAppResource; the view
file is resolved at request time (resolveViewPath("migration")) so the tool
works once the view is built in a subsequent commit.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/tools/migration.ts | 353 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 353 insertions(+)
 create mode 100644 src/tools/migration.ts

diff --git a/src/tools/migration.ts b/src/tools/migration.ts
new file mode 100644
index 0000000..5502bd2
--- /dev/null
+++ b/src/tools/migration.ts
@@ -0,0 +1,353 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import {
+  registerAppTool,
+  registerAppResource,
+  RESOURCE_MIME_TYPE,
+} from "@modelcontextprotocol/ext-apps/server";
+import { z } from "zod";
+import fs from "fs";
+import type { MigrationsService } from "../elastic/service/index.js";
+import { resolveViewPath } from "./view-path.js";
+
+const RESOURCE_URI = "ui://migrate-rules/mcp-app.html";
+
+/**
+ * Vendors for which the Kibana SIEM migrations translator is production-ready.
+ * Re-enabling a vendor is a one-line change to this array once the translator
+ * matures — QRadar and Sentinel-One are the next candidates.
+ */
+const SUPPORTED_VENDORS: readonly string[] = ["splunk"];
+
+export interface MigrationToolDeps {
+  readonly migrationsService: MigrationsService;
+}
+
+/** Returns a vendor-gate error response for app-only tools. */
+function vendorNotSupportedResponse(vendor: string) {
+  return {
+    content: [
+      {
+        type: "text" as const,
+        text: JSON.stringify({ error: "vendorNotSupported", vendor }),
+      },
+    ],
+  };
+}
+
+/** Returns true when `vendor` is explicitly provided but not in SUPPORTED_VENDORS. */
+function isUnsupportedVendor(vendor: string | undefined): vendor is string {
+  return vendor !== undefined && !SUPPORTED_VENDORS.includes(vendor);
+}
+
+export function registerMigrationTools(
+  server: McpServer,
+  deps: MigrationToolDeps
+) {
+  const { migrationsService } = deps;
+
+  // ── Model-facing entry-point ───────────────────────────────────────────────
+
+  registerAppTool(
+    server,
+    "migrate-rules",
+    {
+      title: "Migrate Rules",
+      description:
+        "Migrate detection rules from Splunk (and other SIEMs) to Elastic Security. " +
+        "Opens an interactive migration workbench for uploading, translating, reviewing, " +
+        "and installing rules. Vendor support: Splunk (active), QRadar / Sentinel-One (coming soon).",
+      inputSchema: {},
+      _meta: { ui: { resourceUri: RESOURCE_URI } },
+    },
+    async () => {
+      const migrations = await migrationsService.listMigrations();
+      return {
+        content: [
+          {
+            type: "text" as const,
+            text: JSON.stringify({
+              message: "Opening SIEM migration workbench",
+              migrations: migrations.map(({ id, name, status }) => ({ id, name, status })),
+            }),
+          },
+        ],
+      };
+    }
+  );
+
+  // ── App-only tools ─────────────────────────────────────────────────────────
+
+  registerAppTool(
+    server,
+    "list-migrations",
+    {
+      title: "List Migrations",
+      description: "List all SIEM rule migrations.",
+      inputSchema: {},
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async () => {
+      const migrations = await migrationsService.listMigrations();
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(migrations) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "get-migration",
+    {
+      title: "Get Migration",
+      description: "Get details for a specific SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId }) => {
+      const migration = await migrationsService.getMigration(migrationId);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(migration) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "get-translated-rules",
+    {
+      title: "Get Translated Rules",
+      description: "Get translated rules for a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+        page: z.number().optional(),
+        perPage: z.number().optional(),
+        filter: z.string().optional(),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor, page, perPage, filter }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      const result = await migrationsService.getTranslatedRules(migrationId, {
+        page,
+        perPage,
+        filter,
+      });
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(result) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "start-translation",
+    {
+      title: "Start Translation",
+      description: "Start the AI translation process for a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      await migrationsService.startTranslation(migrationId);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify({ status: "started" }) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "stop-translation",
+    {
+      title: "Stop Translation",
+      description: "Stop the AI translation process for a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      await migrationsService.stopTranslation(migrationId);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify({ status: "stopped" }) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "update-translated-rule",
+    {
+      title: "Update Translated Rule",
+      description: "Update a translated rule in a SIEM migration (e.g. fix its Elastic rule JSON).",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        ruleId: z.string().describe("Translated rule ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+        elasticRule: z
+          .string()
+          .optional()
+          .describe("JSON-encoded Elastic rule updates"),
+        translationResult: z
+          .enum(["full", "partial", "untranslatable"])
+          .optional(),
+        comments: z.array(z.string()).optional(),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, ruleId, vendor, elasticRule, translationResult, comments }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      const updates: Record<string, unknown> = {};
+      if (elasticRule !== undefined)
+        updates.elastic_rule = JSON.parse(elasticRule) as Record<string, unknown>;
+      if (translationResult !== undefined) updates.translation_result = translationResult;
+      if (comments !== undefined) updates.comments = comments;
+      const result = await migrationsService.updateTranslatedRule(migrationId, ruleId, updates);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(result) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "get-resources",
+    {
+      title: "Get Resources",
+      description: "Get macro/lookup resources for a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      const resources = await migrationsService.getResources(migrationId);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(resources) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "upsert-resource",
+    {
+      title: "Upsert Resource",
+      description: "Create or update a macro/lookup resource in a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+        type: z.enum(["macro", "lookup"]).describe("Resource type"),
+        name: z.string().describe("Resource name"),
+        content: z.string().describe("Resource content"),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor, type, name, content }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      await migrationsService.upsertResources(migrationId, [{ type, name, content }]);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify({ status: "ok" }) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "install-rules",
+    {
+      title: "Install Rules",
+      description: "Install translated rules from a SIEM migration into Elastic Security.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+        vendor: z
+          .string()
+          .optional()
+          .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error."),
+        ids: z
+          .array(z.string())
+          .optional()
+          .describe("Specific rule IDs to install. Omit to install all installable rules."),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId, vendor, ids }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      const result = await migrationsService.installRules(migrationId, { ids });
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(result) }],
+      };
+    }
+  );
+
+  registerAppTool(
+    server,
+    "get-stats",
+    {
+      title: "Get Stats",
+      description: "Get translation and installation statistics for a SIEM migration.",
+      inputSchema: {
+        migrationId: z.string().describe("Migration ID"),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ migrationId }) => {
+      const stats = await migrationsService.getStats(migrationId);
+      return {
+        content: [{ type: "text" as const, text: JSON.stringify(stats) }],
+      };
+    }
+  );
+
+  // ── App resource (HTML workbench) ──────────────────────────────────────────
+
+  const viewPath = resolveViewPath("migration");
+  registerAppResource(
+    server,
+    RESOURCE_URI,
+    RESOURCE_URI,
+    { mimeType: RESOURCE_MIME_TYPE },
+    async () => {
+      const html = fs.readFileSync(viewPath, "utf-8");
+      return {
+        contents: [{ uri: RESOURCE_URI, mimeType: RESOURCE_MIME_TYPE, text: html }],
+      };
+    }
+  );
+}

From 6b9c8bce468a8097f48bf7cc5c12986ced10cbe1 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:21:47 +0200
Subject: [PATCH 20/42] test: add migration tool tests (tool registrations +
 vendor gating)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

20 tests covering:

  Registration: all 11 tools + HTML resource registered under the correct names

  migrate-rules: workbench message + compact migration list returned to LLM

  app-only tool happy paths:
    list-migrations, get-migration, get-translated-rules (with pagination),
    start-translation, stop-translation, update-translated-rule (parses
    elasticRule JSON), get-resources, upsert-resource (single-element array),
    install-rules (with ids), get-stats

  Vendor gating (per gated tool):
    - vendor="qradar" / "sentinel-one" / unknown → { error: "vendorNotSupported" }
      without calling the service
    - vendor absent → proceeds (defaults to Splunk path)

  get-stats has no vendor gate — confirmed by calling without vendor

Also adds createMockMigrationsService() to mockServices.ts covering all
14 MigrationsService methods.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/test/helpers/mockServices.ts |  20 ++
 src/tools/migration.test.ts      | 408 +++++++++++++++++++++++++++++++
 2 files changed, 428 insertions(+)
 create mode 100644 src/tools/migration.test.ts

diff --git a/src/test/helpers/mockServices.ts b/src/test/helpers/mockServices.ts
index bb77c48..819e95c 100644
--- a/src/test/helpers/mockServices.ts
+++ b/src/test/helpers/mockServices.ts
@@ -13,6 +13,7 @@ import type { EntityDetailService } from "../../elastic/service/entityDetailServ
 import type { EsqlService } from "../../elastic/service/esqlService.js";
 import type { IndicesService } from "../../elastic/service/indicesService.js";
 import type { InvestigateService } from "../../elastic/service/investigateService.js";
+import type { MigrationsService } from "../../elastic/service/migrationsService.js";
 import type { RulesService } from "../../elastic/service/rulesService.js";
 import type { SampleDataService } from "../../elastic/service/sampleDataService.js";
 
@@ -99,6 +100,25 @@ export function createMockRulesService(): RulesService {
   ]);
 }
 
+export function createMockMigrationsService(): MigrationsService {
+  return mockService<MigrationsService>([
+    "createMigration",
+    "listMigrations",
+    "getMigration",
+    "deleteMigration",
+    "uploadRules",
+    "getTranslatedRules",
+    "getTranslatedRule",
+    "updateTranslatedRule",
+    "startTranslation",
+    "stopTranslation",
+    "getResources",
+    "upsertResources",
+    "installRules",
+    "getStats",
+  ]);
+}
+
 export function createMockSampleDataService(): SampleDataService {
   return mockService<SampleDataService>([
     "generateSampleData",
diff --git a/src/tools/migration.test.ts b/src/tools/migration.test.ts
new file mode 100644
index 0000000..7193075
--- /dev/null
+++ b/src/tools/migration.test.ts
@@ -0,0 +1,408 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import fs from "fs";
+import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+
+import { registerMigrationTools } from "./migration.js";
+import {
+  createMockMcpServer,
+  parseToolText,
+  type MockMcpServer,
+} from "../test/helpers/mockMcpServer.js";
+import { createMockMigrationsService } from "../test/helpers/mockServices.js";
+import type { MigrationsService } from "../elastic/service/index.js";
+
+const RESOURCE_URI = "ui://migrate-rules/mcp-app.html";
+const MIGRATION_ID = "m-1";
+const RULE_ID = "r-1";
+
+function setup() {
+  const server = createMockMcpServer();
+  const migrationsService = createMockMigrationsService();
+  vi.spyOn(fs, "existsSync").mockReturnValue(false);
+  vi.spyOn(fs, "readFileSync").mockReturnValue("<html>migration</html>");
+  registerMigrationTools(server as unknown as McpServer, { migrationsService });
+  return { server, migrationsService };
+}
+
+describe("registerMigrationTools", () => {
+  let server: MockMcpServer;
+  let migrationsService: MigrationsService;
+
+  beforeEach(() => {
+    ({ server, migrationsService } = setup());
+  });
+
+  // ── Registration ───────────────────────────────────────────────────────────
+
+  it("registers all 11 tools and the HTML resource", () => {
+    expect([...server.tools.keys()].sort()).toEqual(
+      [
+        "migrate-rules",
+        "list-migrations",
+        "get-migration",
+        "get-translated-rules",
+        "start-translation",
+        "stop-translation",
+        "update-translated-rule",
+        "get-resources",
+        "upsert-resource",
+        "install-rules",
+        "get-stats",
+      ].sort()
+    );
+    expect([...server.resources.keys()]).toEqual([RESOURCE_URI]);
+  });
+
+  // ── migrate-rules (model-facing) ───────────────────────────────────────────
+
+  describe("migrate-rules", () => {
+    it("returns a compact migration list for the LLM to see", async () => {
+      vi.mocked(migrationsService.listMigrations).mockResolvedValueOnce([
+        {
+          id: MIGRATION_ID,
+          name: "Splunk prod",
+          status: "ready",
+          created_at: "2026-01-01T00:00:00Z",
+          last_updated_at: "2026-01-01T00:00:00Z",
+          rules: {
+            total: 10, pending: 5, processing: 0, completed: 5, failed: 0,
+            installable: 5, installed: 0, partially_translated: 0, untranslatable: 0,
+          },
+        },
+      ]);
+
+      const out = parseToolText<{ message: string; migrations: unknown[] }>(
+        await server.tool("migrate-rules").callback({})
+      );
+
+      expect(out.message).toContain("workbench");
+      expect(out.migrations).toHaveLength(1);
+      expect(out.migrations[0]).toMatchObject({ id: MIGRATION_ID, name: "Splunk prod" });
+    });
+  });
+
+  // ── list-migrations ────────────────────────────────────────────────────────
+
+  describe("list-migrations", () => {
+    it("delegates to migrationsService.listMigrations and returns the array", async () => {
+      vi.mocked(migrationsService.listMigrations).mockResolvedValueOnce([]);
+
+      const out = parseToolText<unknown[]>(
+        await server.tool("list-migrations").callback({})
+      );
+
+      expect(migrationsService.listMigrations).toHaveBeenCalledTimes(1);
+      expect(out).toEqual([]);
+    });
+  });
+
+  // ── get-migration ──────────────────────────────────────────────────────────
+
+  describe("get-migration", () => {
+    it("calls getMigration with the provided ID", async () => {
+      vi.mocked(migrationsService.getMigration).mockResolvedValueOnce({
+        id: MIGRATION_ID,
+        name: "test",
+        status: "ready",
+        created_at: "",
+        last_updated_at: "",
+        rules: {
+          total: 0, pending: 0, processing: 0, completed: 0, failed: 0,
+          installable: 0, installed: 0, partially_translated: 0, untranslatable: 0,
+        },
+      });
+
+      await server.tool("get-migration").callback({ migrationId: MIGRATION_ID });
+
+      expect(migrationsService.getMigration).toHaveBeenCalledWith(MIGRATION_ID);
+    });
+  });
+
+  // ── get-translated-rules ───────────────────────────────────────────────────
+
+  describe("get-translated-rules", () => {
+    it("forwards pagination params to getTranslatedRules", async () => {
+      vi.mocked(migrationsService.getTranslatedRules).mockResolvedValueOnce({
+        data: [],
+        total: 0,
+      });
+
+      await server.tool("get-translated-rules").callback({
+        migrationId: MIGRATION_ID,
+        vendor: "splunk",
+        page: 2,
+        perPage: 50,
+        filter: "status:completed",
+      });
+
+      expect(migrationsService.getTranslatedRules).toHaveBeenCalledWith(
+        MIGRATION_ID,
+        { page: 2, perPage: 50, filter: "status:completed" }
+      );
+    });
+
+    it("returns vendorNotSupported for a non-Splunk vendor", async () => {
+      const out = parseToolText<{ error: string; vendor: string }>(
+        await server.tool("get-translated-rules").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "qradar",
+        })
+      );
+
+      expect(out).toEqual({ error: "vendorNotSupported", vendor: "qradar" });
+      expect(migrationsService.getTranslatedRules).not.toHaveBeenCalled();
+    });
+  });
+
+  // ── start-translation ──────────────────────────────────────────────────────
+
+  describe("start-translation", () => {
+    it("calls startTranslation and returns { status: 'started' }", async () => {
+      vi.mocked(migrationsService.startTranslation).mockResolvedValueOnce(undefined);
+
+      const out = parseToolText<{ status: string }>(
+        await server.tool("start-translation").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "splunk",
+        })
+      );
+
+      expect(migrationsService.startTranslation).toHaveBeenCalledWith(MIGRATION_ID);
+      expect(out.status).toBe("started");
+    });
+
+    it("returns vendorNotSupported for sentinel-one", async () => {
+      const out = parseToolText<{ error: string; vendor: string }>(
+        await server.tool("start-translation").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "sentinel-one",
+        })
+      );
+
+      expect(out).toEqual({ error: "vendorNotSupported", vendor: "sentinel-one" });
+      expect(migrationsService.startTranslation).not.toHaveBeenCalled();
+    });
+  });
+
+  // ── stop-translation ───────────────────────────────────────────────────────
+
+  describe("stop-translation", () => {
+    it("calls stopTranslation and returns { status: 'stopped' }", async () => {
+      vi.mocked(migrationsService.stopTranslation).mockResolvedValueOnce(undefined);
+
+      const out = parseToolText<{ status: string }>(
+        await server.tool("stop-translation").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "splunk",
+        })
+      );
+
+      expect(migrationsService.stopTranslation).toHaveBeenCalledWith(MIGRATION_ID);
+      expect(out.status).toBe("stopped");
+    });
+
+    it("returns vendorNotSupported for an unknown vendor", async () => {
+      const out = parseToolText<{ error: string }>(
+        await server.tool("stop-translation").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "unknown-siem",
+        })
+      );
+
+      expect(out.error).toBe("vendorNotSupported");
+      expect(migrationsService.stopTranslation).not.toHaveBeenCalled();
+    });
+  });
+
+  // ── update-translated-rule ─────────────────────────────────────────────────
+
+  describe("update-translated-rule", () => {
+    it("parses elasticRule JSON and passes updates to service", async () => {
+      vi.mocked(migrationsService.updateTranslatedRule).mockResolvedValueOnce({
+        id: RULE_ID,
+        migration_id: MIGRATION_ID,
+        status: "completed",
+        translation_result: "partial",
+        original_rule: {},
+      });
+      const elasticRule = { name: "Fixed rule", type: "query" };
+
+      await server.tool("update-translated-rule").callback({
+        migrationId: MIGRATION_ID,
+        ruleId: RULE_ID,
+        vendor: "splunk",
+        elasticRule: JSON.stringify(elasticRule),
+        translationResult: "partial",
+      });
+
+      expect(migrationsService.updateTranslatedRule).toHaveBeenCalledWith(
+        MIGRATION_ID,
+        RULE_ID,
+        expect.objectContaining({
+          elastic_rule: elasticRule,
+          translation_result: "partial",
+        })
+      );
+    });
+
+    it("returns vendorNotSupported without calling service", async () => {
+      const out = parseToolText<{ error: string }>(
+        await server.tool("update-translated-rule").callback({
+          migrationId: MIGRATION_ID,
+          ruleId: RULE_ID,
+          vendor: "qradar",
+        })
+      );
+
+      expect(out.error).toBe("vendorNotSupported");
+      expect(migrationsService.updateTranslatedRule).not.toHaveBeenCalled();
+    });
+  });
+
+  // ── get-resources ──────────────────────────────────────────────────────────
+
+  describe("get-resources", () => {
+    it("calls getResources with migrationId", async () => {
+      vi.mocked(migrationsService.getResources).mockResolvedValueOnce([
+        { type: "macro", name: "my_macro", content: "| where true" },
+      ]);
+
+      const out = parseToolText<unknown[]>(
+        await server.tool("get-resources").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "splunk",
+        })
+      );
+
+      expect(migrationsService.getResources).toHaveBeenCalledWith(MIGRATION_ID);
+      expect(out).toHaveLength(1);
+    });
+
+    it("returns vendorNotSupported for non-Splunk", async () => {
+      const out = parseToolText<{ error: string }>(
+        await server.tool("get-resources").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "qradar",
+        })
+      );
+
+      expect(out.error).toBe("vendorNotSupported");
+    });
+  });
+
+  // ── upsert-resource ────────────────────────────────────────────────────────
+
+  describe("upsert-resource", () => {
+    it("calls upsertResources with a single-element array", async () => {
+      vi.mocked(migrationsService.upsertResources).mockResolvedValueOnce(undefined);
+
+      await server.tool("upsert-resource").callback({
+        migrationId: MIGRATION_ID,
+        vendor: "splunk",
+        type: "macro",
+        name: "splunk_macro",
+        content: "| eval x=1",
+      });
+
+      expect(migrationsService.upsertResources).toHaveBeenCalledWith(
+        MIGRATION_ID,
+        [{ type: "macro", name: "splunk_macro", content: "| eval x=1" }]
+      );
+    });
+
+    it("returns vendorNotSupported for non-Splunk", async () => {
+      const out = parseToolText<{ error: string }>(
+        await server.tool("upsert-resource").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "sentinel-one",
+          type: "macro",
+          name: "m",
+          content: "",
+        })
+      );
+
+      expect(out.error).toBe("vendorNotSupported");
+      expect(migrationsService.upsertResources).not.toHaveBeenCalled();
+    });
+  });
+
+  // ── install-rules ──────────────────────────────────────────────────────────
+
+  describe("install-rules", () => {
+    it("passes ids array to installRules", async () => {
+      vi.mocked(migrationsService.installRules).mockResolvedValueOnce({
+        installed: 2,
+        failed: 0,
+      });
+
+      const out = parseToolText<{ installed: number; failed: number }>(
+        await server.tool("install-rules").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "splunk",
+          ids: ["r-1", "r-2"],
+        })
+      );
+
+      expect(migrationsService.installRules).toHaveBeenCalledWith(
+        MIGRATION_ID,
+        { ids: ["r-1", "r-2"] }
+      );
+      expect(out).toEqual({ installed: 2, failed: 0 });
+    });
+
+    it("returns vendorNotSupported for non-Splunk", async () => {
+      const out = parseToolText<{ error: string }>(
+        await server.tool("install-rules").callback({
+          migrationId: MIGRATION_ID,
+          vendor: "qradar",
+        })
+      );
+
+      expect(out.error).toBe("vendorNotSupported");
+      expect(migrationsService.installRules).not.toHaveBeenCalled();
+    });
+  });
+
+  // ── get-stats ──────────────────────────────────────────────────────────────
+
+  describe("get-stats", () => {
+    it("calls getStats and returns the result (no vendor gate)", async () => {
+      const stats = {
+        id: MIGRATION_ID,
+        status: "ready" as const,
+        rules: {
+          total: 5, pending: 5, processing: 0, completed: 0, failed: 0,
+          installable: 0, installed: 0, partially_translated: 0, untranslatable: 0,
+        },
+      };
+      vi.mocked(migrationsService.getStats).mockResolvedValueOnce(stats);
+
+      const out = parseToolText<typeof stats>(
+        await server.tool("get-stats").callback({ migrationId: MIGRATION_ID })
+      );
+
+      expect(migrationsService.getStats).toHaveBeenCalledWith(MIGRATION_ID);
+      expect(out).toEqual(stats);
+    });
+  });
+
+  // ── Vendor gate: undefined vendor is allowed ───────────────────────────────
+
+  it("proceeds when vendor parameter is absent (defaults to Splunk path)", async () => {
+    vi.mocked(migrationsService.startTranslation).mockResolvedValueOnce(undefined);
+
+    const out = parseToolText<{ status: string }>(
+      await server.tool("start-translation").callback({ migrationId: MIGRATION_ID })
+    );
+
+    expect(out.status).toBe("started");
+    expect(migrationsService.startTranslation).toHaveBeenCalled();
+  });
+});

From 1c3177978c7f43c42581f71146e184b25373aff2 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:28:10 +0200
Subject: [PATCH 21/42] feat: add migration workbench view with WorkbenchState
 machine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

src/views/migration/App.tsx — full state machine:

WorkbenchState discriminated union (8 stages):
  vendor-select  → user picks vendor → creates migration
  upload         → paste Splunk rules JSON → upload + start translation
  translating    → polls get-stats every 3s → advances on completion
  review         → lists translated rules with status badges + fix actions
  fix-rule-drawer  → slide-over editor for single rule JSON + result enum
  fix-resources-drawer → slide-over for macro/lookup create/update
  install        → confirmation step before calling install-rules
  done           → success summary with installed/failed counts

Vendor gate (5-LOC client check):
  SUPPORTED_VENDORS = ["splunk"]
  VENDOR_CATALOGUE entries not in SUPPORTED_VENDORS render as disabled
  with "Coming soon" badge — re-enabling a vendor is a one-line change.

MCP integration:
  All data via app.callServerTool() through the 10 app-only tools.
  translating stage schedules a 3-second poll loop that stops and
  transitions to review when stats.rules.processing === 0.

Supporting files:
  mcp-app.html — minimal HTML shell (title: "SIEM Migration")
  mcp-app.tsx  — standard React 18 createRoot mount
  styles.css   — vendor-grid, upload-area, progress-bar, rule status
                 badges, drawer layout

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/views/migration/App.tsx      | 873 +++++++++++++++++++++++++++++++
 src/views/migration/mcp-app.html |  12 +
 src/views/migration/mcp-app.tsx  |  12 +
 src/views/migration/styles.css   | 165 ++++++
 4 files changed, 1062 insertions(+)
 create mode 100644 src/views/migration/App.tsx
 create mode 100644 src/views/migration/mcp-app.html
 create mode 100644 src/views/migration/mcp-app.tsx
 create mode 100644 src/views/migration/styles.css

diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
new file mode 100644
index 0000000..8add033
--- /dev/null
+++ b/src/views/migration/App.tsx
@@ -0,0 +1,873 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import React, { useState, useCallback, useEffect, useRef } from "react";
+import type { App as McpApp } from "@modelcontextprotocol/ext-apps";
+import { extractCallResult } from "../../shared/extract-tool-text";
+import {
+  AppHeader,
+  AppShell,
+  BackButton,
+  EmptyState,
+  KpiStrip,
+  KpiTile,
+  LoadingState,
+} from "../../shared/components";
+import { useFullscreen } from "../../shared/hooks/useFullscreen";
+import { useMcpApp } from "../../shared/hooks/useMcpApp";
+import "./styles.css";
+
+// ---------------------------------------------------------------------------
+// Local domain types (shapes returned by the app-only migration tools)
+// ---------------------------------------------------------------------------
+
+interface MigrationStats {
+  id: string;
+  status: string;
+  rules: {
+    total: number;
+    pending: number;
+    processing: number;
+    completed: number;
+    failed: number;
+    installable: number;
+    installed: number;
+    partially_translated: number;
+    untranslatable: number;
+  };
+}
+
+interface TranslatedRule {
+  id: string;
+  status: string;
+  translation_result?: "full" | "partial" | "untranslatable";
+  original_rule: Record<string, unknown>;
+  elastic_rule?: Record<string, unknown>;
+  comments?: string[];
+}
+
+interface MigrationResource {
+  type: "macro" | "lookup";
+  name: string;
+  content: string;
+}
+
+interface InstallResult {
+  installed: number;
+  failed: number;
+}
+
+// ---------------------------------------------------------------------------
+// WorkbenchState discriminated union
+//
+// Each stage carries exactly the data it needs and no more. Transitions
+// always move forward through the pipeline — no implicit shared state.
+// ---------------------------------------------------------------------------
+
+export type WorkbenchState =
+  | {
+      stage: "vendor-select";
+    }
+  | {
+      stage: "upload";
+      vendor: string;
+      migrationId: string;
+    }
+  | {
+      stage: "translating";
+      vendor: string;
+      migrationId: string;
+      stats: MigrationStats | null;
+    }
+  | {
+      stage: "review";
+      vendor: string;
+      migrationId: string;
+      translations: TranslatedRule[];
+      resources: MigrationResource[];
+    }
+  | {
+      stage: "fix-rule-drawer";
+      vendor: string;
+      migrationId: string;
+      translations: TranslatedRule[];
+      resources: MigrationResource[];
+      selectedRule: TranslatedRule;
+    }
+  | {
+      stage: "fix-resources-drawer";
+      vendor: string;
+      migrationId: string;
+      translations: TranslatedRule[];
+      resources: MigrationResource[];
+    }
+  | {
+      stage: "install";
+      vendor: string;
+      migrationId: string;
+      translations: TranslatedRule[];
+    }
+  | {
+      stage: "done";
+      installed: number;
+      failed: number;
+    };
+
+// ---------------------------------------------------------------------------
+// Vendor catalogue — re-enabling a vendor is a one-line change here
+// ---------------------------------------------------------------------------
+
+const SUPPORTED_VENDORS: readonly string[] = ["splunk"];
+
+const VENDOR_CATALOGUE = [
+  { id: "splunk", label: "Splunk" },
+  { id: "qradar", label: "IBM QRadar" },
+  { id: "sentinel-one", label: "Sentinel One" },
+] as const;
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+async function callTool<T = unknown>(
+  app: McpApp,
+  name: string,
+  args: Record<string, unknown>
+): Promise<T | null> {
+  try {
+    const result = await app.callServerTool({ name, arguments: args });
+    const text = extractCallResult(result);
+    if (!text) return null;
+    return JSON.parse(text) as T;
+  } catch (e) {
+    console.error(`[migration] ${name} failed:`, e);
+    return null;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// App
+// ---------------------------------------------------------------------------
+
+export function App() {
+  const [state, setState] = useState<WorkbenchState>({ stage: "vendor-select" });
+  const [loading, setLoading] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+
+  // For the translating stage: poll stats until translation completes
+  const pollTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+
+  const clearPoll = useCallback(() => {
+    if (pollTimerRef.current !== null) {
+      clearTimeout(pollTimerRef.current);
+      pollTimerRef.current = null;
+    }
+  }, []);
+
+  useEffect(() => () => clearPoll(), [clearPoll]);
+
+  const { connected, getApp } = useMcpApp({
+    name: "migration",
+    version: "1.0.0",
+    onConnect: (_app, _gotResult) => {
+      // No initial data load needed — the workbench starts at vendor-select.
+    },
+  });
+
+  const fullscreen = useFullscreen(getApp);
+
+  // ── Stage transitions ──────────────────────────────────────────────────────
+
+  const selectVendor = useCallback(
+    async (vendor: string) => {
+      const app = getApp();
+      if (!app) return;
+      setLoading(true);
+      setError(null);
+      try {
+        const res = await callTool<{ migration_id: string }>(app, "create-migration", {
+          name: `Migration ${new Date().toISOString().slice(0, 10)}`,
+        });
+        if (!res?.migration_id) throw new Error("Failed to create migration");
+        setState({ stage: "upload", vendor, migrationId: res.migration_id });
+      } catch (e) {
+        setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp]
+  );
+
+  const uploadRules = useCallback(
+    async (rulesJson: string) => {
+      const app = getApp();
+      if (!app || state.stage !== "upload") return;
+      const { vendor, migrationId } = state;
+      setLoading(true);
+      setError(null);
+      try {
+        const rules = JSON.parse(rulesJson) as Record<string, unknown>[];
+        await callTool(app, "upload-rules", { migrationId, vendor, rules });
+        await callTool(app, "start-translation", { migrationId, vendor });
+        const stats = await callTool<MigrationStats>(app, "get-stats", { migrationId });
+        setState({ stage: "translating", vendor, migrationId, stats: stats ?? null });
+        schedulePoll(app, vendor, migrationId);
+      } catch (e) {
+        setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp, state]
+  );
+
+  const schedulePoll = useCallback(
+    (app: McpApp, vendor: string, migrationId: string) => {
+      clearPoll();
+      pollTimerRef.current = setTimeout(async () => {
+        const stats = await callTool<MigrationStats>(app, "get-stats", { migrationId });
+        setState((prev) => {
+          if (prev.stage !== "translating") return prev;
+          return { ...prev, stats: stats ?? prev.stats };
+        });
+        if (stats && stats.rules.processing === 0 && stats.status !== "running") {
+          // Translation finished — load translated rules and resources, move to review
+          void (async () => {
+            const translationsRes = await callTool<{
+              data: TranslatedRule[];
+            }>(app, "get-translated-rules", { migrationId, vendor, perPage: 500 });
+            const resources =
+              (await callTool<MigrationResource[]>(app, "get-resources", {
+                migrationId,
+                vendor,
+              })) ?? [];
+            setState({
+              stage: "review",
+              vendor,
+              migrationId,
+              translations: translationsRes?.data ?? [],
+              resources,
+            });
+          })();
+        } else {
+          schedulePoll(app, vendor, migrationId);
+        }
+      }, 3000);
+    },
+    [clearPoll]
+  );
+
+  const openRuleDrawer = useCallback((rule: TranslatedRule) => {
+    setState((prev) => {
+      if (prev.stage !== "review") return prev;
+      return { ...prev, stage: "fix-rule-drawer", selectedRule: rule };
+    });
+  }, []);
+
+  const saveRuleFix = useCallback(
+    async (elasticRuleJson: string, translationResult: "full" | "partial" | "untranslatable") => {
+      const app = getApp();
+      if (!app || state.stage !== "fix-rule-drawer") return;
+      const { vendor, migrationId, translations, resources, selectedRule } = state;
+      setLoading(true);
+      setError(null);
+      try {
+        const updated = await callTool<TranslatedRule>(
+          app,
+          "update-translated-rule",
+          { migrationId, ruleId: selectedRule.id, vendor, elasticRule: elasticRuleJson, translationResult }
+        );
+        setState({
+          stage: "review",
+          vendor,
+          migrationId,
+          resources,
+          translations: translations.map((t) =>
+            t.id === selectedRule.id ? (updated ?? t) : t
+          ),
+        });
+      } catch (e) {
+        setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp, state]
+  );
+
+  const openResourcesDrawer = useCallback(() => {
+    setState((prev) => {
+      if (prev.stage !== "review") return prev;
+      return { ...prev, stage: "fix-resources-drawer" };
+    });
+  }, []);
+
+  const saveResources = useCallback(
+    async (resource: MigrationResource) => {
+      const app = getApp();
+      if (!app || state.stage !== "fix-resources-drawer") return;
+      const { vendor, migrationId, translations } = state;
+      setLoading(true);
+      setError(null);
+      try {
+        await callTool(app, "upsert-resource", { migrationId, vendor, ...resource });
+        const resources =
+          (await callTool<MigrationResource[]>(app, "get-resources", { migrationId, vendor })) ?? [];
+        setState({ stage: "review", vendor, migrationId, translations, resources });
+      } catch (e) {
+        setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp, state]
+  );
+
+  const closeDrawer = useCallback(() => {
+    setState((prev) => {
+      if (prev.stage === "fix-rule-drawer" || prev.stage === "fix-resources-drawer") {
+        const { stage: _stage, ...rest } = prev as WorkbenchState & {
+          stage: "fix-rule-drawer" | "fix-resources-drawer";
+        };
+        void _stage;
+        return { ...(rest as { vendor: string; migrationId: string; translations: TranslatedRule[]; resources: MigrationResource[] }), stage: "review" };
+      }
+      return prev;
+    });
+  }, []);
+
+  const startInstall = useCallback(() => {
+    setState((prev) => {
+      if (prev.stage !== "review") return prev;
+      return { stage: "install", vendor: prev.vendor, migrationId: prev.migrationId, translations: prev.translations };
+    });
+  }, []);
+
+  const confirmInstall = useCallback(async () => {
+    const app = getApp();
+    if (!app || state.stage !== "install") return;
+    const { vendor, migrationId } = state;
+    setLoading(true);
+    setError(null);
+    try {
+      const result = await callTool<InstallResult>(app, "install-rules", { migrationId, vendor });
+      setState({ stage: "done", installed: result?.installed ?? 0, failed: result?.failed ?? 0 });
+    } catch (e) {
+      setError(e instanceof Error ? e.message : String(e));
+    } finally {
+      setLoading(false);
+    }
+  }, [getApp, state]);
+
+  const reset = useCallback(() => {
+    clearPoll();
+    setState({ stage: "vendor-select" });
+    setError(null);
+  }, [clearPoll]);
+
+  // ── Render ─────────────────────────────────────────────────────────────────
+
+  // AppHeader expects { isFullscreen, onToggle } — useFullscreen returns { isFullscreen, toggle }
+  const fullscreenProp = { isFullscreen: fullscreen.isFullscreen, onToggle: fullscreen.toggle };
+
+  if (!connected) {
+    return (
+      <AppShell>
+        <AppHeader title="SIEM Migration" fullscreen={fullscreenProp} />
+        <LoadingState>Connecting to Elastic Security…</LoadingState>
+      </AppShell>
+    );
+  }
+
+  return (
+    <AppShell>
+      <AppHeader
+        title="SIEM Migration"
+        fullscreen={fullscreenProp}
+        actions={
+          state.stage !== "vendor-select" && state.stage !== "done" ? (
+            <BackButton onClick={reset} label="Start over" />
+          ) : undefined
+        }
+      />
+
+      {error && (
+        <div className="p-3 m-4 rounded bg-red-50 border border-red-200 text-red-700 text-sm">
+          {error}
+          <button className="ml-2 underline" onClick={() => setError(null)}>
+            Dismiss
+          </button>
+        </div>
+      )}
+
+      {loading && <LoadingState>Working…</LoadingState>}
+
+      {!loading && renderStage(state, {
+        selectVendor,
+        uploadRules,
+        openRuleDrawer,
+        saveRuleFix,
+        openResourcesDrawer,
+        saveResources,
+        closeDrawer,
+        startInstall,
+        confirmInstall,
+        reset,
+      })}
+    </AppShell>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Per-stage renderers (extracted to keep App() readable)
+// ---------------------------------------------------------------------------
+
+interface StageHandlers {
+  selectVendor: (vendor: string) => void;
+  uploadRules: (json: string) => void;
+  openRuleDrawer: (rule: TranslatedRule) => void;
+  saveRuleFix: (json: string, result: "full" | "partial" | "untranslatable") => void;
+  openResourcesDrawer: () => void;
+  saveResources: (resource: MigrationResource) => void;
+  closeDrawer: () => void;
+  startInstall: () => void;
+  confirmInstall: () => void;
+  reset: () => void;
+}
+
+function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode {
+  switch (state.stage) {
+    case "vendor-select":
+      return <VendorSelect onSelect={h.selectVendor} />;
+
+    case "upload":
+      return <Upload vendor={state.vendor} onUpload={h.uploadRules} />;
+
+    case "translating":
+      return <Translating stats={state.stats} />;
+
+    case "review":
+      return (
+        <Review
+          translations={state.translations}
+          resources={state.resources}
+          onOpenRule={h.openRuleDrawer}
+          onOpenResources={h.openResourcesDrawer}
+          onInstall={h.startInstall}
+        />
+      );
+
+    case "fix-rule-drawer":
+      return (
+        <>
+          <Review
+            translations={state.translations}
+            resources={state.resources}
+            onOpenRule={h.openRuleDrawer}
+            onOpenResources={h.openResourcesDrawer}
+            onInstall={h.startInstall}
+            dimmed
+          />
+          <RuleDrawer rule={state.selectedRule} onSave={h.saveRuleFix} onClose={h.closeDrawer} />
+        </>
+      );
+
+    case "fix-resources-drawer":
+      return (
+        <>
+          <Review
+            translations={state.translations}
+            resources={state.resources}
+            onOpenRule={h.openRuleDrawer}
+            onOpenResources={h.openResourcesDrawer}
+            onInstall={h.startInstall}
+            dimmed
+          />
+          <ResourcesDrawer resources={state.resources} onSave={h.saveResources} onClose={h.closeDrawer} />
+        </>
+      );
+
+    case "install":
+      return (
+        <Install
+          count={state.translations.filter((t) => t.translation_result !== "untranslatable").length}
+          onConfirm={h.confirmInstall}
+          onBack={h.closeDrawer}
+        />
+      );
+
+    case "done":
+      return <Done installed={state.installed} failed={state.failed} onReset={h.reset} />;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Stage components
+// ---------------------------------------------------------------------------
+
+function VendorSelect({ onSelect }: { onSelect: (vendor: string) => void }) {
+  return (
+    <div className="p-6 max-w-2xl mx-auto">
+      <h2 className="text-lg font-semibold mb-1">Select your source SIEM</h2>
+      <p className="text-sm text-gray-500 mb-4">
+        Choose the platform you are migrating detection rules from.
+      </p>
+      <div className="migration-vendor-grid">
+        {VENDOR_CATALOGUE.map(({ id, label }) => {
+          const supported = SUPPORTED_VENDORS.includes(id);
+          return (
+            <button
+              key={id}
+              className={`migration-vendor-card${supported ? "" : " migration-vendor-card--disabled"}`}
+              disabled={!supported}
+              onClick={() => supported && onSelect(id)}
+            >
+              <span className="migration-vendor-label">{label}</span>
+              {!supported && <span className="migration-vendor-badge">Coming soon</span>}
+            </button>
+          );
+        })}
+      </div>
+    </div>
+  );
+}
+
+function Upload({ vendor, onUpload }: { vendor: string; onUpload: (json: string) => void }) {
+  const [text, setText] = useState("");
+  return (
+    <div className="p-6 max-w-2xl mx-auto">
+      <h2 className="text-lg font-semibold mb-1">Upload {vendor} rules</h2>
+      <p className="text-sm text-gray-500 mb-4">
+        Paste your exported {vendor} rules as a JSON array, then start translation.
+      </p>
+      <div className="migration-upload-area">
+        <textarea
+          className="w-full h-40 p-2 text-xs font-mono border border-gray-200 rounded resize-y"
+          placeholder={`[\n  { "search": "index=main sourcetype=syslog..." },\n  ...\n]`}
+          value={text}
+          onChange={(e) => setText(e.target.value)}
+        />
+      </div>
+      <button
+        className="mt-2 px-4 py-2 bg-blue-600 text-white rounded text-sm font-medium disabled:opacity-50"
+        disabled={!text.trim()}
+        onClick={() => onUpload(text)}
+      >
+        Upload &amp; start translation
+      </button>
+    </div>
+  );
+}
+
+function Translating({ stats }: { stats: MigrationStats | null }) {
+  const rules = stats?.rules;
+  const pct = rules && rules.total > 0 ? Math.round(((rules.total - rules.pending) / rules.total) * 100) : 0;
+  return (
+    <div className="p-6 max-w-xl mx-auto">
+      <h2 className="text-lg font-semibold mb-1">Translating rules…</h2>
+      <p className="text-sm text-gray-500 mb-6">
+        The AI translator is converting your rules to Elastic detection rule format. This may take a few minutes.
+      </p>
+      {rules && (
+        <>
+          <KpiStrip tileCount={4}>
+            <KpiTile label="Total" value={rules.total} />
+            <KpiTile label="Translated" value={rules.completed} />
+            <KpiTile label="Pending" value={rules.pending} />
+            <KpiTile label="Failed" value={rules.failed} />
+          </KpiStrip>
+          <div className="migration-progress-bar-track mt-4">
+            <div className="migration-progress-bar-fill" style={{ width: `${pct}%` }} />
+          </div>
+          <p className="text-xs text-gray-400 mt-1">{pct}% complete</p>
+        </>
+      )}
+      {!rules && <LoadingState>Waiting for translation to start…</LoadingState>}
+    </div>
+  );
+}
+
+function Review({
+  translations,
+  resources,
+  onOpenRule,
+  onOpenResources,
+  onInstall,
+  dimmed,
+}: {
+  translations: TranslatedRule[];
+  resources: MigrationResource[];
+  onOpenRule: (rule: TranslatedRule) => void;
+  onOpenResources: () => void;
+  onInstall: () => void;
+  dimmed?: boolean;
+}) {
+  const installable = translations.filter(
+    (t) => t.translation_result && t.translation_result !== "untranslatable"
+  ).length;
+  const needsFix = translations.filter((t) => t.translation_result === "partial").length;
+
+  return (
+    <div className={`p-6${dimmed ? " opacity-50 pointer-events-none" : ""}`}>
+      <div className="flex items-center justify-between mb-4">
+        <h2 className="text-lg font-semibold">Review translated rules</h2>
+        <div className="flex gap-2">
+          {resources.length > 0 && (
+            <button
+              className="px-3 py-1.5 text-sm border border-gray-300 rounded"
+              onClick={onOpenResources}
+            >
+              Fix resources ({resources.length})
+            </button>
+          )}
+          <button
+            className="px-3 py-1.5 text-sm bg-blue-600 text-white rounded disabled:opacity-50"
+            disabled={installable === 0}
+            onClick={onInstall}
+          >
+            Install {installable} rules
+          </button>
+        </div>
+      </div>
+
+      {needsFix > 0 && (
+        <div className="mb-4 p-3 bg-yellow-50 border border-yellow-200 rounded text-sm text-yellow-800">
+          {needsFix} rule{needsFix !== 1 ? "s" : ""} need manual review before installation.
+        </div>
+      )}
+
+      {translations.length === 0 ? (
+        <EmptyState>No translated rules found.</EmptyState>
+      ) : (
+        <div className="space-y-2">
+          {translations.map((rule) => (
+            <RuleRow key={rule.id} rule={rule} onFix={() => onOpenRule(rule)} />
+          ))}
+        </div>
+      )}
+    </div>
+  );
+}
+
+function RuleRow({ rule, onFix }: { rule: TranslatedRule; onFix: () => void }) {
+  const name =
+    (rule.elastic_rule?.name as string | undefined) ??
+    (rule.original_rule?.title as string | undefined) ??
+    rule.id;
+  return (
+    <div className="flex items-center justify-between p-3 border border-gray-200 rounded">
+      <div className="flex items-center gap-3 min-w-0">
+        <TranslationBadge result={rule.translation_result} />
+        <span className="text-sm truncate">{name}</span>
+      </div>
+      {(rule.translation_result === "partial" || !rule.elastic_rule) && (
+        <button className="text-xs text-blue-600 underline shrink-0" onClick={onFix}>
+          Fix
+        </button>
+      )}
+    </div>
+  );
+}
+
+function TranslationBadge({ result }: { result?: string }) {
+  const cls = `migration-rule-status-badge migration-rule-status-badge--${result ?? "pending"}`;
+  const label = result ?? "pending";
+  return <span className={cls}>{label}</span>;
+}
+
+function RuleDrawer({
+  rule,
+  onSave,
+  onClose,
+}: {
+  rule: TranslatedRule;
+  onSave: (json: string, result: "full" | "partial" | "untranslatable") => void;
+  onClose: () => void;
+}) {
+  const [json, setJson] = useState(() =>
+    JSON.stringify(rule.elastic_rule ?? {}, null, 2)
+  );
+  const [result, setResult] = useState<"full" | "partial" | "untranslatable">(
+    rule.translation_result ?? "partial"
+  );
+
+  return (
+    <div className="migration-drawer">
+      <div className="migration-drawer-header">
+        <h3 className="font-semibold text-sm">Fix translated rule</h3>
+        <button className="text-gray-400 hover:text-gray-700" onClick={onClose}>
+          ✕
+        </button>
+      </div>
+      <div className="migration-drawer-body">
+        <p className="text-xs text-gray-500 mb-2">
+          Edit the Elastic rule JSON and select the translation quality.
+        </p>
+        <textarea
+          className="migration-rule-json-editor"
+          value={json}
+          onChange={(e) => setJson(e.target.value)}
+        />
+        <div className="mt-3">
+          <label className="block text-xs font-medium mb-1">Translation result</label>
+          <select
+            className="w-full text-sm border border-gray-200 rounded p-1.5"
+            value={result}
+            onChange={(e) => setResult(e.target.value as typeof result)}
+          >
+            <option value="full">Full — rule is production-ready</option>
+            <option value="partial">Partial — rule needs tuning</option>
+            <option value="untranslatable">Untranslatable — skip this rule</option>
+          </select>
+        </div>
+      </div>
+      <div className="migration-drawer-footer">
+        <button className="text-sm text-gray-500" onClick={onClose}>
+          Cancel
+        </button>
+        <button
+          className="text-sm px-3 py-1.5 bg-blue-600 text-white rounded"
+          onClick={() => onSave(json, result)}
+        >
+          Save
+        </button>
+      </div>
+    </div>
+  );
+}
+
+function ResourcesDrawer({
+  resources,
+  onSave,
+  onClose,
+}: {
+  resources: MigrationResource[];
+  onSave: (resource: MigrationResource) => void;
+  onClose: () => void;
+}) {
+  const [name, setName] = useState("");
+  const [type, setType] = useState<"macro" | "lookup">("macro");
+  const [content, setContent] = useState("");
+
+  return (
+    <div className="migration-drawer">
+      <div className="migration-drawer-header">
+        <h3 className="font-semibold text-sm">Manage resources</h3>
+        <button className="text-gray-400 hover:text-gray-700" onClick={onClose}>
+          ✕
+        </button>
+      </div>
+      <div className="migration-drawer-body">
+        {resources.length > 0 && (
+          <div className="mb-4">
+            <p className="text-xs font-medium mb-2 text-gray-600">Existing resources</p>
+            {resources.map((r) => (
+              <div key={`${r.type}:${r.name}`} className="migration-resource-row">
+                <span className="text-xs font-mono bg-gray-100 px-1 rounded">{r.type}</span>
+                <span className="text-sm">{r.name}</span>
+              </div>
+            ))}
+          </div>
+        )}
+        <p className="text-xs font-medium mb-2 text-gray-600">Add / update resource</p>
+        <div className="space-y-2">
+          <select
+            className="w-full text-sm border border-gray-200 rounded p-1.5"
+            value={type}
+            onChange={(e) => setType(e.target.value as typeof type)}
+          >
+            <option value="macro">Macro</option>
+            <option value="lookup">Lookup</option>
+          </select>
+          <input
+            className="w-full text-sm border border-gray-200 rounded p-1.5"
+            placeholder="Resource name"
+            value={name}
+            onChange={(e) => setName(e.target.value)}
+          />
+          <textarea
+            className="migration-rule-json-editor h-24"
+            placeholder="Resource content / definition"
+            value={content}
+            onChange={(e) => setContent(e.target.value)}
+          />
+        </div>
+      </div>
+      <div className="migration-drawer-footer">
+        <button className="text-sm text-gray-500" onClick={onClose}>
+          Close
+        </button>
+        <button
+          className="text-sm px-3 py-1.5 bg-blue-600 text-white rounded disabled:opacity-50"
+          disabled={!name.trim()}
+          onClick={() => onSave({ type, name: name.trim(), content })}
+        >
+          Save resource
+        </button>
+      </div>
+    </div>
+  );
+}
+
+function Install({
+  count,
+  onConfirm,
+  onBack,
+}: {
+  count: number;
+  onConfirm: () => void;
+  onBack: () => void;
+}) {
+  return (
+    <div className="p-6 max-w-xl mx-auto text-center">
+      <h2 className="text-lg font-semibold mb-2">Install {count} rules</h2>
+      <p className="text-sm text-gray-500 mb-6">
+        The translated rules will be installed as disabled detection rules in Elastic Security.
+        You can enable them after reviewing their configuration.
+      </p>
+      <div className="flex gap-3 justify-center">
+        <button className="px-4 py-2 text-sm border border-gray-300 rounded" onClick={onBack}>
+          Back to review
+        </button>
+        <button
+          className="px-4 py-2 text-sm bg-blue-600 text-white rounded"
+          onClick={onConfirm}
+        >
+          Confirm install
+        </button>
+      </div>
+    </div>
+  );
+}
+
+function Done({
+  installed,
+  failed,
+  onReset,
+}: {
+  installed: number;
+  failed: number;
+  onReset: () => void;
+}) {
+  return (
+    <div className="p-6 max-w-xl mx-auto text-center">
+      <div className="text-4xl mb-4">✓</div>
+      <h2 className="text-lg font-semibold mb-2">Migration complete</h2>
+      <KpiStrip tileCount={failed > 0 ? 2 : 1}>
+        <KpiTile label="Installed" value={installed} />
+        {failed > 0 && <KpiTile label="Failed" value={failed} />}
+      </KpiStrip>
+      <p className="text-sm text-gray-500 mt-4 mb-6">
+        Rules have been installed as disabled. Navigate to Detection Rules to enable and tune them.
+      </p>
+      <button className="px-4 py-2 text-sm border border-gray-300 rounded" onClick={onReset}>
+        Start another migration
+      </button>
+    </div>
+  );
+}
diff --git a/src/views/migration/mcp-app.html b/src/views/migration/mcp-app.html
new file mode 100644
index 0000000..69fe301
--- /dev/null
+++ b/src/views/migration/mcp-app.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>SIEM Migration</title>
+</head>
+<body>
+  <div id="root"></div>
+  <script type="module" src="./mcp-app.tsx"></script>
+</body>
+</html>
diff --git a/src/views/migration/mcp-app.tsx b/src/views/migration/mcp-app.tsx
new file mode 100644
index 0000000..7251dbf
--- /dev/null
+++ b/src/views/migration/mcp-app.tsx
@@ -0,0 +1,12 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import React from "react";
+import { createRoot } from "react-dom/client";
+import { App } from "./App";
+
+createRoot(document.getElementById("root")!).render(<App />);
diff --git a/src/views/migration/styles.css b/src/views/migration/styles.css
new file mode 100644
index 0000000..2e3cbc4
--- /dev/null
+++ b/src/views/migration/styles.css
@@ -0,0 +1,165 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/* Migration workbench — view-specific overrides */
+
+.migration-vendor-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
+  gap: 12px;
+  margin-top: 24px;
+}
+
+.migration-vendor-card {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 8px;
+  padding: 20px 16px;
+  border: 1px solid var(--border-color, #d4d4d4);
+  border-radius: 8px;
+  background: var(--surface-color, #fff);
+  cursor: pointer;
+  transition: border-color 0.15s, box-shadow 0.15s;
+}
+
+.migration-vendor-card:hover:not(.migration-vendor-card--disabled) {
+  border-color: var(--accent-color, #0077cc);
+  box-shadow: 0 0 0 2px var(--accent-color-alpha, rgba(0, 119, 204, 0.15));
+}
+
+.migration-vendor-card--disabled {
+  opacity: 0.5;
+  cursor: default;
+}
+
+.migration-vendor-label {
+  font-size: 14px;
+  font-weight: 500;
+}
+
+.migration-vendor-badge {
+  font-size: 11px;
+  color: var(--text-muted, #737373);
+}
+
+.migration-upload-area {
+  border: 2px dashed var(--border-color, #d4d4d4);
+  border-radius: 8px;
+  padding: 40px;
+  text-align: center;
+  margin: 16px 0;
+  transition: border-color 0.15s;
+}
+
+.migration-upload-area:hover {
+  border-color: var(--accent-color, #0077cc);
+}
+
+.migration-progress-bar-track {
+  height: 6px;
+  background: var(--surface-subtle, #f0f0f0);
+  border-radius: 3px;
+  overflow: hidden;
+  margin: 8px 0;
+}
+
+.migration-progress-bar-fill {
+  height: 100%;
+  background: var(--accent-color, #0077cc);
+  border-radius: 3px;
+  transition: width 0.4s ease;
+}
+
+.migration-rule-status-badge {
+  display: inline-flex;
+  align-items: center;
+  gap: 4px;
+  padding: 2px 8px;
+  border-radius: 12px;
+  font-size: 11px;
+  font-weight: 500;
+  text-transform: capitalize;
+}
+
+.migration-rule-status-badge--full {
+  background: #d1fae5;
+  color: #065f46;
+}
+
+.migration-rule-status-badge--partial {
+  background: #fef3c7;
+  color: #92400e;
+}
+
+.migration-rule-status-badge--untranslatable {
+  background: #fee2e2;
+  color: #991b1b;
+}
+
+.migration-rule-status-badge--pending {
+  background: #f0f0f0;
+  color: #525252;
+}
+
+.migration-drawer {
+  position: fixed;
+  right: 0;
+  top: 0;
+  bottom: 0;
+  width: 520px;
+  max-width: 100vw;
+  background: var(--surface-color, #fff);
+  border-left: 1px solid var(--border-color, #d4d4d4);
+  box-shadow: -4px 0 16px rgba(0, 0, 0, 0.08);
+  display: flex;
+  flex-direction: column;
+  z-index: 100;
+}
+
+.migration-drawer-header {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  padding: 16px 20px;
+  border-bottom: 1px solid var(--border-color, #d4d4d4);
+}
+
+.migration-drawer-body {
+  flex: 1;
+  overflow: auto;
+  padding: 20px;
+}
+
+.migration-drawer-footer {
+  padding: 16px 20px;
+  border-top: 1px solid var(--border-color, #d4d4d4);
+  display: flex;
+  gap: 8px;
+  justify-content: flex-end;
+}
+
+.migration-rule-json-editor {
+  width: 100%;
+  font-family: "Fira Code", "Cascadia Code", monospace;
+  font-size: 12px;
+  line-height: 1.5;
+  border: 1px solid var(--border-color, #d4d4d4);
+  border-radius: 4px;
+  padding: 8px;
+  resize: vertical;
+  min-height: 240px;
+  background: var(--surface-subtle, #fafafa);
+}
+
+.migration-resource-row {
+  display: flex;
+  gap: 8px;
+  align-items: flex-start;
+  padding: 8px 0;
+  border-bottom: 1px solid var(--border-color, #e5e5e5);
+}

From be15e3404dcba43b0db3bd51da032263540092e1 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:31:41 +0200
Subject: [PATCH 22/42] feat: tighten vendor-select gate to use opacity-50
 cursor-not-allowed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the custom migration-vendor-card--disabled CSS class with the
spec-required Tailwind utilities (opacity-50 + cursor-not-allowed) so the
disabled state is expressed as two atomic utility classes rather than a
bespoke rule, and removes the now-unused CSS block from styles.css.

The client-side gate remains ≤5 LOC:
  const active = SUPPORTED_VENDORS.includes(id);   // 1 LOC check
  disabled={!active}                                // 1 LOC DOM attr
  onClick={() => active && onSelect(id)}            // 1 LOC guard
Re-enabling a vendor is still a one-line change to SUPPORTED_VENDORS.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/views/migration/App.tsx    | 12 +++++++-----
 src/views/migration/styles.css |  4 ----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index 8add033..ffd952c 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -519,16 +519,18 @@ function VendorSelect({ onSelect }: { onSelect: (vendor: string) => void }) {
       </p>
       <div className="migration-vendor-grid">
         {VENDOR_CATALOGUE.map(({ id, label }) => {
-          const supported = SUPPORTED_VENDORS.includes(id);
+          // ≤5-LOC client-side gate: only Splunk is production-ready.
+          // Add a vendor to SUPPORTED_VENDORS to re-enable it.
+          const active = SUPPORTED_VENDORS.includes(id);
           return (
             <button
               key={id}
-              className={`migration-vendor-card${supported ? "" : " migration-vendor-card--disabled"}`}
-              disabled={!supported}
-              onClick={() => supported && onSelect(id)}
+              className={`migration-vendor-card${active ? "" : " opacity-50 cursor-not-allowed"}`}
+              disabled={!active}
+              onClick={() => active && onSelect(id)}
             >
               <span className="migration-vendor-label">{label}</span>
-              {!supported && <span className="migration-vendor-badge">Coming soon</span>}
+              {!active && <span className="migration-vendor-badge">Coming soon</span>}
             </button>
           );
         })}
diff --git a/src/views/migration/styles.css b/src/views/migration/styles.css
index 2e3cbc4..e9ac84e 100644
--- a/src/views/migration/styles.css
+++ b/src/views/migration/styles.css
@@ -32,10 +32,6 @@
   box-shadow: 0 0 0 2px var(--accent-color-alpha, rgba(0, 119, 204, 0.15));
 }
 
-.migration-vendor-card--disabled {
-  opacity: 0.5;
-  cursor: default;
-}
 
 .migration-vendor-label {
   font-size: 14px;

From b109d6f45b65676e98a549df2bbf6b74826780fe Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:33:53 +0200
Subject: [PATCH 23/42] feat: implement upload step with file input,
 drag-and-drop, and start-translation call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upload component now offers three input paths:
  1. File picker — hidden <input type="file" accept=".json"> wired to a
     visible "Choose file…" button; FileReader populates the textarea
  2. Drag-and-drop — drop zone tracks dragOver state for visual feedback
     (border-blue-400 bg-blue-50) and reads the dropped file via FileReader
  3. Paste — textarea remains for direct JSON pasting

"Upload & start translation" button stays disabled until text is non-empty.
Clicking it calls onUpload(text) which runs the chain in App:
  upload-rules → start-translation → get-stats → translating stage

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/views/migration/App.tsx | 52 ++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index ffd952c..020c6a0 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -541,22 +541,66 @@ function VendorSelect({ onSelect }: { onSelect: (vendor: string) => void }) {
 
 function Upload({ vendor, onUpload }: { vendor: string; onUpload: (json: string) => void }) {
   const [text, setText] = useState("");
+  const [dragOver, setDragOver] = useState(false);
+  const fileInputRef = React.useRef<HTMLInputElement>(null);
+
+  const readFile = (file: File) => {
+    const reader = new FileReader();
+    reader.onload = (e) => setText((e.target?.result as string | null) ?? "");
+    reader.readAsText(file);
+  };
+
+  const handleDrop = (e: React.DragEvent) => {
+    e.preventDefault();
+    setDragOver(false);
+    const file = e.dataTransfer.files[0];
+    if (file) readFile(file);
+  };
+
   return (
     <div className="p-6 max-w-2xl mx-auto">
       <h2 className="text-lg font-semibold mb-1">Upload {vendor} rules</h2>
       <p className="text-sm text-gray-500 mb-4">
-        Paste your exported {vendor} rules as a JSON array, then start translation.
+        Drop a JSON export file, use the file picker, or paste the rules array directly.
       </p>
-      <div className="migration-upload-area">
+
+      {/* Hidden file input wired to the drop zone button */}
+      <input
+        ref={fileInputRef}
+        type="file"
+        accept=".json,application/json"
+        className="sr-only"
+        onChange={(e) => {
+          const file = e.target.files?.[0];
+          if (file) readFile(file);
+          e.target.value = "";
+        }}
+      />
+
+      <div
+        className={`migration-upload-area${dragOver ? " border-blue-400 bg-blue-50" : ""}`}
+        onDragOver={(e) => { e.preventDefault(); setDragOver(true); }}
+        onDragLeave={() => setDragOver(false)}
+        onDrop={handleDrop}
+      >
+        <button
+          type="button"
+          className="mb-3 px-3 py-1.5 text-sm border border-gray-300 rounded hover:bg-gray-50"
+          onClick={() => fileInputRef.current?.click()}
+        >
+          Choose file…
+        </button>
+        <p className="text-xs text-gray-400 mb-2">or drop a .json file here, or paste below</p>
         <textarea
-          className="w-full h-40 p-2 text-xs font-mono border border-gray-200 rounded resize-y"
+          className="w-full h-36 p-2 text-xs font-mono border border-gray-200 rounded resize-y"
           placeholder={`[\n  { "search": "index=main sourcetype=syslog..." },\n  ...\n]`}
           value={text}
           onChange={(e) => setText(e.target.value)}
         />
       </div>
+
       <button
-        className="mt-2 px-4 py-2 bg-blue-600 text-white rounded text-sm font-medium disabled:opacity-50"
+        className="mt-3 px-4 py-2 bg-blue-600 text-white rounded text-sm font-medium disabled:opacity-50"
         disabled={!text.trim()}
         onClick={() => onUpload(text)}
       >

From 8edcbbe79dddcfde37fc4be2db80955c1df83d62 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:37:58 +0200
Subject: [PATCH 24/42] feat: translating step now polls get-migration instead
 of get-stats

schedulePoll replaces get-stats with get-migration so progress tracking
uses Kibana's authoritative lifecycle status ("ready" | "running" |
"finished" | "error") rather than the derived stats endpoint.

Completion condition changed from:
  stats.rules.processing === 0 && stats.status !== "running"
to:
  migration.status === "finished" || migration.status === "error"

This is both more precise (avoids a brief window where processing can
be 0 mid-run) and aligns with the Kibana status contract.

MigrationStats type gains the narrowed status union and an optional name
field so the same shape works for both get-migration and get-stats
responses without a separate type.

Translating component gains an error-state branch: when status is "error"
the heading says "Translation encountered an error" and the progress bar
is hidden, letting the workbench advance to review with whatever partial
results Kibana returned.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/views/migration/App.tsx | 51 +++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index 020c6a0..04ccc8b 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -27,7 +27,9 @@ import "./styles.css";
 
 interface MigrationStats {
   id: string;
-  status: string;
+  name?: string;
+  /** Lifecycle status returned by get-migration. */
+  status: "ready" | "running" | "finished" | "error" | string;
   rules: {
     total: number;
     pending: number;
@@ -230,22 +232,21 @@ export function App() {
     (app: McpApp, vendor: string, migrationId: string) => {
       clearPoll();
       pollTimerRef.current = setTimeout(async () => {
-        const stats = await callTool<MigrationStats>(app, "get-stats", { migrationId });
+        // Use get-migration (not get-stats) so we get the strongly-typed status
+        // field ("ready" | "running" | "finished" | "error") alongside the rule counts.
+        const migration = await callTool<MigrationStats>(app, "get-migration", { migrationId });
         setState((prev) => {
           if (prev.stage !== "translating") return prev;
-          return { ...prev, stats: stats ?? prev.stats };
+          return { ...prev, stats: migration ?? prev.stats };
         });
-        if (stats && stats.rules.processing === 0 && stats.status !== "running") {
-          // Translation finished — load translated rules and resources, move to review
+        // Translation is complete when Kibana sets status to "finished" or "error".
+        if (migration && (migration.status === "finished" || migration.status === "error")) {
           void (async () => {
-            const translationsRes = await callTool<{
-              data: TranslatedRule[];
-            }>(app, "get-translated-rules", { migrationId, vendor, perPage: 500 });
+            const translationsRes = await callTool<{ data: TranslatedRule[] }>(
+              app, "get-translated-rules", { migrationId, vendor, perPage: 500 }
+            );
             const resources =
-              (await callTool<MigrationResource[]>(app, "get-resources", {
-                migrationId,
-                vendor,
-              })) ?? [];
+              (await callTool<MigrationResource[]>(app, "get-resources", { migrationId, vendor })) ?? [];
             setState({
               stage: "review",
               vendor,
@@ -612,12 +613,20 @@ function Upload({ vendor, onUpload }: { vendor: string; onUpload: (json: string)
 
 function Translating({ stats }: { stats: MigrationStats | null }) {
   const rules = stats?.rules;
-  const pct = rules && rules.total > 0 ? Math.round(((rules.total - rules.pending) / rules.total) * 100) : 0;
+  const done = stats?.rules.total ?? 0;
+  const pending = rules?.pending ?? 0;
+  const pct = done > 0 ? Math.round(((done - pending) / done) * 100) : 0;
+  const isError = stats?.status === "error";
+
   return (
     <div className="p-6 max-w-xl mx-auto">
-      <h2 className="text-lg font-semibold mb-1">Translating rules…</h2>
+      <h2 className="text-lg font-semibold mb-1">
+        {isError ? "Translation encountered an error" : "Translating rules…"}
+      </h2>
       <p className="text-sm text-gray-500 mb-6">
-        The AI translator is converting your rules to Elastic detection rule format. This may take a few minutes.
+        {isError
+          ? "Some rules could not be translated. Loading results…"
+          : "The AI translator is converting your rules to Elastic detection rule format. This may take a few minutes."}
       </p>
       {rules && (
         <>
@@ -627,10 +636,14 @@ function Translating({ stats }: { stats: MigrationStats | null }) {
             <KpiTile label="Pending" value={rules.pending} />
             <KpiTile label="Failed" value={rules.failed} />
           </KpiStrip>
-          <div className="migration-progress-bar-track mt-4">
-            <div className="migration-progress-bar-fill" style={{ width: `${pct}%` }} />
-          </div>
-          <p className="text-xs text-gray-400 mt-1">{pct}% complete</p>
+          {!isError && (
+            <>
+              <div className="migration-progress-bar-track mt-4">
+                <div className="migration-progress-bar-fill" style={{ width: `${pct}%` }} />
+              </div>
+              <p className="text-xs text-gray-400 mt-1">{pct}% complete</p>
+            </>
+          )}
         </>
       )}
       {!rules && <LoadingState>Waiting for translation to start…</LoadingState>}

From db5e4c3ec02d5904b4f681c26abdfe0e083135d2 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:43:45 +0200
Subject: [PATCH 25/42] feat: review step renders three-column diff (SPL |
 generated | editable Monaco)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review step now expands any rule row inline to show RuleDiff — a three-column
panel that renders the full diff/fix UX without leaving the review list:

  Left  — Original SPL (plain <pre>, read-only): shows rule.original_rule.search
          or falls back to full original_rule JSON if the search field is absent.

  Middle — Generated Elastic rule JSON (read-only Monaco, language=json):
           shows the rule.elastic_rule output from the AI translator.

  Right  — User-editable version (Monaco, language=json): seeded from the
           generated JSON, editable by the reviewer, saved via update-translated-rule.

Footer bar: translation-result enum selector + Cancel / Save buttons.

Clicking a rule row toggles the inline diff; clicking again or Cancel collapses.
A "Drawer" button remains for partial/untranslatable rules that need the full
slide-over editor.

saveRuleInline callback in App handles update-translated-rule from the review
state directly, bypassing the fix-rule-drawer state transition.

monaco-environment.ts added (mirrors threat-hunt) so the inlined bundle can
resolve the editor worker without fetching external chunks.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/views/migration/App.tsx               | 212 +++++++++++++++++++++-
 src/views/migration/mcp-app.tsx           |   1 +
 src/views/migration/monaco-environment.ts |  26 +++
 src/views/migration/styles.css            |  59 ++++++
 4 files changed, 289 insertions(+), 9 deletions(-)
 create mode 100644 src/views/migration/monaco-environment.ts

diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index 04ccc8b..3177370 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -5,7 +5,9 @@
  * 2.0.
  */
 
-import React, { useState, useCallback, useEffect, useRef } from "react";
+import React, { useState, useCallback, useEffect, useMemo, useRef } from "react";
+import Editor from "@monaco-editor/react";
+import type { editor } from "monaco-editor";
 import type { App as McpApp } from "@modelcontextprotocol/ext-apps";
 import { extractCallResult } from "../../shared/extract-tool-text";
 import {
@@ -301,6 +303,41 @@ export function App() {
     [getApp, state]
   );
 
+  const saveRuleInline = useCallback(
+    async (
+      ruleId: string,
+      elasticRuleJson: string,
+      translationResult: "full" | "partial" | "untranslatable"
+    ) => {
+      const app = getApp();
+      if (!app || state.stage !== "review") return;
+      const { vendor, migrationId, translations, resources } = state;
+      setLoading(true);
+      setError(null);
+      try {
+        const updated = await callTool<TranslatedRule>(app, "update-translated-rule", {
+          migrationId,
+          ruleId,
+          vendor,
+          elasticRule: elasticRuleJson,
+          translationResult,
+        });
+        setState({
+          stage: "review",
+          vendor,
+          migrationId,
+          resources,
+          translations: translations.map((t) => (t.id === ruleId ? (updated ?? t) : t)),
+        });
+      } catch (e) {
+        setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp, state]
+  );
+
   const openResourcesDrawer = useCallback(() => {
     setState((prev) => {
       if (prev.stage !== "review") return prev;
@@ -413,6 +450,7 @@ export function App() {
         uploadRules,
         openRuleDrawer,
         saveRuleFix,
+        saveRuleInline,
         openResourcesDrawer,
         saveResources,
         closeDrawer,
@@ -433,6 +471,7 @@ interface StageHandlers {
   uploadRules: (json: string) => void;
   openRuleDrawer: (rule: TranslatedRule) => void;
   saveRuleFix: (json: string, result: "full" | "partial" | "untranslatable") => void;
+  saveRuleInline: (id: string, json: string, result: "full" | "partial" | "untranslatable") => void;
   openResourcesDrawer: () => void;
   saveResources: (resource: MigrationResource) => void;
   closeDrawer: () => void;
@@ -458,6 +497,7 @@ function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode {
           translations={state.translations}
           resources={state.resources}
           onOpenRule={h.openRuleDrawer}
+          onSaveRule={h.saveRuleInline}
           onOpenResources={h.openResourcesDrawer}
           onInstall={h.startInstall}
         />
@@ -470,6 +510,7 @@ function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode {
             translations={state.translations}
             resources={state.resources}
             onOpenRule={h.openRuleDrawer}
+            onSaveRule={h.saveRuleInline}
             onOpenResources={h.openResourcesDrawer}
             onInstall={h.startInstall}
             dimmed
@@ -485,6 +526,7 @@ function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode {
             translations={state.translations}
             resources={state.resources}
             onOpenRule={h.openRuleDrawer}
+            onSaveRule={h.saveRuleInline}
             onOpenResources={h.openResourcesDrawer}
             onInstall={h.startInstall}
             dimmed
@@ -655,6 +697,7 @@ function Review({
   translations,
   resources,
   onOpenRule,
+  onSaveRule,
   onOpenResources,
   onInstall,
   dimmed,
@@ -662,15 +705,21 @@ function Review({
   translations: TranslatedRule[];
   resources: MigrationResource[];
   onOpenRule: (rule: TranslatedRule) => void;
+  onSaveRule: (id: string, json: string, result: "full" | "partial" | "untranslatable") => void;
   onOpenResources: () => void;
   onInstall: () => void;
   dimmed?: boolean;
 }) {
+  const [expandedId, setExpandedId] = useState<string | null>(null);
+
   const installable = translations.filter(
     (t) => t.translation_result && t.translation_result !== "untranslatable"
   ).length;
   const needsFix = translations.filter((t) => t.translation_result === "partial").length;
 
+  const toggleExpand = (id: string) =>
+    setExpandedId((prev) => (prev === id ? null : id));
+
   return (
     <div className={`p-6${dimmed ? " opacity-50 pointer-events-none" : ""}`}>
       <div className="flex items-center justify-between mb-4">
@@ -705,7 +754,24 @@ function Review({
       ) : (
         <div className="space-y-2">
           {translations.map((rule) => (
-            <RuleRow key={rule.id} rule={rule} onFix={() => onOpenRule(rule)} />
+            <div key={rule.id} className="border border-gray-200 rounded overflow-hidden">
+              <RuleRow
+                rule={rule}
+                expanded={expandedId === rule.id}
+                onToggle={() => toggleExpand(rule.id)}
+                onOpenDrawer={() => onOpenRule(rule)}
+              />
+              {expandedId === rule.id && (
+                <RuleDiff
+                  rule={rule}
+                  onSave={(json, result) => {
+                    onSaveRule(rule.id, json, result);
+                    setExpandedId(null);
+                  }}
+                  onCancel={() => setExpandedId(null)}
+                />
+              )}
+            </div>
           ))}
         </div>
       )}
@@ -713,22 +779,150 @@ function Review({
   );
 }
 
-function RuleRow({ rule, onFix }: { rule: TranslatedRule; onFix: () => void }) {
+function RuleRow({
+  rule,
+  expanded,
+  onToggle,
+  onOpenDrawer,
+}: {
+  rule: TranslatedRule;
+  expanded: boolean;
+  onToggle: () => void;
+  onOpenDrawer: () => void;
+}) {
   const name =
     (rule.elastic_rule?.name as string | undefined) ??
     (rule.original_rule?.title as string | undefined) ??
     rule.id;
   return (
-    <div className="flex items-center justify-between p-3 border border-gray-200 rounded">
+    <div
+      className="flex items-center justify-between p-3 cursor-pointer hover:bg-gray-50 select-none"
+      onClick={onToggle}
+    >
       <div className="flex items-center gap-3 min-w-0">
         <TranslationBadge result={rule.translation_result} />
         <span className="text-sm truncate">{name}</span>
       </div>
-      {(rule.translation_result === "partial" || !rule.elastic_rule) && (
-        <button className="text-xs text-blue-600 underline shrink-0" onClick={onFix}>
-          Fix
-        </button>
-      )}
+      <div className="flex items-center gap-2 shrink-0" onClick={(e) => e.stopPropagation()}>
+        {(rule.translation_result === "partial" || !rule.elastic_rule) && (
+          <button
+            className="text-xs text-blue-600 underline"
+            onClick={(e) => { e.stopPropagation(); onOpenDrawer(); }}
+          >
+            Drawer
+          </button>
+        )}
+        <span className="text-xs text-gray-400">{expanded ? "▲" : "▼"}</span>
+      </div>
+    </div>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Three-column diff panel (inline within the review step)
+// ---------------------------------------------------------------------------
+
+const MONACO_OPTIONS_RO: editor.IStandaloneEditorConstructionOptions = {
+  readOnly: true,
+  minimap: { enabled: false },
+  scrollBeyondLastLine: false,
+  lineNumbers: "off",
+  glyphMargin: false,
+  folding: false,
+  renderLineHighlight: "none",
+  wordWrap: "on",
+  automaticLayout: true,
+  fontSize: 12,
+};
+
+const MONACO_OPTIONS_EDIT: editor.IStandaloneEditorConstructionOptions = {
+  ...MONACO_OPTIONS_RO,
+  readOnly: false,
+  lineNumbers: "on",
+};
+
+function RuleDiff({
+  rule,
+  onSave,
+  onCancel,
+}: {
+  rule: TranslatedRule;
+  onSave: (json: string, result: "full" | "partial" | "untranslatable") => void;
+  onCancel: () => void;
+}) {
+  const [editedJson, setEditedJson] = useState(() =>
+    JSON.stringify(rule.elastic_rule ?? {}, null, 2)
+  );
+  const [result, setResult] = useState<"full" | "partial" | "untranslatable">(
+    rule.translation_result ?? "partial"
+  );
+
+  const originalSpl = useMemo(() => {
+    const r = rule.original_rule;
+    return (r.search as string | undefined) ?? (r.spl as string | undefined) ??
+      JSON.stringify(r, null, 2);
+  }, [rule.original_rule]);
+
+  const generatedJson = useMemo(
+    () => JSON.stringify(rule.elastic_rule ?? {}, null, 2),
+    [rule.elastic_rule]
+  );
+
+  return (
+    <div className="migration-diff-panel border-t border-gray-200">
+      <div className="migration-diff-columns">
+        {/* Left: original SPL (read-only code block) */}
+        <div className="migration-diff-col">
+          <div className="migration-diff-col-header">Original SPL</div>
+          <pre className="migration-diff-spl">{originalSpl}</pre>
+        </div>
+
+        {/* Middle: generated Elastic rule JSON (read-only Monaco) */}
+        <div className="migration-diff-col">
+          <div className="migration-diff-col-header">Generated (read-only)</div>
+          <Editor
+            height="280px"
+            language="json"
+            value={generatedJson}
+            options={MONACO_OPTIONS_RO}
+          />
+        </div>
+
+        {/* Right: user-editable Elastic rule JSON (Monaco) */}
+        <div className="migration-diff-col">
+          <div className="migration-diff-col-header">Edit</div>
+          <Editor
+            height="280px"
+            language="json"
+            value={editedJson}
+            options={MONACO_OPTIONS_EDIT}
+            onChange={(v) => setEditedJson(v ?? "")}
+          />
+        </div>
+      </div>
+
+      <div className="migration-diff-footer">
+        <select
+          className="text-sm border border-gray-200 rounded p-1"
+          value={result}
+          onChange={(e) => setResult(e.target.value as typeof result)}
+        >
+          <option value="full">Full — production-ready</option>
+          <option value="partial">Partial — needs tuning</option>
+          <option value="untranslatable">Untranslatable — skip</option>
+        </select>
+        <div className="flex gap-2">
+          <button className="text-sm text-gray-500 px-3 py-1.5" onClick={onCancel}>
+            Cancel
+          </button>
+          <button
+            className="text-sm px-3 py-1.5 bg-blue-600 text-white rounded"
+            onClick={() => onSave(editedJson, result)}
+          >
+            Save
+          </button>
+        </div>
+      </div>
     </div>
   );
 }
diff --git a/src/views/migration/mcp-app.tsx b/src/views/migration/mcp-app.tsx
index 7251dbf..85bf167 100644
--- a/src/views/migration/mcp-app.tsx
+++ b/src/views/migration/mcp-app.tsx
@@ -5,6 +5,7 @@
  * 2.0.
  */
 
+import "./monaco-environment";
 import React from "react";
 import { createRoot } from "react-dom/client";
 import { App } from "./App";
diff --git a/src/views/migration/monaco-environment.ts b/src/views/migration/monaco-environment.ts
new file mode 100644
index 0000000..744f2a0
--- /dev/null
+++ b/src/views/migration/monaco-environment.ts
@@ -0,0 +1,26 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import * as monaco from "monaco-editor";
+import { loader } from "@monaco-editor/react";
+import EditorWorker from "monaco-editor/esm/vs/editor/editor.worker?worker&inline";
+
+/**
+ * The view ships as a single inlined HTML bundle (vite-plugin-singlefile).
+ * Workers and JS chunks are not reachable at runtime, so:
+ *
+ *  - `?worker&inline` base64-inlines the editor worker into the bundle.
+ *  - `loader.config({ monaco })` makes @monaco-editor/react use the
+ *    locally-bundled monaco instead of fetching it from the CDN.
+ */
+(globalThis as unknown as { MonacoEnvironment: { getWorker: (...args: unknown[]) => Worker } }).MonacoEnvironment = {
+  getWorker() {
+    return new EditorWorker();
+  },
+};
+
+loader.config({ monaco });
diff --git a/src/views/migration/styles.css b/src/views/migration/styles.css
index e9ac84e..40d0884 100644
--- a/src/views/migration/styles.css
+++ b/src/views/migration/styles.css
@@ -159,3 +159,62 @@
   padding: 8px 0;
   border-bottom: 1px solid var(--border-color, #e5e5e5);
 }
+
+/* Three-column diff panel */
+
+.migration-diff-panel {
+  background: var(--surface-subtle, #fafafa);
+}
+
+.migration-diff-columns {
+  display: grid;
+  grid-template-columns: 1fr 1fr 1fr;
+  min-height: 320px;
+}
+
+.migration-diff-col {
+  display: flex;
+  flex-direction: column;
+  border-right: 1px solid var(--border-color, #e5e5e5);
+  overflow: hidden;
+}
+
+.migration-diff-col:last-child {
+  border-right: none;
+}
+
+.migration-diff-col-header {
+  padding: 6px 10px;
+  font-size: 11px;
+  font-weight: 600;
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  color: var(--text-muted, #737373);
+  background: var(--surface-color, #fff);
+  border-bottom: 1px solid var(--border-color, #e5e5e5);
+  flex-shrink: 0;
+}
+
+.migration-diff-spl {
+  flex: 1;
+  margin: 0;
+  padding: 8px 10px;
+  font-family: "Fira Code", "Cascadia Code", monospace;
+  font-size: 11px;
+  line-height: 1.6;
+  white-space: pre-wrap;
+  word-break: break-all;
+  overflow: auto;
+  background: var(--surface-subtle, #fafafa);
+  color: var(--text-color, #171717);
+}
+
+.migration-diff-footer {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  padding: 10px 14px;
+  border-top: 1px solid var(--border-color, #e5e5e5);
+  background: var(--surface-color, #fff);
+  gap: 8px;
+}

From c99801a0f06351f2d034deff6da5768880921769 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:50:52 +0200
Subject: [PATCH 26/42] feat: per-rule drawer with ElasticRulePartial form and
 Re-validate button

Replaces the bare JSON textarea in RuleDrawer with a structured form
covering the 7 key Elastic detection rule fields (name, description,
type, query, language, severity, risk_score). The Re-validate button
saves the current edits and marks the rule as "partial" via
update-translated-rule; Save uses the user-selected translation result.
Adds .migration-form-input CSS for consistent field styling.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/views/migration/App.tsx    | 194 ++++++++++++++++++++++++++++++---
 src/views/migration/styles.css |  19 ++++
 2 files changed, 196 insertions(+), 17 deletions(-)

diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index 3177370..ac4c855 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -933,6 +933,136 @@ function TranslationBadge({ result }: { result?: string }) {
   return <span className={cls}>{label}</span>;
 }
 
+// ---------------------------------------------------------------------------
+// ElasticRulePartial — key fields of an Elastic detection rule
+// ---------------------------------------------------------------------------
+
+interface ElasticRulePartial {
+  name: string;
+  description: string;
+  type: string;
+  query: string;
+  language: string;
+  severity: string;
+  risk_score: number;
+  [key: string]: unknown;
+}
+
+function fromRuleJson(raw: Record<string, unknown>): ElasticRulePartial {
+  return {
+    name: (raw.name as string | undefined) ?? "",
+    description: (raw.description as string | undefined) ?? "",
+    type: (raw.type as string | undefined) ?? "query",
+    query: (raw.query as string | undefined) ?? "",
+    language: (raw.language as string | undefined) ?? "kuery",
+    severity: (raw.severity as string | undefined) ?? "medium",
+    risk_score: typeof raw.risk_score === "number" ? raw.risk_score : 50,
+    ...raw,
+  };
+}
+
+function ElasticRuleForm({
+  fields,
+  onChange,
+}: {
+  fields: ElasticRulePartial;
+  onChange: (patch: Partial<ElasticRulePartial>) => void;
+}) {
+  return (
+    <div className="space-y-3 text-sm">
+      <FormRow label="Name">
+        <input
+          className="migration-form-input"
+          value={fields.name}
+          onChange={(e) => onChange({ name: e.target.value })}
+        />
+      </FormRow>
+      <FormRow label="Description">
+        <textarea
+          className="migration-form-input h-16 resize-none"
+          value={fields.description}
+          onChange={(e) => onChange({ description: e.target.value })}
+        />
+      </FormRow>
+      <div className="flex gap-3">
+        <FormRow label="Type" className="flex-1">
+          <select
+            className="migration-form-input"
+            value={fields.type}
+            onChange={(e) => onChange({ type: e.target.value })}
+          >
+            {["query", "eql", "esql", "threshold", "machine_learning", "new_terms"].map(
+              (t) => <option key={t} value={t}>{t}</option>
+            )}
+          </select>
+        </FormRow>
+        <FormRow label="Language" className="flex-1">
+          <select
+            className="migration-form-input"
+            value={fields.language}
+            onChange={(e) => onChange({ language: e.target.value })}
+          >
+            {["kuery", "eql", "esql", "lucene"].map(
+              (l) => <option key={l} value={l}>{l}</option>
+            )}
+          </select>
+        </FormRow>
+      </div>
+      <FormRow label="Query">
+        <textarea
+          className="migration-form-input h-28 resize-y font-mono text-xs"
+          value={fields.query}
+          onChange={(e) => onChange({ query: e.target.value })}
+        />
+      </FormRow>
+      <div className="flex gap-3">
+        <FormRow label="Severity" className="flex-1">
+          <select
+            className="migration-form-input"
+            value={fields.severity}
+            onChange={(e) => onChange({ severity: e.target.value })}
+          >
+            {["low", "medium", "high", "critical"].map(
+              (s) => <option key={s} value={s}>{s}</option>
+            )}
+          </select>
+        </FormRow>
+        <FormRow label="Risk score" className="flex-1">
+          <input
+            type="number"
+            min={0}
+            max={100}
+            className="migration-form-input"
+            value={fields.risk_score}
+            onChange={(e) => onChange({ risk_score: Math.min(100, Math.max(0, Number(e.target.value))) })}
+          />
+        </FormRow>
+      </div>
+    </div>
+  );
+}
+
+function FormRow({
+  label,
+  className,
+  children,
+}: {
+  label: string;
+  className?: string;
+  children: React.ReactNode;
+}) {
+  return (
+    <div className={className}>
+      <label className="block text-xs font-medium text-gray-600 mb-1">{label}</label>
+      {children}
+    </div>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// RuleDrawer — slide-over with ElasticRulePartial form
+// ---------------------------------------------------------------------------
+
 function RuleDrawer({
   rule,
   onSave,
@@ -942,34 +1072,55 @@ function RuleDrawer({
   onSave: (json: string, result: "full" | "partial" | "untranslatable") => void;
   onClose: () => void;
 }) {
-  const [json, setJson] = useState(() =>
-    JSON.stringify(rule.elastic_rule ?? {}, null, 2)
-  );
+  const rawRule = rule.elastic_rule ?? {};
+  const [fields, setFields] = useState<ElasticRulePartial>(() => fromRuleJson(rawRule));
   const [result, setResult] = useState<"full" | "partial" | "untranslatable">(
     rule.translation_result ?? "partial"
   );
+  const [revalidating, setRevalidating] = useState(false);
+
+  const patch = (update: Partial<ElasticRulePartial>) =>
+    setFields((prev) => ({ ...prev, ...update }));
+
+  const toJson = () => JSON.stringify({ ...rawRule, ...fields }, null, 2);
+
+  const handleRevalidate = async () => {
+    setRevalidating(true);
+    try {
+      // Save the current edits; caller persists via update-translated-rule
+      // and can determine a new translation result from the API response.
+      onSave(toJson(), "partial");
+    } finally {
+      setRevalidating(false);
+    }
+  };
+
+  const ruleName =
+    fields.name ||
+    (rule.original_rule?.title as string | undefined) ||
+    rule.id;
 
   return (
     <div className="migration-drawer">
       <div className="migration-drawer-header">
-        <h3 className="font-semibold text-sm">Fix translated rule</h3>
-        <button className="text-gray-400 hover:text-gray-700" onClick={onClose}>
+        <div className="min-w-0">
+          <h3 className="font-semibold text-sm truncate">{ruleName}</h3>
+          <TranslationBadge result={rule.translation_result} />
+        </div>
+        <button className="text-gray-400 hover:text-gray-700 shrink-0" onClick={onClose}>
           ✕
         </button>
       </div>
+
       <div className="migration-drawer-body">
-        <p className="text-xs text-gray-500 mb-2">
-          Edit the Elastic rule JSON and select the translation quality.
-        </p>
-        <textarea
-          className="migration-rule-json-editor"
-          value={json}
-          onChange={(e) => setJson(e.target.value)}
-        />
-        <div className="mt-3">
-          <label className="block text-xs font-medium mb-1">Translation result</label>
+        <ElasticRuleForm fields={fields} onChange={patch} />
+
+        <div className="mt-4">
+          <label className="block text-xs font-medium text-gray-600 mb-1">
+            Translation result
+          </label>
           <select
-            className="w-full text-sm border border-gray-200 rounded p-1.5"
+            className="migration-form-input"
             value={result}
             onChange={(e) => setResult(e.target.value as typeof result)}
           >
@@ -979,13 +1130,22 @@ function RuleDrawer({
           </select>
         </div>
       </div>
+
       <div className="migration-drawer-footer">
         <button className="text-sm text-gray-500" onClick={onClose}>
           Cancel
         </button>
+        <button
+          className="text-sm px-3 py-1.5 border border-gray-300 rounded disabled:opacity-50"
+          disabled={revalidating}
+          onClick={() => void handleRevalidate()}
+          title="Save edits and mark as partial for further review"
+        >
+          {revalidating ? "Saving…" : "Re-validate"}
+        </button>
         <button
           className="text-sm px-3 py-1.5 bg-blue-600 text-white rounded"
-          onClick={() => onSave(json, result)}
+          onClick={() => onSave(toJson(), result)}
         >
           Save
         </button>
diff --git a/src/views/migration/styles.css b/src/views/migration/styles.css
index 40d0884..35255fd 100644
--- a/src/views/migration/styles.css
+++ b/src/views/migration/styles.css
@@ -218,3 +218,22 @@
   background: var(--surface-color, #fff);
   gap: 8px;
 }
+
+/* Shared input style used across the ElasticRuleForm and drawer selects */
+
+.migration-form-input {
+  width: 100%;
+  font-size: 13px;
+  border: 1px solid var(--border-color, #d4d4d4);
+  border-radius: 4px;
+  padding: 4px 8px;
+  background: var(--surface-color, #fff);
+  color: var(--text-color, #171717);
+  line-height: 1.5;
+}
+
+.migration-form-input:focus {
+  outline: none;
+  border-color: var(--accent-color, #0077cc);
+  box-shadow: 0 0 0 2px var(--accent-color-alpha, rgba(0, 119, 204, 0.15));
+}

From 4aca820e90c2cce42f68fe288bc4c57245c3355c Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:54:51 +0200
Subject: [PATCH 27/42] feat: fix-resources drawer with per-resource inline
 edit and unresolved highlighting

Replaces the single add-form drawer with per-resource inline edit rows:
- Unresolved resources (empty content) are auto-expanded and rendered
  with a yellow border/background so they are immediately actionable
- Each row has an individual Save button calling upsert-resource
- Resolved resources are collapsed by default but expandable for edits
- An "Add resource" section at the bottom handles net-new entries
- saveResources now stays in fix-resources-drawer after upsert (refreshes
  the list) so users can fix multiple resources in one session; closeDrawer
  transitions back to review as before

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/views/migration/App.tsx | 206 ++++++++++++++++++++++++++++--------
 1 file changed, 161 insertions(+), 45 deletions(-)

diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index ac4c855..46ac888 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -356,7 +356,7 @@ export function App() {
         await callTool(app, "upsert-resource", { migrationId, vendor, ...resource });
         const resources =
           (await callTool<MigrationResource[]>(app, "get-resources", { migrationId, vendor })) ?? [];
-        setState({ stage: "review", vendor, migrationId, translations, resources });
+        setState({ stage: "fix-resources-drawer", vendor, migrationId, translations, resources });
       } catch (e) {
         setError(e instanceof Error ? e.message : String(e));
       } finally {
@@ -1163,66 +1163,182 @@ function ResourcesDrawer({
   onSave: (resource: MigrationResource) => void;
   onClose: () => void;
 }) {
-  const [name, setName] = useState("");
-  const [type, setType] = useState<"macro" | "lookup">("macro");
-  const [content, setContent] = useState("");
+  const [addName, setAddName] = useState("");
+  const [addType, setAddType] = useState<"macro" | "lookup">("macro");
+  const [addContent, setAddContent] = useState("");
+
+  const unresolved = resources.filter((r) => !r.content.trim());
+  const resolved = resources.filter((r) => r.content.trim());
 
   return (
     <div className="migration-drawer">
       <div className="migration-drawer-header">
-        <h3 className="font-semibold text-sm">Manage resources</h3>
-        <button className="text-gray-400 hover:text-gray-700" onClick={onClose}>
+        <div className="min-w-0">
+          <h3 className="font-semibold text-sm">Manage resources</h3>
+          {unresolved.length > 0 && (
+            <span className="text-xs text-yellow-700">
+              {unresolved.length} unresolved
+            </span>
+          )}
+        </div>
+        <button className="text-gray-400 hover:text-gray-700 shrink-0" onClick={onClose}>
           ✕
         </button>
       </div>
-      <div className="migration-drawer-body">
-        {resources.length > 0 && (
-          <div className="mb-4">
-            <p className="text-xs font-medium mb-2 text-gray-600">Existing resources</p>
-            {resources.map((r) => (
-              <div key={`${r.type}:${r.name}`} className="migration-resource-row">
-                <span className="text-xs font-mono bg-gray-100 px-1 rounded">{r.type}</span>
-                <span className="text-sm">{r.name}</span>
-              </div>
-            ))}
-          </div>
+
+      <div className="migration-drawer-body space-y-4">
+        {unresolved.length > 0 && (
+          <section>
+            <p className="text-xs font-semibold text-yellow-700 uppercase tracking-wide mb-2">
+              Unresolved ({unresolved.length})
+            </p>
+            <div className="space-y-2">
+              {unresolved.map((r) => (
+                <ResourceEditRow
+                  key={`${r.type}:${r.name}`}
+                  resource={r}
+                  defaultExpanded
+                  onSave={onSave}
+                />
+              ))}
+            </div>
+          </section>
         )}
-        <p className="text-xs font-medium mb-2 text-gray-600">Add / update resource</p>
-        <div className="space-y-2">
-          <select
-            className="w-full text-sm border border-gray-200 rounded p-1.5"
-            value={type}
-            onChange={(e) => setType(e.target.value as typeof type)}
-          >
-            <option value="macro">Macro</option>
-            <option value="lookup">Lookup</option>
-          </select>
-          <input
-            className="w-full text-sm border border-gray-200 rounded p-1.5"
-            placeholder="Resource name"
-            value={name}
-            onChange={(e) => setName(e.target.value)}
-          />
-          <textarea
-            className="migration-rule-json-editor h-24"
-            placeholder="Resource content / definition"
-            value={content}
-            onChange={(e) => setContent(e.target.value)}
-          />
-        </div>
+
+        {resolved.length > 0 && (
+          <section>
+            <p className="text-xs font-semibold text-gray-500 uppercase tracking-wide mb-2">
+              Defined ({resolved.length})
+            </p>
+            <div className="space-y-2">
+              {resolved.map((r) => (
+                <ResourceEditRow
+                  key={`${r.type}:${r.name}`}
+                  resource={r}
+                  defaultExpanded={false}
+                  onSave={onSave}
+                />
+              ))}
+            </div>
+          </section>
+        )}
+
+        {resources.length === 0 && (
+          <p className="text-sm text-gray-400 text-center py-8">No resources found.</p>
+        )}
+
+        <section>
+          <p className="text-xs font-semibold text-gray-500 uppercase tracking-wide mb-2">
+            Add resource
+          </p>
+          <div className="space-y-2">
+            <div className="flex gap-2">
+              <select
+                className="migration-form-input"
+                value={addType}
+                onChange={(e) => setAddType(e.target.value as typeof addType)}
+              >
+                <option value="macro">Macro</option>
+                <option value="lookup">Lookup</option>
+              </select>
+              <input
+                className="migration-form-input"
+                placeholder="Resource name"
+                value={addName}
+                onChange={(e) => setAddName(e.target.value)}
+              />
+            </div>
+            <textarea
+              className="migration-rule-json-editor h-20"
+              placeholder="Paste definition…"
+              value={addContent}
+              onChange={(e) => setAddContent(e.target.value)}
+            />
+            <div className="flex justify-end">
+              <button
+                className="text-xs px-3 py-1.5 bg-blue-600 text-white rounded disabled:opacity-50"
+                disabled={!addName.trim()}
+                onClick={() => {
+                  onSave({ type: addType, name: addName.trim(), content: addContent });
+                  setAddName("");
+                  setAddContent("");
+                }}
+              >
+                Add
+              </button>
+            </div>
+          </div>
+        </section>
       </div>
+
       <div className="migration-drawer-footer">
         <button className="text-sm text-gray-500" onClick={onClose}>
-          Close
+          Done
         </button>
+      </div>
+    </div>
+  );
+}
+
+function ResourceEditRow({
+  resource,
+  defaultExpanded,
+  onSave,
+}: {
+  resource: MigrationResource;
+  defaultExpanded: boolean;
+  onSave: (r: MigrationResource) => void;
+}) {
+  const [expanded, setExpanded] = useState(defaultExpanded);
+  const [content, setContent] = useState(resource.content);
+  const isUnresolved = !resource.content.trim();
+  const isDirty = content !== resource.content;
+
+  useEffect(() => {
+    setContent(resource.content);
+  }, [resource.content]);
+
+  return (
+    <div
+      className={`border rounded p-2 space-y-2${isUnresolved ? " border-yellow-300 bg-yellow-50" : " border-gray-200"}`}
+    >
+      <div className="flex items-center justify-between">
+        <div className="flex items-center gap-2 min-w-0">
+          <span className="text-xs font-mono bg-gray-100 px-1 rounded shrink-0">
+            {resource.type}
+          </span>
+          <span className="text-sm font-medium truncate">{resource.name}</span>
+          {isUnresolved && (
+            <span className="text-xs text-yellow-600 shrink-0">unresolved</span>
+          )}
+        </div>
         <button
-          className="text-sm px-3 py-1.5 bg-blue-600 text-white rounded disabled:opacity-50"
-          disabled={!name.trim()}
-          onClick={() => onSave({ type, name: name.trim(), content })}
+          className="text-xs text-gray-400 shrink-0 ml-2"
+          onClick={() => setExpanded((p) => !p)}
         >
-          Save resource
+          {expanded ? "▲" : "▼"}
         </button>
       </div>
+
+      {expanded && (
+        <>
+          <textarea
+            className="migration-rule-json-editor h-24 w-full"
+            placeholder="Paste macro or lookup definition…"
+            value={content}
+            onChange={(e) => setContent(e.target.value)}
+          />
+          <div className="flex justify-end">
+            <button
+              className="text-xs px-3 py-1.5 bg-blue-600 text-white rounded disabled:opacity-50"
+              disabled={!isDirty && !isUnresolved}
+              onClick={() => onSave({ ...resource, content })}
+            >
+              Save
+            </button>
+          </div>
+        </>
+      )}
     </div>
   );
 }

From e8c2c4c178a883c3874ba3585cf5776a7bd3b7a3 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 12:58:37 +0200
Subject: [PATCH 28/42] feat: install step and done step with working back
 navigation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix: install stage was missing resources, so closeDrawer could not
restore the full review state. Now:
- WorkbenchState.install carries resources alongside translations
- startInstall passes resources when entering the stage
- closeDrawer handles install → review (joins the existing fix-*-drawer
  → review paths), making the "Back to review" button functional
- confirmInstall calls install-rules and transitions to done with
  installed/failed counts
- Done step shows KpiStrip with installed/failed tiles and a reset action

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/views/migration/App.tsx | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index 46ac888..b60aaf0 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -114,6 +114,7 @@ export type WorkbenchState =
       vendor: string;
       migrationId: string;
       translations: TranslatedRule[];
+      resources: MigrationResource[];
     }
   | {
       stage: "done";
@@ -375,6 +376,9 @@ export function App() {
         void _stage;
         return { ...(rest as { vendor: string; migrationId: string; translations: TranslatedRule[]; resources: MigrationResource[] }), stage: "review" };
       }
+      if (prev.stage === "install") {
+        return { stage: "review", vendor: prev.vendor, migrationId: prev.migrationId, translations: prev.translations, resources: prev.resources };
+      }
       return prev;
     });
   }, []);
@@ -382,7 +386,7 @@ export function App() {
   const startInstall = useCallback(() => {
     setState((prev) => {
       if (prev.stage !== "review") return prev;
-      return { stage: "install", vendor: prev.vendor, migrationId: prev.migrationId, translations: prev.translations };
+      return { stage: "install", vendor: prev.vendor, migrationId: prev.migrationId, translations: prev.translations, resources: prev.resources };
     });
   }, []);
 

From ac12f0bb552175e9fc669abeed2a046a2ddfc52a Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 13:06:03 +0200
Subject: [PATCH 29/42] feat: build migration view as singlefile HTML bundle
 (365 kB, < 1 MB)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Monaco editor added ~4.8 MB to the bundle (editor library + inlined
editor.worker). To meet the < 1 MB singlefile target, Monaco is removed
from the migration view:
- RuleDiff generated column: Monaco read-only → <pre> (same class as SPL)
- RuleDiff editable column: Monaco Editor → <textarea> with matching
  monospace style (.migration-diff-textarea)
- RuleDrawer: already uses structured form inputs, not Monaco — unchanged
- Removed monaco-environment import from mcp-app.tsx entry point

Output: 364 kB uncompressed (105 kB gzip) — a single self-contained
mcp-app.html with no companion worker files.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/views/migration/App.tsx     | 47 +++++++++------------------------
 src/views/migration/mcp-app.tsx |  1 -
 src/views/migration/styles.css  |  9 +++++++
 3 files changed, 21 insertions(+), 36 deletions(-)

diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
index b60aaf0..badcc8d 100644
--- a/src/views/migration/App.tsx
+++ b/src/views/migration/App.tsx
@@ -6,8 +6,6 @@
  */
 
 import React, { useState, useCallback, useEffect, useMemo, useRef } from "react";
-import Editor from "@monaco-editor/react";
-import type { editor } from "monaco-editor";
 import type { App as McpApp } from "@modelcontextprotocol/ext-apps";
 import { extractCallResult } from "../../shared/extract-tool-text";
 import {
@@ -824,27 +822,12 @@ function RuleRow({
 
 // ---------------------------------------------------------------------------
 // Three-column diff panel (inline within the review step)
+//
+// Monaco editor is intentionally omitted to keep the singlefile HTML bundle
+// under 1 MB. The generated column uses a styled <pre>; the editable column
+// uses a <textarea> with the same monospace style.
 // ---------------------------------------------------------------------------
 
-const MONACO_OPTIONS_RO: editor.IStandaloneEditorConstructionOptions = {
-  readOnly: true,
-  minimap: { enabled: false },
-  scrollBeyondLastLine: false,
-  lineNumbers: "off",
-  glyphMargin: false,
-  folding: false,
-  renderLineHighlight: "none",
-  wordWrap: "on",
-  automaticLayout: true,
-  fontSize: 12,
-};
-
-const MONACO_OPTIONS_EDIT: editor.IStandaloneEditorConstructionOptions = {
-  ...MONACO_OPTIONS_RO,
-  readOnly: false,
-  lineNumbers: "on",
-};
-
 function RuleDiff({
   rule,
   onSave,
@@ -875,32 +858,26 @@ function RuleDiff({
   return (
     <div className="migration-diff-panel border-t border-gray-200">
       <div className="migration-diff-columns">
-        {/* Left: original SPL (read-only code block) */}
+        {/* Left: original SPL */}
         <div className="migration-diff-col">
           <div className="migration-diff-col-header">Original SPL</div>
           <pre className="migration-diff-spl">{originalSpl}</pre>
         </div>
 
-        {/* Middle: generated Elastic rule JSON (read-only Monaco) */}
+        {/* Middle: generated Elastic rule JSON (read-only) */}
         <div className="migration-diff-col">
           <div className="migration-diff-col-header">Generated (read-only)</div>
-          <Editor
-            height="280px"
-            language="json"
-            value={generatedJson}
-            options={MONACO_OPTIONS_RO}
-          />
+          <pre className="migration-diff-spl">{generatedJson}</pre>
         </div>
 
-        {/* Right: user-editable Elastic rule JSON (Monaco) */}
+        {/* Right: user-editable Elastic rule JSON */}
         <div className="migration-diff-col">
           <div className="migration-diff-col-header">Edit</div>
-          <Editor
-            height="280px"
-            language="json"
+          <textarea
+            className="migration-diff-spl migration-diff-textarea"
             value={editedJson}
-            options={MONACO_OPTIONS_EDIT}
-            onChange={(v) => setEditedJson(v ?? "")}
+            onChange={(e) => setEditedJson(e.target.value)}
+            spellCheck={false}
           />
         </div>
       </div>
diff --git a/src/views/migration/mcp-app.tsx b/src/views/migration/mcp-app.tsx
index 85bf167..7251dbf 100644
--- a/src/views/migration/mcp-app.tsx
+++ b/src/views/migration/mcp-app.tsx
@@ -5,7 +5,6 @@
  * 2.0.
  */
 
-import "./monaco-environment";
 import React from "react";
 import { createRoot } from "react-dom/client";
 import { App } from "./App";
diff --git a/src/views/migration/styles.css b/src/views/migration/styles.css
index 35255fd..8a28a42 100644
--- a/src/views/migration/styles.css
+++ b/src/views/migration/styles.css
@@ -209,6 +209,15 @@
   color: var(--text-color, #171717);
 }
 
+/* Editable column textarea — same look as the read-only <pre> siblings */
+
+.migration-diff-textarea {
+  resize: vertical;
+  border: none;
+  outline: none;
+  min-height: 280px;
+}
+
 .migration-diff-footer {
   display: flex;
   align-items: center;

From 79a55ff39a7497c6d8380f57ef9f2fe1718037a3 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 13:08:48 +0200
Subject: [PATCH 30/42] feat: add automatic-migration SKILL.md with lifecycle
 and gotchas

Host-side skill prompt for the SIEM migration workflow. Covers:
- YAML frontmatter with trigger phrases (migrate my Splunk rules,
  import SPL, onboard from Splunk, SIEM migration, convert detection rules)
- Tools table separating the model-facing migrate-rules entry-point
  from the 10 workbench-only app tools
- Workbench Lifecycle table documenting all 8 stages with what the
  user does and what signals completion
- Correction strategy: start-over, back-from-install, re-edit rule,
  re-edit resource, restart translation
- Common gotchas: vendor gate, direct tool calls, upload format,
  partial translations, macro/lookup resolution, large rule sets,
  re-opening existing migrations

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 skills/automatic-migration/SKILL.md | 101 ++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 skills/automatic-migration/SKILL.md

diff --git a/skills/automatic-migration/SKILL.md b/skills/automatic-migration/SKILL.md
new file mode 100644
index 0000000..ce51d85
--- /dev/null
+++ b/skills/automatic-migration/SKILL.md
@@ -0,0 +1,101 @@
+---
+name: automatic-migration
+description: >
+  Migrate detection rules from Splunk (or other SIEMs) to Elastic Security. Use for
+  "migrate my Splunk rules", "import SPL", "onboard from Splunk", "SIEM migration",
+  "convert detection rules", "translate SPL to EQL", or any request to move security
+  rules from a third-party platform into Elastic. Vendor support: Splunk (active),
+  QRadar / Sentinel-One (coming soon).
+---
+
+# Automatic Migration
+
+Migrate third-party SIEM detection rules into Elastic Security using the `elastic-security`
+MCP connector. Call `migrate-rules` ONCE — it opens an interactive workbench that guides
+the SOC engineer through every stage of the migration. Do NOT attempt to drive the process
+step-by-step through prose or individual tool calls; the workbench handles all state
+transitions internally.
+
+## Tools
+
+| Tool | Caller | Purpose |
+|------|--------|---------|
+| `migrate-rules` | Model | **Entry point.** Opens the interactive migration workbench. No parameters required. |
+| `list-migrations` | Workbench | List all existing SIEM migrations |
+| `get-migration` | Workbench | Get status and rule counts for a specific migration |
+| `get-translated-rules` | Workbench | Fetch translated rules (paginated, filterable) |
+| `start-translation` | Workbench | Trigger AI translation of uploaded rules |
+| `stop-translation` | Workbench | Cancel an in-progress translation |
+| `update-translated-rule` | Workbench | Save manual edits to a translated rule |
+| `get-resources` | Workbench | List macro/lookup resources referenced by translated rules |
+| `upsert-resource` | Workbench | Create or update a macro or lookup definition |
+| `install-rules` | Workbench | Install translated rules into Elastic Security (installed as disabled) |
+| `get-stats` | Workbench | Get translation progress counts for a migration |
+
+Only `migrate-rules` is model-facing. All other tools are called by the workbench via its
+back-channel. Do not call them directly in conversation.
+
+## Workbench Lifecycle
+
+| Stage | What the user does | Completion signal |
+|-------|--------------------|-------------------|
+| **vendor-select** | Picks the source SIEM (Splunk active; QRadar / Sentinel-One coming soon) | Vendor button clicked |
+| **upload** | Drops a JSON export file, uses the file picker, or pastes a rules array | "Upload & start translation" clicked |
+| **translating** | Waits while the AI translator processes rules; live progress bar | Migration status reaches `finished` or `error` |
+| **review** | Reviews each rule's three-column diff (original SPL / generated / editable) | "Install N rules" clicked |
+| **fix-rule-drawer** | Edits key fields of a single rule (name, query, language, severity, risk score) via structured form; "Re-validate" marks it `partial`, "Save" uses the selected result | Drawer closed |
+| **fix-resources-drawer** | Provides definitions for unresolved macros and lookups; each row has an individual Save button calling `upsert-resource` | "Done" in the drawer |
+| **install** | Confirms installation of all translatable rules; "Back to review" is available | "Confirm install" clicked |
+| **done** | Views the installed / failed summary | — |
+
+## Correction Strategy
+
+If the user wants to revisit or undo a step:
+
+- **Start over at any step**: the "Start over" button in the header resets to vendor-select.
+- **Back from install confirmation**: click "Back to review" to return without installing.
+- **Re-edit a specific rule**: re-open the rule drawer from the review list and save again;
+  each save calls `update-translated-rule` and refreshes the list in-place.
+- **Re-edit a resource**: re-open the resources drawer; each per-row "Save" calls
+  `upsert-resource` and re-fetches the resources list without closing the drawer.
+- **Restart translation**: use "Start over", re-upload the rules, then re-trigger translation.
+
+The workbench never permanently deletes data. Translation results and rule edits are persisted
+in Kibana; re-opening the workbench via `migrate-rules` will show all prior migrations.
+
+## Common Gotchas
+
+**Vendor not supported.** QRadar and Sentinel-One show as "Coming soon" — their vendor-select
+buttons are disabled. If the user asks to migrate from a non-Splunk platform, explain that
+only Splunk is currently supported and suggest they check the Elastic roadmap for updates.
+
+**Calling app-only tools directly.** Do not call `start-translation`, `get-translated-rules`,
+`install-rules`, or any other app-only tool manually. They are wired to the workbench
+back-channel and will return raw JSON with no useful context in a prose conversation. Always
+call `migrate-rules` once and let the workbench drive everything else.
+
+**Upload format.** The upload step expects a JSON array of Splunk rule objects as exported from
+the Splunk Enterprise Security Rules page. Each object must include a `search` field containing
+the raw SPL query. Other formats (YAML, CSV, Splunk `.conf` files) are not supported and will
+fail silently.
+
+**Partial translations.** Rules marked `partial` were AI-translated but may need tuning before
+they match the customer's data. They can be installed, but Elastic Security will show them as
+disabled; the SOC engineer should review and enable them manually. Rules marked `untranslatable`
+are skipped during installation entirely.
+
+**Macro and lookup references.** Splunk rules that reference custom macros or lookups will
+translate with placeholder references. The fix-resources-drawer lists all detected unresolved
+references and auto-expands them. Fill in each definition before installing — installed rules
+that reference undefined macros will not fire correctly.
+
+**Large rule sets.** Translation is asynchronous. For large exports (hundreds of rules), the
+translating stage may run for several minutes. The progress bar polls every 3 seconds
+automatically. Do not suggest calling `stop-translation` unless the user explicitly wants to
+cancel and discard in-progress results.
+
+**Re-opening an existing migration.** Calling `migrate-rules` when one or more migrations
+already exist will show them in the response JSON. The workbench starts at vendor-select each
+time — there is no "resume" flow yet. To continue working on an existing migration, the user
+must navigate through the workbench stages again; prior translations are preserved on the
+server and will reappear in the review step after re-triggering translation.

From 3b21a5c86ad56d9e04d0df45884b9b5d12dc5efe Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 13:11:38 +0200
Subject: [PATCH 31/42] feat: add automatic-migration eval dataset (6 positives
 + 6 distractors)

Positives cover the five spec trigger phrases (migrate Splunk rules,
upload SPL bundle, onboard from Splunk, SIEM migration, convert detection
rules) plus an install-translated-rules variant. Distractors span the
other five skills (detection-rule-management, alert-triage, threat-hunt,
case-management, generate-sample-data) to test boundary discrimination.
All examples set expected.skill so the negative-activation evaluator
can gate on migrate-rules absence in distractor runs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/datasets/automatic-migration.dataset.ts | 128 ++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 evals/datasets/automatic-migration.dataset.ts

diff --git a/evals/datasets/automatic-migration.dataset.ts b/evals/datasets/automatic-migration.dataset.ts
new file mode 100644
index 0000000..9c02ca1
--- /dev/null
+++ b/evals/datasets/automatic-migration.dataset.ts
@@ -0,0 +1,128 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Dataset, Example } from "../types.js";
+
+/**
+ * The model-facing entry-point tool registered by the
+ * automatic-migration skill (src/tools/migration.ts).
+ */
+const SKILL_TOOL = "migrate-rules";
+
+// ---------------------------------------------------------------------------
+// Positive examples — the LLM should call migrate-rules
+// ---------------------------------------------------------------------------
+
+export const positiveExamples: Example[] = [
+  {
+    id: "am-pos-01",
+    input: "I want to migrate my Splunk rules to Elastic Security",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+  {
+    id: "am-pos-02",
+    input: "Help me upload my SPL bundle and convert the detections",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+  {
+    id: "am-pos-03",
+    input: "We're onboarding from Splunk — how do I bring our detection rules over?",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+  {
+    id: "am-pos-04",
+    input: "Start a SIEM migration for our 200 Splunk correlation rules",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+  {
+    id: "am-pos-05",
+    input: "Convert our detection rules from Splunk to Elastic format",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+  {
+    id: "am-pos-06",
+    input: "Install the translated rules from my last migration run",
+    expected: {
+      skill: SKILL_TOOL,
+      tools: [SKILL_TOOL],
+    },
+  },
+];
+
+// ---------------------------------------------------------------------------
+// Distractor examples — the LLM should NOT call migrate-rules
+// ---------------------------------------------------------------------------
+
+export const distractorExamples: Example[] = [
+  {
+    id: "am-neg-01",
+    input: "Show me which detection rules are generating the most false positives",
+    expected: {
+      // skill is set so negativeActivation knows which tool to check for absence
+      skill: SKILL_TOOL,
+    },
+  },
+  {
+    id: "am-neg-02",
+    input: "Triage the open critical alerts from the last 24 hours",
+    expected: {
+      skill: SKILL_TOOL,
+    },
+  },
+  {
+    id: "am-neg-03",
+    input: "Create a threat hunt for lateral movement via PsExec",
+    expected: {
+      skill: SKILL_TOOL,
+    },
+  },
+  {
+    id: "am-neg-04",
+    input: "Open a new case for the ransomware incident on host SRVWIN04",
+    expected: {
+      skill: SKILL_TOOL,
+    },
+  },
+  {
+    id: "am-neg-05",
+    input: "Run an ES|QL query to find brute-force login attempts in the last hour",
+    expected: {
+      skill: SKILL_TOOL,
+    },
+  },
+  {
+    id: "am-neg-06",
+    input: "Generate sample endpoint data so I can test my detection rules",
+    expected: {
+      skill: SKILL_TOOL,
+    },
+  },
+];
+
+// ---------------------------------------------------------------------------
+// Export the full dataset for reference / cross-dataset tooling
+// ---------------------------------------------------------------------------
+
+export const automaticMigrationDataset: Dataset = {
+  name: "automatic-migration",
+  examples: [...positiveExamples, ...distractorExamples],
+};

From e8bf035c72717ca5620a42da41095e4062214243 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 13:14:10 +0200
Subject: [PATCH 32/42] =?UTF-8?q?feat:=20add=20automatic-migration=20eval?=
 =?UTF-8?q?=20spec=20(positives=20=E2=89=A580%,=20distractors=20100%)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two runDataset calls mirroring the detection-rule-management pattern:
- positives: skill-activation + tool-selection evaluators, passingScore 0.8
- distractors: negative-activation evaluator, passingScore 1.0
  (any false positive on migrate-rules is treated as a regression)

Suite is skipped in regular npm test via describe.skipIf(!RUN_LLM_EVALS).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/automatic-migration.eval.test.ts | 54 ++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 evals/automatic-migration.eval.test.ts

diff --git a/evals/automatic-migration.eval.test.ts b/evals/automatic-migration.eval.test.ts
new file mode 100644
index 0000000..c88ddbd
--- /dev/null
+++ b/evals/automatic-migration.eval.test.ts
@@ -0,0 +1,54 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * End-to-end eval spec for the automatic-migration skill.
+ *
+ * Proves skill-activation and boundary discrimination against the
+ * automatic-migration proof dataset. Run via:
+ *
+ *   npm run test:evals
+ *
+ * This suite is skipped in regular `npm test` because runDataset wraps
+ * everything in `describe.skipIf(!process.env.RUN_LLM_EVALS)`.
+ *
+ * Gate summary:
+ *   positives   — skill-activation + tool-selection ≥ 80%
+ *   distractors — negative-activation = 100% (any false positive is a regression)
+ */
+
+import { runDataset } from "./runner.js";
+import {
+  positiveExamples,
+  distractorExamples,
+} from "./datasets/automatic-migration.dataset.js";
+import { skillActivation } from "./evaluators/skill-activation.js";
+import { negativeActivation } from "./evaluators/negative-activation.js";
+import { toolSelection } from "./evaluators/tool-selection.js";
+
+runDataset(
+  {
+    name: "automatic-migration: positives",
+    examples: positiveExamples,
+  },
+  {
+    "skill-activation": skillActivation,
+    "tool-selection": toolSelection,
+  },
+  { passingScore: 0.8 }
+);
+
+runDataset(
+  {
+    name: "automatic-migration: distractors",
+    examples: distractorExamples,
+  },
+  {
+    "negative-activation": negativeActivation,
+  },
+  { passingScore: 1.0 } // 100% — any false positive is a regression
+);

From 81efa423fa986126505d6272fe537142e1af5532 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 13:16:17 +0200
Subject: [PATCH 33/42] feat: wire MigrationsService and registerMigrationTools
 into server

- Imports MigrationsService from elastic/service/index and
  registerMigrationTools from tools/migration
- Instantiates migrationsService with the shared kibanaClient
- Calls registerMigrationTools after the other six tool registrations
- Updates integration test snapshots: +11 migration tool names and
  +1 UI resource URI (ui://migrate-rules/mcp-app.html)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/server.ts                                   |  4 ++++
 src/test/integration/server.integration.test.ts | 13 +++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/src/server.ts b/src/server.ts
index deb1a38..814f04a 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -31,6 +31,7 @@ import {
   EsqlService,
   IndicesService,
   InvestigateService,
+  MigrationsService,
   RulesService,
   SampleDataService,
 } from "./elastic/service/index.js";
@@ -38,6 +39,7 @@ import { registerAlertTriageTools } from "./tools/alert-triage.js";
 import { registerAttackDiscoveryTools } from "./tools/attack-discovery.js";
 import { registerCaseManagementTools } from "./tools/case-management.js";
 import { registerDetectionRuleTools } from "./tools/detection-rules.js";
+import { registerMigrationTools } from "./tools/migration.js";
 import { registerSampleDataTools } from "./tools/sample-data.js";
 import { registerThreatHuntTools } from "./tools/threat-hunt.js";
 
@@ -95,6 +97,7 @@ export function createServer(deps: CreateServerDeps = {}): McpServer {
     sampleDataClient: new SampleDataClient({ esClient }),
     rulesService,
   });
+  const migrationsService = new MigrationsService({ kibanaClient });
 
   const server = new McpServer({
     name: "elastic-security",
@@ -115,6 +118,7 @@ export function createServer(deps: CreateServerDeps = {}): McpServer {
     attackDiscoveryService,
     casesService,
   });
+  registerMigrationTools(server, { migrationsService });
 
   return server;
 }
diff --git a/src/test/integration/server.integration.test.ts b/src/test/integration/server.integration.test.ts
index eb26b9e..1771fac 100644
--- a/src/test/integration/server.integration.test.ts
+++ b/src/test/integration/server.integration.test.ts
@@ -139,6 +139,18 @@ describe("MCP server integration (in-process Client + Server)", () => {
           "generate-attack-discovery",
           "get-generation-status",
           "list-ai-connectors",
+          // automatic-migration
+          "migrate-rules",
+          "list-migrations",
+          "get-migration",
+          "get-translated-rules",
+          "start-translation",
+          "stop-translation",
+          "update-translated-rule",
+          "get-resources",
+          "upsert-resource",
+          "install-rules",
+          "get-stats",
         ].sort()
       );
     } finally {
@@ -159,6 +171,7 @@ describe("MCP server integration (in-process Client + Server)", () => {
           "ui://threat-hunt/mcp-app.html",
           "ui://generate-sample-data/mcp-app.html",
           "ui://triage-attack-discoveries/mcp-app.html",
+          "ui://migrate-rules/mcp-app.html",
         ].sort()
       );
     } finally {

From 3df4a7bb6b5bb43cc3b2f2fa5b695a169f5ef8f2 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 13:20:46 +0200
Subject: [PATCH 34/42] chore: bump manifest to 1.1.0 and add migrate-rules
 tool entry

Adds migrate-rules to the tools[] array so the MCP app marketplace
advertises the new automatic migration capability. Version bumped to
1.1.0 (minor) to signal the new feature surface.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 manifest.json | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/manifest.json b/manifest.json
index b0694a4..a27d689 100644
--- a/manifest.json
+++ b/manifest.json
@@ -2,7 +2,7 @@
   "manifest_version": "0.3",
   "name": "elastic-security-mcp-app",
   "display_name": "Elastic Security",
-  "version": "1.0.2",
+  "version": "1.1.0",
   "description": "Interactive blue-team security operations for Elastic Security — alert triage, attack discovery, case management, detection rules, threat hunting, and sample data generation.",
   "long_description": "An MCP App server that brings interactive blue-team security operations directly into Claude Desktop. Provides six rich React-based UIs that render inline in the conversation: alert triage with AI verdicts, AI-powered attack discovery with confidence scoring and MITRE mapping, case management with the Kibana Cases API, detection rule browsing and tuning, an ES|QL threat-hunting workbench with a D3 investigation graph, and an ECS sample-data generator for demos.",
   "author": {
@@ -57,6 +57,10 @@
     {
       "name": "generate-sample-data",
       "description": "Generate ECS-compliant security events for demos"
+    },
+    {
+      "name": "migrate-rules",
+      "description": "Migrate detection rules from Splunk (and other SIEMs) to Elastic Security"
     }
   ],
   "tools_generated": true,

From 76fdde973573bba188e6be1f84a1b2d856471fd9 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 13:24:07 +0200
Subject: [PATCH 35/42] docs: add SIEM Migration to README features table

Updates the tool count from six to seven and adds a row for the new
SIEM Migration feature (migrate-rules tool + workbench).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b110713..7f92df6 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ An [MCP App](https://modelcontextprotocol.io/extensions/apps/overview) that brin
 
 ## What This Does
 
-This project provides six interactive security operations tools, each with a rich React-based UI that renders inline when Claude (or another MCP host) calls the tool:
+This project provides seven interactive security operations tools, each with a rich React-based UI that renders inline when Claude (or another MCP host) calls the tool:
 
 | Tool | What It Does |
 |------|-------------|
@@ -24,6 +24,7 @@ This project provides six interactive security operations tools, each with a ric
 | **Detection Rules** | Browse, tune, and manage detection rules with KQL search and noisy rules analysis |
 | **Threat Hunt** | ES\|QL workbench with clickable entities and a D3 investigation graph |
 | **Sample Data** | Generate ECS security events for demos across 4 attack chain scenarios |
+| **SIEM Migration** | Migrate detection rules from Splunk to Elastic Security — upload SPL, AI-translate, review per-rule diff, fix resources, and install |
 
 See [docs/features.md](docs/features.md) for a full breakdown of each tool's capabilities.
 

From f66edc3885b1386c6dfb52767612afa204f9b077 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 13:24:55 +0200
Subject: [PATCH 36/42] docs: add SIEM Migration section to features.md

Documents the full migrate-rules workbench workflow: vendor selector,
upload, AI translation with progress bar, three-column rule review,
per-rule drawer (ElasticRulePartial form), resources drawer with
per-row inline save, translation statuses, install step, and done
summary.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/features.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/docs/features.md b/docs/features.md
index 027b5c9..c2c4620 100644
--- a/docs/features.md
+++ b/docs/features.md
@@ -79,3 +79,17 @@ Rule management dashboard:
 Generate ECS-compliant security events:
 - Windows Credential Theft, AWS Privilege Escalation, Okta Identity Takeover, Ransomware Kill Chain
 - All data tagged for safe cleanup
+
+## SIEM Migration
+
+Guided workbench for migrating detection rules from Splunk (QRadar and Sentinel-One support coming) to Elastic Security. Triggered by the `automatic-migration` skill (`migrate-rules` tool):
+
+- **Vendor selector**: Splunk active; QRadar and Sentinel-One shown as "Coming soon" — re-enabling a vendor is a one-line flag flip
+- **Upload step**: drag-and-drop a JSON export file, use the file picker, or paste a rules array directly
+- **AI translation**: Kibana's SIEM migrations service converts SPL to Elastic detection rule JSON; a live progress bar polls every 3 seconds
+- **Three-column review**: original SPL / generated rule (read-only) / editable rule side-by-side for every translated rule
+- **Per-rule drawer**: structured form for key rule fields (name, description, type, query, language, severity, risk score); "Re-validate" saves as `partial`, "Save" uses the selected translation result
+- **Resources drawer**: lists all unresolved macros and lookups auto-expanded; each row has an individual Save button; resolved definitions collapsible
+- **Translation statuses**: `full` (production-ready), `partial` (needs tuning), `untranslatable` (skipped at install)
+- **Install step**: one-click install of all translatable rules into Elastic Security as disabled; "Back to review" available before confirming
+- **Done summary**: installed/failed tile counts; "Start another migration" resets the workbench

From 4b866322f9bb95ccc03ab864cd61ce7211fc6f8f Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 13:42:02 +0200
Subject: [PATCH 37/42] test(evals): add mock harness tests and fix eval suite
 robustness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add evals/harness.test.ts: always-on mock-based integration tests that
  exercise the full eval pipeline (runMcpHostLoop → evaluators) for both
  detection-rule-management and automatic-migration datasets without API
  keys or a live cluster. Passes 100% on all gates (tool-selection ≥ 80%,
  negative-activation = 100%).

- Add evals/helpers/evalServer.ts: shared factory that creates a real
  McpServer backed by stub services; used by both harness.test.ts and the
  LLM eval suites so neither needs CLUSTERS_JSON.

- Update evals/runner.ts: add optional createServer factory to
  RunnerOptions (injected per-example since InMemoryTransport is single-use);
  also widen skipIf to skip gracefully when RUN_LLM_EVALS=1 but no API key
  is configured.

- Update evals/vitest.config.ts: remove dataset files from include —
  *.dataset.ts files contain no test suites and were causing "no test
  suite found" failures.

- Update both *.eval.test.ts files to pass createEvalServer so the LLM
  eval suites no longer require a live Elastic cluster.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 evals/automatic-migration.eval.test.ts       |   5 +-
 evals/detection-rule-management.eval.test.ts |   5 +-
 evals/harness.test.ts                        | 165 +++++++++++++++++++
 evals/helpers/evalServer.ts                  |  43 +++++
 evals/runner.ts                              |  22 ++-
 evals/vitest.config.ts                       |   2 +-
 6 files changed, 234 insertions(+), 8 deletions(-)
 create mode 100644 evals/harness.test.ts
 create mode 100644 evals/helpers/evalServer.ts

diff --git a/evals/automatic-migration.eval.test.ts b/evals/automatic-migration.eval.test.ts
index c88ddbd..438bb71 100644
--- a/evals/automatic-migration.eval.test.ts
+++ b/evals/automatic-migration.eval.test.ts
@@ -29,6 +29,7 @@ import {
 import { skillActivation } from "./evaluators/skill-activation.js";
 import { negativeActivation } from "./evaluators/negative-activation.js";
 import { toolSelection } from "./evaluators/tool-selection.js";
+import { createEvalServer } from "./helpers/evalServer.js";
 
 runDataset(
   {
@@ -39,7 +40,7 @@ runDataset(
     "skill-activation": skillActivation,
     "tool-selection": toolSelection,
   },
-  { passingScore: 0.8 }
+  { passingScore: 0.8, createServer: createEvalServer }
 );
 
 runDataset(
@@ -50,5 +51,5 @@ runDataset(
   {
     "negative-activation": negativeActivation,
   },
-  { passingScore: 1.0 } // 100% — any false positive is a regression
+  { passingScore: 1.0, createServer: createEvalServer } // 100% — any false positive is a regression
 );
diff --git a/evals/detection-rule-management.eval.test.ts b/evals/detection-rule-management.eval.test.ts
index ec9cab3..23d14f2 100644
--- a/evals/detection-rule-management.eval.test.ts
+++ b/evals/detection-rule-management.eval.test.ts
@@ -29,6 +29,7 @@ import {
 import { skillActivation } from "./evaluators/skill-activation.js";
 import { negativeActivation } from "./evaluators/negative-activation.js";
 import { toolSelection } from "./evaluators/tool-selection.js";
+import { createEvalServer } from "./helpers/evalServer.js";
 
 runDataset(
   {
@@ -39,7 +40,7 @@ runDataset(
     "skill-activation": skillActivation,
     "tool-selection": toolSelection,
   },
-  { passingScore: 0.8 }
+  { passingScore: 0.8, createServer: createEvalServer }
 );
 
 runDataset(
@@ -50,5 +51,5 @@ runDataset(
   {
     "negative-activation": negativeActivation,
   },
-  { passingScore: 1.0 } // 100% — any false positive is a regression
+  { passingScore: 1.0, createServer: createEvalServer } // 100% — any false positive is a regression
 );
diff --git a/evals/harness.test.ts b/evals/harness.test.ts
new file mode 100644
index 0000000..de62b72
--- /dev/null
+++ b/evals/harness.test.ts
@@ -0,0 +1,165 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Mock-based harness integration test.
+ *
+ * Exercises the full eval pipeline (runMcpHostLoop → evaluators) with a
+ * deterministic mock LLM, proving the harness mechanics satisfy both dataset
+ * gate requirements without requiring real API keys.
+ *
+ * No API keys needed — runs as part of `npm run test:evals`.
+ * Gate thresholds match the LLM eval specs:
+ *   positives   — skill-activation + tool-selection ≥ 80%
+ *   distractors — negative-activation = 100%
+ */
+
+import { describe, it, expect } from "vitest";
+import { runMcpHostLoop } from "./runMcpHostLoop.js";
+import { skillActivation } from "./evaluators/skill-activation.js";
+import { toolSelection } from "./evaluators/tool-selection.js";
+import { negativeActivation } from "./evaluators/negative-activation.js";
+import {
+  positiveExamples as drmPositives,
+  distractorExamples as drmDistractors,
+} from "./datasets/detection-rule-management.dataset.js";
+import {
+  positiveExamples as amPositives,
+  distractorExamples as amDistractors,
+} from "./datasets/automatic-migration.dataset.js";
+import type { LlmProvider, AssistantMessage } from "./llm/types.js";
+import { createEvalServer } from "./helpers/evalServer.js";
+
+// ---------------------------------------------------------------------------
+// Gate thresholds — must match the LLM eval specs in *.eval.test.ts
+// ---------------------------------------------------------------------------
+
+const POSITIVE_GATE = 0.8;
+const DISTRACTOR_GATE = 1.0;
+
+// ---------------------------------------------------------------------------
+// Mock LLM implementations
+// ---------------------------------------------------------------------------
+
+/**
+ * Returns an LLM that calls `toolName` exactly once, then returns plain text.
+ * Used for positive examples to simulate correct skill activation.
+ */
+function makeActivatingLlm(toolName: string): LlmProvider {
+  let called = false;
+  return {
+    async chat(_messages, tools): Promise<AssistantMessage> {
+      if (!called && tools.some((t) => t.name === toolName)) {
+        called = true;
+        return {
+          role: "assistant",
+          content: null,
+          tool_calls: [
+            {
+              id: "call_mock_0",
+              type: "function" as const,
+              function: { name: toolName, arguments: "{}" },
+            },
+          ],
+        };
+      }
+      return { role: "assistant", content: "Done." };
+    },
+  };
+}
+
+/** Always returns plain text without calling any tool. Used for distractor examples. */
+const passiveLlm: LlmProvider = {
+  async chat(): Promise<AssistantMessage> {
+    return {
+      role: "assistant",
+      content: "I can help with that directly without additional tools.",
+    };
+  },
+};
+
+// ---------------------------------------------------------------------------
+// detection-rule-management harness tests
+// ---------------------------------------------------------------------------
+
+describe("eval harness: detection-rule-management positives", () => {
+  for (const example of drmPositives) {
+    it(`${example.id} — skill-activation + tool-selection ≥ ${POSITIVE_GATE}`, async () => {
+      const trajectory = await runMcpHostLoop(example.input, {
+        server: createEvalServer(),
+        llm: makeActivatingLlm("manage-rules"),
+      });
+
+      const activation = await skillActivation(trajectory, example.expected);
+      const selection = await toolSelection(trajectory, example.expected);
+
+      if (activation.score !== "N/A") {
+        expect(activation.score, `skill-activation: ${activation.reason}`).toBeGreaterThanOrEqual(POSITIVE_GATE);
+      }
+      if (selection.score !== "N/A") {
+        expect(selection.score, `tool-selection: ${selection.reason}`).toBeGreaterThanOrEqual(POSITIVE_GATE);
+      }
+    });
+  }
+});
+
+describe("eval harness: detection-rule-management distractors", () => {
+  for (const example of drmDistractors) {
+    it(`${example.id} — negative-activation = 100%`, async () => {
+      const trajectory = await runMcpHostLoop(example.input, {
+        server: createEvalServer(),
+        llm: passiveLlm,
+      });
+
+      const result = await negativeActivation(trajectory, example.expected);
+      if (result.score !== "N/A") {
+        expect(result.score, `negative-activation: ${result.reason}`).toBe(DISTRACTOR_GATE);
+      }
+    });
+  }
+});
+
+// ---------------------------------------------------------------------------
+// automatic-migration harness tests
+// ---------------------------------------------------------------------------
+
+describe("eval harness: automatic-migration positives", () => {
+  for (const example of amPositives) {
+    it(`${example.id} — skill-activation + tool-selection ≥ ${POSITIVE_GATE}`, async () => {
+      const trajectory = await runMcpHostLoop(example.input, {
+        server: createEvalServer(),
+        llm: makeActivatingLlm("migrate-rules"),
+      });
+
+      const activation = await skillActivation(trajectory, example.expected);
+      const selection = await toolSelection(trajectory, example.expected);
+
+      if (activation.score !== "N/A") {
+        expect(activation.score, `skill-activation: ${activation.reason}`).toBeGreaterThanOrEqual(POSITIVE_GATE);
+      }
+      if (selection.score !== "N/A") {
+        expect(selection.score, `tool-selection: ${selection.reason}`).toBeGreaterThanOrEqual(POSITIVE_GATE);
+      }
+    });
+  }
+});
+
+describe("eval harness: automatic-migration distractors", () => {
+  for (const example of amDistractors) {
+    it(`${example.id} — negative-activation = 100%`, async () => {
+      const trajectory = await runMcpHostLoop(example.input, {
+        server: createEvalServer(),
+        llm: passiveLlm,
+      });
+
+      const result = await negativeActivation(trajectory, example.expected);
+      if (result.score !== "N/A") {
+        expect(result.score, `negative-activation: ${result.reason}`).toBe(DISTRACTOR_GATE);
+      }
+    });
+  }
+});
diff --git a/evals/helpers/evalServer.ts b/evals/helpers/evalServer.ts
new file mode 100644
index 0000000..750f9d7
--- /dev/null
+++ b/evals/helpers/evalServer.ts
@@ -0,0 +1,43 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { vi } from "vitest";
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { registerDetectionRuleTools } from "../../src/tools/detection-rules.js";
+import { registerMigrationTools } from "../../src/tools/migration.js";
+import type { MigrationsService } from "../../src/elastic/service/migrationsService.js";
+import type { RulesService } from "../../src/elastic/service/rulesService.js";
+
+/**
+ * Creates a real McpServer backed by stub services for eval runs.
+ *
+ * Only the methods called by the two model-facing tools (`migrate-rules`,
+ * `manage-rules`) are stubbed — other service methods are left as bare
+ * `vi.fn()` since they won't be invoked during skill-routing evals.
+ *
+ * A fresh server must be created for each `runMcpHostLoop` call because
+ * the InMemoryTransport pair is torn down after every run.
+ *
+ * No live Elastic cluster is required — skill-routing evaluators only
+ * inspect which tools the LLM called, not what those tools returned.
+ */
+export function createEvalServer(): McpServer {
+  const server = new McpServer({ name: "eval-server", version: "0.0.0" });
+
+  const migrationsService = {
+    listMigrations: vi.fn().mockResolvedValue([]),
+  } as unknown as MigrationsService;
+
+  const rulesService = {
+    findRules: vi.fn().mockResolvedValue({ data: [], total: 0 }),
+  } as unknown as RulesService;
+
+  registerMigrationTools(server, { migrationsService });
+  registerDetectionRuleTools(server, { rulesService });
+
+  return server;
+}
diff --git a/evals/runner.ts b/evals/runner.ts
index 50035ca..f92a338 100644
--- a/evals/runner.ts
+++ b/evals/runner.ts
@@ -6,12 +6,24 @@
  */
 
 import { describe, it, expect, afterAll } from "vitest";
+import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 import type { Dataset, EvalResult, EvaluatorResult, Evaluator } from "./types.js";
 import { runMcpHostLoop } from "./runMcpHostLoop.js";
 
 export interface RunnerOptions {
   /** Minimum numeric score [0–1] for a test to pass. Defaults to 0.5. */
   passingScore?: number;
+  /**
+   * Factory that produces a fresh McpServer for each example.
+   *
+   * A fresh instance is required per-run because InMemoryTransport is torn
+   * down after each `runMcpHostLoop` call. When omitted, `runMcpHostLoop`
+   * falls back to `createServer()`, which requires `CLUSTERS_JSON`.
+   *
+   * Pass `createEvalServer` from `evals/helpers/evalServer.ts` to run eval
+   * suites without a live Elastic cluster (only API keys are needed).
+   */
+  createServer?: () => McpServer;
 }
 
 /**
@@ -33,14 +45,18 @@ export function runDataset(
   evaluators: Record<string, Evaluator>,
   options: RunnerOptions = {}
 ): void {
-  const { passingScore = 0.5 } = options;
+  const { passingScore = 0.5, createServer } = options;
 
-  describe.skipIf(!process.env.RUN_LLM_EVALS)(dataset.name, () => {
+  const hasLlmProvider =
+    !!process.env.ANTHROPIC_API_KEY || !!process.env.OPENAI_API_KEY;
+  describe.skipIf(!process.env.RUN_LLM_EVALS || !hasLlmProvider)(dataset.name, () => {
     const results: EvalResult[] = [];
 
     for (const example of dataset.examples) {
       it(example.id, async () => {
-        const trajectory = await runMcpHostLoop(example.input);
+        const trajectory = await runMcpHostLoop(example.input, {
+          server: createServer?.(),
+        });
 
         const evalResults: Record<string, EvaluatorResult> = {};
         for (const [name, evaluator] of Object.entries(evaluators)) {
diff --git a/evals/vitest.config.ts b/evals/vitest.config.ts
index 7d2f4d5..9b363f4 100644
--- a/evals/vitest.config.ts
+++ b/evals/vitest.config.ts
@@ -18,7 +18,7 @@ export default defineConfig({
   test: {
     environment: "node",
     globals: true,
-    include: ["evals/**/*.{test,spec,eval}.ts", "evals/datasets/**/*.dataset.ts"],
+    include: ["evals/**/*.{test,spec,eval}.ts"],
     testTimeout: 120_000,
   },
 });

From 621b30972ec8f8e29f2ac19b09822fb25852c086 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 14:25:57 +0200
Subject: [PATCH 38/42] feat(evals): allow OPENAI_MODEL override for Ollama /
 LiteLLM proxies

The OpenAI adapter already accepted a `model` constructor option; this
pipes it through `createDefaultLlmProvider()` so operators can run the
eval suite against a local Ollama daemon at no cost:

    OPENAI_API_KEY=ollama \
    LITELLM_BASE_URL=http://localhost:11434/v1 \
    OPENAI_MODEL=llama3.1:8b \
    npm run test:evals

Default behaviour (gpt-4o-mini when only OPENAI_API_KEY is set) is
unchanged because `OpenAiProvider`'s `model = DEFAULT_MODEL` default
kicks in for `undefined`.
---
 evals/llm/index.ts | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/evals/llm/index.ts b/evals/llm/index.ts
index 5698ff5..b959fd4 100644
--- a/evals/llm/index.ts
+++ b/evals/llm/index.ts
@@ -14,10 +14,12 @@ import { OpenAiProvider } from "./openai.js";
  *
  * Priority order:
  *   1. ANTHROPIC_API_KEY → AnthropicProvider (claude-haiku-4-5-20251001)
- *   2. OPENAI_API_KEY    → OpenAiProvider / LiteLLM proxy (gpt-4o-mini)
+ *   2. OPENAI_API_KEY    → OpenAiProvider / LiteLLM proxy / Ollama (gpt-4o-mini default)
  *
  * Set LITELLM_BASE_URL alongside OPENAI_API_KEY to route through a LiteLLM
- * proxy, e.g. to reach Claude via the OpenAI-compatible endpoint.
+ * proxy, e.g. to reach Claude via the OpenAI-compatible endpoint. Set
+ * OPENAI_MODEL to override the chat model (e.g. `qwen2.5:32b-instruct-q4_K_M`
+ * when proxying through Ollama at `http://localhost:11434/v1`).
  */
 export function createDefaultLlmProvider(): LlmProvider {
   if (process.env.ANTHROPIC_API_KEY) {
@@ -25,6 +27,7 @@ export function createDefaultLlmProvider(): LlmProvider {
   }
   if (process.env.OPENAI_API_KEY) {
     return new OpenAiProvider({
+      model: process.env.OPENAI_MODEL,
       baseURL: process.env.LITELLM_BASE_URL,
     });
   }

From 2ebbf542f7ecaa57069c4b9ee7c6fc5c1789edfa Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 14:25:58 +0200
Subject: [PATCH 39/42] fix(evals): hide app-only tools from the LLM in
 runMcpHostLoop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tools registered via `registerAppTool(...)` with
`_meta.ui.visibility: ["app"]` are invoked by the React workbench via
`app.callServerTool()`. Real MCP hosts (Claude Desktop, Cursor) hide
them from the LLM. The eval harness was passing every tool from
`client.listTools()` straight to the model, so small open-source
models saw `start-translation` / `install-rules` / `find-rules` as
alternatives to `migrate-rules` / `manage-rules` and routed there
instead — collapsing activation rates and misrepresenting what a real
MCP host exposes.

`isVisibleToModel()` mirrors the host-side visibility contract:
- visibility unset → visible (default for model-facing tools)
- visibility includes "model" → visible
- visibility includes "app" without "model" → hidden

Baseline shift on llama3.1:8b (automatic-migration positives):
  before fix:  67% (4/6 — model called start-translation / install-rules)
  after fix:  100% (6/6 — model called migrate-rules every time)
---
 evals/runMcpHostLoop.ts | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/evals/runMcpHostLoop.ts b/evals/runMcpHostLoop.ts
index d6a732c..c157817 100644
--- a/evals/runMcpHostLoop.ts
+++ b/evals/runMcpHostLoop.ts
@@ -16,6 +16,25 @@ import { createDefaultLlmProvider } from "./llm/index.js";
 /** Maximum LLM → tool-call turns before halting to prevent runaway evals. */
 const MAX_TURNS = 8;
 
+/**
+ * Returns true when an MCP tool should be exposed to the LLM.
+ *
+ * Mirrors the MCP host visibility contract — tools marked
+ * `_meta.ui.visibility: ["app"]` (without `"model"`) are invoked exclusively
+ * by an MCP app via `app.callServerTool()`. Real hosts (Claude Desktop,
+ * Cursor) hide those from the LLM; the eval harness must do the same to
+ * match what the model actually sees in production.
+ */
+function isVisibleToModel(tool: { _meta?: unknown }): boolean {
+  const meta = tool._meta as
+    | { ui?: { visibility?: readonly string[] } }
+    | undefined;
+  const visibility = meta?.ui?.visibility;
+  if (!visibility || visibility.length === 0) return true;
+  if (visibility.includes("model")) return true;
+  return !visibility.includes("app");
+}
+
 export interface HostLoopOptions {
   /**
    * Pre-built MCP server to test against.
@@ -72,7 +91,13 @@ export async function runMcpHostLoop(
 
   try {
     const { tools: mcpTools } = await client.listTools();
-    const toolDefs = mcpTools.map((t) => ({
+    // Strip app-only tools — they're invoked by the React workbench via
+    // `app.callServerTool()` and a real MCP host (Claude Desktop, Cursor)
+    // hides them from the LLM by inspecting `_meta.ui.visibility`. Without
+    // this filter the model sees `find-rules`, `start-translation`,
+    // `install-rules`, etc. as alternatives to the model-facing entry
+    // points and the activation rate collapses on smaller models.
+    const toolDefs = mcpTools.filter(isVisibleToModel).map((t) => ({
       name: t.name,
       description: t.description ?? "",
       parameters: t.inputSchema as Record<string, unknown>,

From 0543e2052add18d99c81fce5b6fceb096113970c Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Fri, 15 May 2026 14:26:00 +0200
Subject: [PATCH 40/42] fix(evals): register all 7 model-facing tool groups in
 createEvalServer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the eval server only registered detection-rules and
migration. When a distractor query like "Create a new case for a
ransomware incident" hit the LLM, the model had no `manage-cases`
option to choose, so it forced a poor match on `manage-rules` and
the negative-activation evaluator collapsed.

A real MCP host exposes the full set of model-facing tools — the
eval server should match. Services are stubbed with `vi.fn()` because
skill-routing evaluators only inspect which tools were called, not
what they returned.

Tool groups registered (mirroring src/server.ts):
- alert-triage          → triage-alerts
- attack-discovery      → triage-attack-discoveries
- case-management       → manage-cases
- detection-rules       → manage-rules
- migration             → migrate-rules
- sample-data           → generate-sample-data
- threat-hunt           → threat-hunt

Baseline shift on llama3.1:8b (detection-rule-management distractors):
  before fix: 25% (1/4 — manage-rules over-selected on case/ESQL/alert queries)
  after fix: 100% (4/4 — model picks manage-cases / threat-hunt / etc. correctly)

docs/evals.md updated with the Ollama route and a note that
CLUSTERS_JSON is not required when using createEvalServer.
---
 docs/evals.md               | 14 ++++++-
 evals/helpers/evalServer.ts | 77 +++++++++++++++++++++++++++++++------
 2 files changed, 78 insertions(+), 13 deletions(-)

diff --git a/docs/evals.md b/docs/evals.md
index 486af33..17fb658 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -214,12 +214,22 @@ that structural evaluators can't express.
 
    ```bash
    # Anthropic (preferred)
-   ANTHROPIC_API_KEY=sk-ant-... CLUSTERS_JSON='[{...}]' npm run test:evals
+   ANTHROPIC_API_KEY=sk-ant-... npm run test:evals
 
    # OpenAI / LiteLLM proxy
-   OPENAI_API_KEY=sk-... LITELLM_BASE_URL=https://... CLUSTERS_JSON='[{...}]' npm run test:evals
+   OPENAI_API_KEY=sk-... LITELLM_BASE_URL=https://... npm run test:evals
+
+   # Local Ollama (zero-cost smoke run; tool-calling quality varies by model)
+   OPENAI_API_KEY=ollama \
+     LITELLM_BASE_URL=http://localhost:11434/v1 \
+     OPENAI_MODEL=qwen2.5:32b-instruct-q4_K_M \
+     npm run test:evals
    ```
 
+   `createEvalServer` stubs all Elastic-cluster calls, so no `CLUSTERS_JSON`
+   is needed when running skill-routing evaluators (`skill-activation`,
+   `tool-selection`, `negative-activation`, `trajectory`, `criteria`).
+
 4. **Trigger in CI**: open a PR and add the `evals` label (requires write access).
 
 ---
diff --git a/evals/helpers/evalServer.ts b/evals/helpers/evalServer.ts
index 750f9d7..2d1412f 100644
--- a/evals/helpers/evalServer.ts
+++ b/evals/helpers/evalServer.ts
@@ -7,27 +7,68 @@
 
 import { vi } from "vitest";
 import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { registerAlertTriageTools } from "../../src/tools/alert-triage.js";
+import { registerAttackDiscoveryTools } from "../../src/tools/attack-discovery.js";
+import { registerCaseManagementTools } from "../../src/tools/case-management.js";
 import { registerDetectionRuleTools } from "../../src/tools/detection-rules.js";
 import { registerMigrationTools } from "../../src/tools/migration.js";
+import { registerSampleDataTools } from "../../src/tools/sample-data.js";
+import { registerThreatHuntTools } from "../../src/tools/threat-hunt.js";
+import type { AlertsService } from "../../src/elastic/service/alertsService.js";
+import type { AttackDiscoveryService } from "../../src/elastic/service/attackDiscoveryService.js";
+import type { CasesService } from "../../src/elastic/service/casesService.js";
+import type { EntityDetailService } from "../../src/elastic/service/entityDetailService.js";
+import type { EsqlService } from "../../src/elastic/service/esqlService.js";
+import type { IndicesService } from "../../src/elastic/service/indicesService.js";
+import type { InvestigateService } from "../../src/elastic/service/investigateService.js";
 import type { MigrationsService } from "../../src/elastic/service/migrationsService.js";
 import type { RulesService } from "../../src/elastic/service/rulesService.js";
+import type { SampleDataService } from "../../src/elastic/service/sampleDataService.js";
 
 /**
- * Creates a real McpServer backed by stub services for eval runs.
+ * Stubs every service used by the seven tool groups registered on the live
+ * MCP server. Methods invoked by model-facing entry tools resolve to
+ * realistic-shaped empty payloads; other methods are bare `vi.fn()` because
+ * skill-routing evaluators only inspect which tools the LLM called, not
+ * what those tools returned.
  *
- * Only the methods called by the two model-facing tools (`migrate-rules`,
- * `manage-rules`) are stubbed — other service methods are left as bare
- * `vi.fn()` since they won't be invoked during skill-routing evals.
- *
- * A fresh server must be created for each `runMcpHostLoop` call because
- * the InMemoryTransport pair is torn down after every run.
- *
- * No live Elastic cluster is required — skill-routing evaluators only
- * inspect which tools the LLM called, not what those tools returned.
+ * Mirrors `src/server.ts` exactly: the LLM that drives the eval host loop
+ * must see the same tool surface a real MCP host (Claude Desktop, Cursor)
+ * exposes — otherwise we measure skill-selection against an artificially
+ * narrow distractor set and over-state activation rates for small models.
  */
 export function createEvalServer(): McpServer {
   const server = new McpServer({ name: "eval-server", version: "0.0.0" });
 
+  const alertsService = {
+    searchAlerts: vi.fn().mockResolvedValue({ alerts: [], total: 0 }),
+    findAlertById: vi.fn().mockResolvedValue(null),
+  } as unknown as AlertsService;
+
+  const attackDiscoveryService = {
+    listAttackDiscoveries: vi.fn().mockResolvedValue([]),
+  } as unknown as AttackDiscoveryService;
+
+  const casesService = {
+    findCases: vi.fn().mockResolvedValue({ cases: [], total: 0 }),
+  } as unknown as CasesService;
+
+  const entityDetailService = {
+    getEntityDetail: vi.fn().mockResolvedValue(null),
+  } as unknown as EntityDetailService;
+
+  const esqlService = {
+    executeQuery: vi.fn().mockResolvedValue({ columns: [], values: [] }),
+  } as unknown as EsqlService;
+
+  const indicesService = {
+    listIndices: vi.fn().mockResolvedValue([]),
+  } as unknown as IndicesService;
+
+  const investigateService = {
+    getRelatedAlerts: vi.fn().mockResolvedValue([]),
+  } as unknown as InvestigateService;
+
   const migrationsService = {
     listMigrations: vi.fn().mockResolvedValue([]),
   } as unknown as MigrationsService;
@@ -36,8 +77,22 @@ export function createEvalServer(): McpServer {
     findRules: vi.fn().mockResolvedValue({ data: [], total: 0 }),
   } as unknown as RulesService;
 
-  registerMigrationTools(server, { migrationsService });
+  const sampleDataService = {
+    listScenarios: vi.fn().mockResolvedValue([]),
+  } as unknown as SampleDataService;
+
+  registerAlertTriageTools(server, { alertsService });
+  registerAttackDiscoveryTools(server, { attackDiscoveryService, casesService });
+  registerCaseManagementTools(server, { casesService });
   registerDetectionRuleTools(server, { rulesService });
+  registerMigrationTools(server, { migrationsService });
+  registerSampleDataTools(server, { sampleDataService });
+  registerThreatHuntTools(server, {
+    esqlService,
+    indicesService,
+    investigateService,
+    entityDetailService,
+  });
 
   return server;
 }

From fad9e245ad4c546ce3634a09c325cd609b1c2a4b Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Mon, 18 May 2026 10:34:15 +0200
Subject: [PATCH 41/42] feat(evals): HostLoopOptions.systemPrompt + system-role
 on LlmMessage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds an optional host-level system prompt to the in-process MCP host
loop so the harness can pin LLM behavior to what a real MCP host
would instruct. Real hosts (Claude Desktop, Cursor) inject a system
prompt that constrains tool selection, response shape, and HITL
confirmation flow. Without one, the harness measures raw
model-vs-tools behavior — which over- or under-reports activation
depending on the model family.

Wired end-to-end:

  - HostLoopOptions.systemPrompt: optional string; empty/whitespace
    treated identically to omitting (the absence is observable in evals).
  - LlmMessage gains a `system` role variant so the prompt flows
    through the unified message shape both adapters consume.
  - OpenAI adapter: appends `role: "system"` as a normal message
    (Chat Completions schema accepts it natively).
  - Anthropic adapter: strips system-roled messages from the array
    and passes them via the top-level `system` parameter on
    `messages.create` — the only place Anthropic accepts a system
    prompt. The `toAnthropicMessages` helper's parameter type is
    narrowed to `Exclude<LlmMessage, { role: "system" }>` so the
    invariant is enforced at the type system, not in prose.

Tests:

  - 3 new harness tests covering the propagation contract:
      (a) systemPrompt is the first message when provided
      (b) no system message is injected when omitted
      (c) empty / whitespace-only strings are treated as omitted
  - All 23 harness tests pass (was 20).
  - Tests use a recording-LLM provider so the assertion is on what
    the adapter actually received, not on response side effects.

Docs:

  - docs/evals.md gains a "Host system prompt" section explaining
    the contract + provider-specific handling.
  - Drive-by: the Ollama example switched from
    `qwen2.5:32b-instruct-q4_K_M` (exposes /generate only, returns
    "does not support chat" against this harness) to `llama3.1:8b`
    which speaks the OpenAI Chat Completions schema. Caught
    end-to-end while validating the harness.

Anti-overengineering self-check:

  - Gate 1 (existing abstraction): HostLoopOptions already exists.
    `systemPrompt?: string` slots in without a new interface.
  - Gate 2 (real consumer): the next eval suite that wants to mimic
    Claude Desktop's HITL prompt; SKILL.md-driven evals that need
    the skill body as system context.
  - Gate 3 (smallest in-place): one new optional field, one new
    role variant, two adapter cases, three tests. ~30 LOC of
    behavior change excluding tests + docs.
  - Gate 6 (cost): default-off, no impact on existing callers.
---
 docs/evals.md           | 42 ++++++++++++++++++++-
 evals/harness.test.ts   | 84 ++++++++++++++++++++++++++++++++++++++++-
 evals/llm/anthropic.ts  | 21 ++++++++++-
 evals/llm/openai.ts     |  2 +
 evals/llm/types.ts      |  7 ++++
 evals/runMcpHostLoop.ts | 27 ++++++++++++-
 6 files changed, 176 insertions(+), 7 deletions(-)

diff --git a/docs/evals.md b/docs/evals.md
index 17fb658..7e39cb2 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -23,11 +23,12 @@ runner.ts ─ describe.skipIf(!RUN_LLM_EVALS)(dataset.name, () => {
    │            afterAll: print Markdown table to stdout
    │         })
    │
-   ├── runMcpHostLoop(input)
+   ├── runMcpHostLoop(input, opts?)
    │      InMemoryTransport ─ Client ─ McpServer
    │      LLM provider (Anthropic / OpenAI / LiteLLM)
    │      loop ≤ MAX_TURNS=8: LLM → tool calls → results → repeat
    │      returns Trajectory (ordered ToolCall[])
+   │      opts.systemPrompt: optional host-level system prompt (see below)
    │
    └── Evaluators
           skill-activation    binary: was skill tool called?
@@ -220,9 +221,12 @@ that structural evaluators can't express.
    OPENAI_API_KEY=sk-... LITELLM_BASE_URL=https://... npm run test:evals
 
    # Local Ollama (zero-cost smoke run; tool-calling quality varies by model)
+   # Use a model that speaks the OpenAI chat-completions schema. `llama3.1:8b`
+   # is a good baseline; `qwen2.5:32b-instruct-q4_K_M` exposes /generate only
+   # and returns "does not support chat" against this harness.
    OPENAI_API_KEY=ollama \
      LITELLM_BASE_URL=http://localhost:11434/v1 \
-     OPENAI_MODEL=qwen2.5:32b-instruct-q4_K_M \
+     OPENAI_MODEL=llama3.1:8b \
      npm run test:evals
    ```
 
@@ -234,6 +238,40 @@ that structural evaluators can't express.
 
 ---
 
+## Host system prompt (`HostLoopOptions.systemPrompt`)
+
+Real MCP hosts (Claude Desktop, Cursor) inject a host-level system
+prompt that constrains tool selection, response shape, and confirmation
+flow. Without one, the harness measures raw model-vs-tools behavior —
+which can over- or under-report activation depending on the model
+family. Use `HostLoopOptions.systemPrompt` to pin behavior to what
+production will instruct, or to swap in a `SKILL.md` body when testing
+skill-driven flows.
+
+```typescript
+import { runMcpHostLoop } from "./runMcpHostLoop.js";
+import { skillBody } from "../skills/automatic-migration/SKILL.md?raw";
+
+const trajectory = await runMcpHostLoop(example.input, {
+  server: createEvalServer(),
+  systemPrompt: skillBody,    // copy SKILL.md verbatim, like the real host
+});
+```
+
+Provider handling:
+
+- **OpenAI / LiteLLM** — `role: "system"` message is the first entry in
+  the `messages` array, per the Chat Completions schema.
+- **Anthropic** — the adapter strips system-roled messages out of the
+  array and passes their concatenated content via the top-level
+  `system` parameter on `messages.create` (the only place Anthropic
+  accepts a system prompt).
+- **Empty / whitespace-only string** — treated identically to omitting
+  the option (no system message is injected, no top-level parameter is
+  sent). This keeps "absence of system prompt" observable in evals.
+
+---
+
 ## CI gating
 
 ### Workflow: `.github/workflows/evals.yml`
diff --git a/evals/harness.test.ts b/evals/harness.test.ts
index de62b72..07b14ae 100644
--- a/evals/harness.test.ts
+++ b/evals/harness.test.ts
@@ -31,7 +31,11 @@ import {
   positiveExamples as amPositives,
   distractorExamples as amDistractors,
 } from "./datasets/automatic-migration.dataset.js";
-import type { LlmProvider, AssistantMessage } from "./llm/types.js";
+import type {
+  LlmProvider,
+  AssistantMessage,
+  LlmMessage,
+} from "./llm/types.js";
 import { createEvalServer } from "./helpers/evalServer.js";
 
 // ---------------------------------------------------------------------------
@@ -163,3 +167,81 @@ describe("eval harness: automatic-migration distractors", () => {
     });
   }
 });
+
+// ---------------------------------------------------------------------------
+// HostLoopOptions.systemPrompt — propagation contract
+//
+// Real MCP hosts inject a system prompt that constrains tool selection.
+// Verify the option flows from `runMcpHostLoop` to the provider's `chat()`
+// as a `role: "system"` message, AND that empty / whitespace-only strings
+// are dropped so the absence of a system prompt is observable.
+// ---------------------------------------------------------------------------
+
+describe("eval harness: systemPrompt propagation", () => {
+  /**
+   * Captures every `messages` array the provider's `chat()` receives so
+   * the test can assert what the harness handed off.
+   */
+  function makeRecordingLlm(): {
+    provider: LlmProvider;
+    calls: LlmMessage[][];
+  } {
+    const calls: LlmMessage[][] = [];
+    const provider: LlmProvider = {
+      async chat(messages): Promise<AssistantMessage> {
+        calls.push([...messages]);
+        return { role: "assistant", content: "Done." };
+      },
+    };
+    return { provider, calls };
+  }
+
+  it("prepends a system message when systemPrompt is provided", async () => {
+    const { provider, calls } = makeRecordingLlm();
+    await runMcpHostLoop("Find me my noisy rules", {
+      server: createEvalServer(),
+      llm: provider,
+      systemPrompt: "You are a security analyst. Always call a tool before answering.",
+    });
+
+    expect(calls.length).toBeGreaterThanOrEqual(1);
+    const firstTurn = calls[0]!;
+    expect(firstTurn[0]).toEqual({
+      role: "system",
+      content: "You are a security analyst. Always call a tool before answering.",
+    });
+    expect(firstTurn[1]).toEqual({
+      role: "user",
+      content: "Find me my noisy rules",
+    });
+  });
+
+  it("does not inject a system message when systemPrompt is omitted", async () => {
+    const { provider, calls } = makeRecordingLlm();
+    await runMcpHostLoop("Find me my noisy rules", {
+      server: createEvalServer(),
+      llm: provider,
+    });
+
+    expect(calls.length).toBeGreaterThanOrEqual(1);
+    const firstTurn = calls[0]!;
+    expect(firstTurn[0]?.role).toBe("user");
+    expect(firstTurn.some((m) => m.role === "system")).toBe(false);
+  });
+
+  it("treats empty / whitespace-only systemPrompt as omitted", async () => {
+    for (const prompt of ["", "   ", "\n\t"]) {
+      const { provider, calls } = makeRecordingLlm();
+      await runMcpHostLoop("Find me my noisy rules", {
+        server: createEvalServer(),
+        llm: provider,
+        systemPrompt: prompt,
+      });
+      const firstTurn = calls[0]!;
+      expect(
+        firstTurn.some((m) => m.role === "system"),
+        `empty-string systemPrompt (${JSON.stringify(prompt)}) should not inject a system message`
+      ).toBe(false);
+    }
+  });
+});
diff --git a/evals/llm/anthropic.ts b/evals/llm/anthropic.ts
index 70e9adc..a52ba6e 100644
--- a/evals/llm/anthropic.ts
+++ b/evals/llm/anthropic.ts
@@ -46,10 +46,23 @@ export class AnthropicProvider implements LlmProvider {
     messages: LlmMessage[],
     tools: LlmToolDefinition[]
   ): Promise<AssistantMessage> {
+    // Anthropic accepts the system prompt as a top-level parameter, not as
+    // a message in the array. Concatenate any system-roled messages from
+    // the unified LlmMessage shape into one string and strip them before
+    // converting the remaining history.
+    const systemMessages = messages.filter(
+      (m): m is Extract<LlmMessage, { role: "system" }> => m.role === "system"
+    );
+    const system = systemMessages.map((m) => m.content).join("\n\n");
+    const nonSystem = messages.filter(
+      (m): m is Exclude<LlmMessage, { role: "system" }> => m.role !== "system"
+    );
+
     const response = await this.client.messages.create({
       model: this.model,
       max_tokens: MAX_TOKENS,
-      messages: toAnthropicMessages(messages),
+      ...(system.length > 0 ? { system } : {}),
+      messages: toAnthropicMessages(nonSystem),
       ...(tools.length > 0 ? { tools: tools.map(toAnthropicTool) } : {}),
     });
 
@@ -87,12 +100,16 @@ export class AnthropicProvider implements LlmProvider {
  * Structural differences from OpenAI:
  *   - Anthropic has no `tool` role. Tool results go as `user` messages with
  *     `tool_result` content blocks.
+ *   - Anthropic has no `system` message role — system prompts flow through
+ *     the top-level `system` parameter on `messages.create`. Callers strip
+ *     system messages before calling this function; the parameter type
+ *     enforces that invariant.
  *   - Consecutive tool-result messages are merged into a single user message
  *     so the API never receives two adjacent user turns.
  *   - Assistant content is an array of TextBlockParam / ToolUseBlockParam.
  */
 function toAnthropicMessages(
-  messages: LlmMessage[]
+  messages: Exclude<LlmMessage, { role: "system" }>[]
 ): Anthropic.MessageParam[] {
   const result: Anthropic.MessageParam[] = [];
 
diff --git a/evals/llm/openai.ts b/evals/llm/openai.ts
index a6dd59b..ab6e1fc 100644
--- a/evals/llm/openai.ts
+++ b/evals/llm/openai.ts
@@ -88,6 +88,8 @@ export class OpenAiProvider implements LlmProvider {
 
 function toOaiMessage(msg: LlmMessage): OpenAI.ChatCompletionMessageParam {
   switch (msg.role) {
+    case "system":
+      return { role: "system", content: msg.content };
     case "user":
       return { role: "user", content: msg.content };
     case "assistant":
diff --git a/evals/llm/types.ts b/evals/llm/types.ts
index b5fef9b..44a4d04 100644
--- a/evals/llm/types.ts
+++ b/evals/llm/types.ts
@@ -28,8 +28,15 @@ export interface LlmToolCallRequest {
  * Discriminated union covering every role that can appear in a chat thread.
  * Shaped after the OpenAI chat messages API so a single interface works for
  * both the OpenAI and Anthropic adapters (and any LiteLLM proxy in between).
+ *
+ * Anthropic note: Anthropic's HTTP API takes the system prompt as a
+ * top-level `system: string` parameter on `messages.create`, not inside
+ * the messages array. The adapter extracts `system`-roled messages from
+ * the union and passes them via that parameter — this discriminant only
+ * dictates the SHAPE the harness uses internally.
  */
 export type LlmMessage =
+  | { role: "system"; content: string }
   | { role: "user"; content: string }
   | {
       role: "assistant";
diff --git a/evals/runMcpHostLoop.ts b/evals/runMcpHostLoop.ts
index c157817..b750f2f 100644
--- a/evals/runMcpHostLoop.ts
+++ b/evals/runMcpHostLoop.ts
@@ -57,6 +57,20 @@ export interface HostLoopOptions {
    * Defaults to MAX_TURNS (8).
    */
   maxTurns?: number;
+  /**
+   * Optional system prompt prepended to the message history.
+   *
+   * Real MCP hosts (Claude Desktop, Cursor) inject a host-level system prompt
+   * that constrains tool selection, response shape, and confirmation flow.
+   * Without one, the harness measures raw model-vs-tools behavior, which can
+   * over- or under-report activation depending on the model family. Use this
+   * to pin behavior to what the production host will instruct, or to swap in
+   * a SKILL.md body when testing skill-driven flows.
+   *
+   * Pass a non-empty string. Empty strings are ignored to keep behavior
+   * identical to omitting the option.
+   */
+  systemPrompt?: string;
 }
 
 /**
@@ -78,7 +92,12 @@ export interface HostLoopOptions {
  */
 export async function runMcpHostLoop(
   input: string,
-  { server, llm, maxTurns = MAX_TURNS }: HostLoopOptions = {}
+  {
+    server,
+    llm,
+    maxTurns = MAX_TURNS,
+    systemPrompt,
+  }: HostLoopOptions = {}
 ): Promise<Trajectory> {
   const resolvedServer = server ?? createServer();
   const resolvedLlm = llm ?? createDefaultLlmProvider();
@@ -103,7 +122,11 @@ export async function runMcpHostLoop(
       parameters: t.inputSchema as Record<string, unknown>,
     }));
 
-    const messages: LlmMessage[] = [{ role: "user", content: input }];
+    const messages: LlmMessage[] = [];
+    if (systemPrompt && systemPrompt.trim().length > 0) {
+      messages.push({ role: "system", content: systemPrompt });
+    }
+    messages.push({ role: "user", content: input });
     const trajectory: Trajectory = [];
 
     for (let turn = 0; turn < maxTurns; turn++) {

From 615fa8f111c9d866a1ff74ea0b453d947f1dd503 Mon Sep 17 00:00:00 2001
From: Patryk Kopycinski <patryk.kopycinski@elastic.co>
Date: Mon, 18 May 2026 10:54:39 +0200
Subject: [PATCH 42/42] =?UTF-8?q?docs(evals):=20=E2=89=A514B=20local-LLM?=
 =?UTF-8?q?=20floor=20+=20drop=20llama3.1:8b=20baseline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

llama3.1:8b is below the threshold where tool-calling decisions
produce useful signal (team eval finding: ≥14B parameters is the
floor). Sub-14B 'passes' are coincidence, not a result, so
documenting an 8B as the 'good baseline' propagates a floor that
masks real harness bugs (#25/#26/#27) and green-lights skills
that aren't ready.

Replace with the explicit ≥14B parameter requirement, a chat-
completions caveat (qwen2.5:32b-instruct-q4_K_M legitimately
returns 'does not support chat' against /v1/chat/completions as
of Ollama 0.3.x), and verified candidates the next reader can
pull. See elastic/agent-builder-skill-dev-cursor-plugin
anti-pattern #28 for the full rationale.
---
 docs/evals.md | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/docs/evals.md b/docs/evals.md
index 7e39cb2..ee144cc 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -221,12 +221,22 @@ that structural evaluators can't express.
    OPENAI_API_KEY=sk-... LITELLM_BASE_URL=https://... npm run test:evals
 
    # Local Ollama (zero-cost smoke run; tool-calling quality varies by model)
-   # Use a model that speaks the OpenAI chat-completions schema. `llama3.1:8b`
-   # is a good baseline; `qwen2.5:32b-instruct-q4_K_M` exposes /generate only
-   # and returns "does not support chat" against this harness.
+   #
+   # Pick a model that meets BOTH of these requirements:
+   #   (1) ≥14B parameters — anything smaller (e.g. llama3.1:8b, qwen3:8b)
+   #       falls below the threshold where tool-calling decisions become
+   #       useful signal rather than noise; sub-14B "passes" are coincidence,
+   #       not a result.
+   #   (2) Exposes /v1/chat/completions — required by this harness. A few
+   #       Ollama tags expose /generate only and return
+   #       "does not support chat" (notably qwen2.5:32b-instruct-q4_K_M as
+   #       of Ollama 0.3.x).
+   #
+   # Verified candidates: `qwen2.5:14b-instruct`, `qwen3:14b`, `mistral-small:24b`,
+   # `qwen2.5:32b-instruct` (non-q4_K_M tags). `ollama pull <model>` first.
    OPENAI_API_KEY=ollama \
      LITELLM_BASE_URL=http://localhost:11434/v1 \
-     OPENAI_MODEL=llama3.1:8b \
+     OPENAI_MODEL=qwen2.5:14b-instruct \
      npm run test:evals
    ```