diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml new file mode 100644 index 0000000..c4b951a --- /dev/null +++ b/.github/workflows/evals.yml @@ -0,0 +1,87 @@ +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0; you may not use this file except in compliance with the Elastic License +# 2.0. + +name: Evals + +on: + # Manually trigger a run from the Actions UI (useful for ad-hoc evaluation). + workflow_dispatch: + + # Nightly run at 02:00 UTC to catch regressions before the work day starts. + schedule: + - cron: "0 2 * * *" + + # Run when a PR is labeled with `evals`. Labels require write permission, so + # this implicitly limits triggering to maintainers — acceptable because + # pull_request_target runs with base-repo secrets. + pull_request_target: + types: [labeled] + +# Cancel superseded runs per PR and label (per ref for manual/nightly runs). Under +# pull_request_target github.ref is the base branch, so it must not be the only key. +concurrency: + group: evals-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name }} + cancel-in-progress: true + +jobs: + evals: + name: LLM Eval Suite + runs-on: ubuntu-latest + + # For pull_request_target, gate strictly on the evals label so the job + # doesn't fire for every other label event. + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'schedule' || + (github.event_name == 'pull_request_target' && github.event.label.name == 'evals') + + steps: + - uses: actions/checkout@v4 + with: + # For pull_request_target, check out the PR head so the eval runs + # against the proposed changes, not the base branch. 
+ ref: >- + ${{ + github.event_name == 'pull_request_target' + && github.event.pull_request.head.sha + || github.sha + }} + + - uses: actions/setup-node@v4 + with: + node-version: 22 + cache: npm + + - name: Install dependencies + run: npm ci + + - name: Run evals + env: + RUN_LLM_EVALS: "1" + # Set ANTHROPIC_API_KEY to use Claude Haiku (preferred); fall back to + # OPENAI_API_KEY for GPT-4o-mini. Set EVAL_LITELLM_BASE_URL to route + # through a LiteLLM proxy instead of the direct OpenAI endpoint. + ANTHROPIC_API_KEY: ${{ secrets.EVAL_ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.EVAL_OPENAI_API_KEY }} + LITELLM_BASE_URL: ${{ secrets.EVAL_LITELLM_BASE_URL }} + # JSON array describing the Elastic cluster the MCP server targets. + # Shape: [{"name":"primary","elasticsearchUrl":"...","kibanaUrl":"...","elasticsearchApiKey":"..."}] + CLUSTERS_JSON: ${{ secrets.EVAL_CLUSTERS_JSON }} + run: | + set -o pipefail + npm run test:evals 2>&1 | tee eval-output.txt + + - name: Post eval results to job summary + if: always() + run: | + if [ -f eval-output.txt ]; then + echo "## Eval results" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + cat eval-output.txt >> "$GITHUB_STEP_SUMMARY" + else + echo "## Eval results" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "_No eval output captured._" >> "$GITHUB_STEP_SUMMARY" + fi diff --git a/README.md b/README.md index b110713..7f92df6 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ An [MCP App](https://modelcontextprotocol.io/extensions/apps/overview) that brin ## What This Does -This project provides six interactive security operations tools, each with a rich React-based UI that renders inline when Claude (or another MCP host) calls the tool: +This project provides seven interactive security operations tools, each with a rich React-based UI that renders inline when Claude (or another MCP host) calls the tool: | Tool | What It Does | |------|-------------| @@ -24,6 +24,7 @@ This project 
provides six interactive security operations tools, each with a ric | **Detection Rules** | Browse, tune, and manage detection rules with KQL search and noisy rules analysis | | **Threat Hunt** | ES\|QL workbench with clickable entities and a D3 investigation graph | | **Sample Data** | Generate ECS security events for demos across 4 attack chain scenarios | +| **SIEM Migration** | Migrate detection rules from Splunk to Elastic Security — upload SPL, AI-translate, review per-rule diff, fix resources, and install | See [docs/features.md](docs/features.md) for a full breakdown of each tool's capabilities. diff --git a/docs/evals.md b/docs/evals.md new file mode 100644 index 0000000..ee144cc --- /dev/null +++ b/docs/evals.md @@ -0,0 +1,318 @@ +# Eval Harness + +LLM-powered evals for the Elastic Security MCP app's skill layer. The harness +tests whether the LLM host activates the right skill, calls the right tools in +the right order, and does not fire on unrelated queries. + +Regular `npm test` never touches this harness — it only runs when +`RUN_LLM_EVALS=1` is set, so CI stays fast and free of LLM costs. + +--- + +## Architecture + +``` +Dataset (examples) + │ + ▼ +runner.ts ─ describe.skipIf(!RUN_LLM_EVALS)(dataset.name, () => { + │ for each example: + │ trajectory = await runMcpHostLoop(input) + │ scores = await evaluators[*](trajectory, expected) + │ assert score >= passingScore + │ afterAll: print Markdown table to stdout + │ }) + │ + ├── runMcpHostLoop(input, opts?) + │ InMemoryTransport ─ Client ─ McpServer + │ LLM provider (Anthropic / OpenAI / LiteLLM) + │ loop ≤ MAX_TURNS=8: LLM → tool calls → results → repeat + │ returns Trajectory (ordered ToolCall[]) + │ opts.systemPrompt: optional host-level system prompt (see below) + │ + └── Evaluators + skill-activation binary: was skill tool called? + negative-activation binary: was skill tool correctly absent? 
+ tool-selection F1 precision/recall against expected.tools + trajectory LCS similarity of actual vs expected sequence + criteria LLM-as-judge against natural-language assertions +``` + +### Key design choices + +| Decision | Rationale | +|---|---| +| In-process via `InMemoryTransport` | No network, no server process — evals run anywhere | +| `describe.skipIf(!RUN_LLM_EVALS)` | Zero LLM cost in regular `npm test` | +| `Evaluator` is a plain function | Easy to compose; factory pattern for stateful evaluators (criteria) | +| `'N/A'` return instead of 0 | Datasets omit irrelevant evaluator dimensions without masking real regressions | +| LCS for trajectory | Order matters; set-based coverage is tool-selection's job | + +--- + +## Dataset shape + +A dataset is a `Dataset` object exported from a `*.dataset.ts` file: + +```typescript +import type { Dataset } from "../types.js"; + +export const myDataset: Dataset = { + name: "my-skill", + examples: [ + { + id: "ms-pos-01", // stable, unique — appears in CI summaries + input: "user message to the LLM", // the query sent to runMcpHostLoop + expected: { + skill: "entry-point-tool-name", // tool the skill SKILL.md instructs the LLM to call + tools: ["entry-point-tool-name"], // ordered list for trajectory/tool-selection + criteria: [ // natural-language assertions for LLM-as-judge + "The model called the correct entry-point tool", + ], + }, + }, + ], +}; +``` + +All three `expected` fields are **optional**: + +| Field | Evaluators that use it | Omit when… | +|---|---|---| +| `skill` | `skill-activation`, `negative-activation` | Dataset doesn't test skill routing | +| `tools` | `tool-selection`, `trajectory` | No ordered tool expectation | +| `criteria` | `criteria` | No LLM-as-judge needed (saves cost) | + +Omitting a field causes the evaluator to return `'N/A'` for that example rather than a false 0. + +### Positive vs distractor examples + +A **positive** example is a query that *should* activate the skill. 
+A **distractor** example is an unrelated query that *should not*. + +Use separate `runDataset` calls with different evaluators for each group: + +```typescript +// Positive: skill should fire +runDataset( + { name: "my-skill: positives", examples: positiveExamples }, + { "skill-activation": skillActivation, "tool-selection": toolSelection }, + { passingScore: 0.8 } +); + +// Distractor: skill must NOT fire (gate is 100%) +runDataset( + { name: "my-skill: distractors", examples: distractorExamples }, + { "negative-activation": negativeActivation }, + { passingScore: 1.0 } +); +``` + +--- + +## Evaluator catalog + +### `skill-activation` + +**Type**: binary · **Score**: `1` if `expected.skill` found in trajectory, `0` otherwise +**Returns `'N/A'`**: when `expected.skill` is absent +**Gate**: ≥ 0.8 on positive examples (use `passingScore: 0.8`) + +Tests whether the LLM called the skill's model-facing entry-point tool at +least once. + +### `negative-activation` + +**Type**: binary · **Score**: `1` if `expected.skill` is *absent* from trajectory, `0` if present +**Returns `'N/A'`**: when `expected.skill` is absent +**Gate**: 1.0 on distractor examples (use `passingScore: 1.0`) + +Tests that the skill does not over-trigger on unrelated queries. Any false +positive here means the skill's SKILL.md is too broad. + +### `tool-selection` + +**Type**: F1 · **Score**: harmonic mean of precision and recall against `expected.tools` (set-based) +**Returns `'N/A'`**: when `expected.tools` is absent +**Gate**: ≥ 0.8 on positive examples + +Tests *which* tools were called, ignoring order. Missed tools lower recall; +spurious tools lower precision. Failure reason includes `missed: [...]` and +`extra: [...]`. + +### `trajectory` + +**Type**: LCS similarity · **Score**: `lcs(actual, expected) / max(|actual|, |expected|)` +**Returns `'N/A'`**: when `expected.tools` is absent +**Gate**: ≥ 0.7 on positive examples (sequence matching is looser than set matching) + +Tests *order*. 
Dividing by `max` penalises both missing and extra steps. +Use alongside `tool-selection` for full coverage. + +### `criteria` + +**Type**: LLM-as-judge · **Score**: `0.0–1.0` parsed from a rubric prompt response +**Returns `'N/A'`**: when `expected.criteria` is absent +**Gate**: ≥ 0.7 + +Calls the judge LLM with the trajectory `{tool, args}` pairs and the +criteria list. Asks for `{"score": <0–1>, "reasoning": "..."}`. Falls back +to regex number extraction if JSON parse fails. Use for semantic assertions +that structural evaluators can't express. + +**Cost**: one extra LLM call per example. Omit `expected.criteria` to skip. + +--- + +## How to add a dataset + +1. **Create the data file** `evals/datasets/<skill-name>.dataset.ts`: + + ```typescript + import type { Dataset, Example } from "../types.js"; + + const SKILL_TOOL = "my-tool"; // the model-facing entry-point tool + + export const positiveExamples: Example[] = [ + { id: "ms-pos-01", input: "...", expected: { skill: SKILL_TOOL, tools: [SKILL_TOOL] } }, + // add ≥ 4 examples + ]; + + export const distractorExamples: Example[] = [ + { id: "ms-neg-01", input: "...", expected: { skill: SKILL_TOOL } }, + // add ≥ 4 examples + ]; + + export const myDataset: Dataset = { + name: "<skill-name>", + examples: [...positiveExamples, ...distractorExamples], + }; + ``` + +2. 
**Create the eval spec** `evals/<skill-name>.eval.test.ts`: + + ```typescript + import { runDataset } from "./runner.js"; + import { positiveExamples, distractorExamples } from "./datasets/<skill-name>.dataset.js"; + import { skillActivation } from "./evaluators/skill-activation.js"; + import { negativeActivation } from "./evaluators/negative-activation.js"; + import { toolSelection } from "./evaluators/tool-selection.js"; + + runDataset( + { name: "<skill-name>: positives", examples: positiveExamples }, + { "skill-activation": skillActivation, "tool-selection": toolSelection }, + { passingScore: 0.8 } + ); + + runDataset( + { name: "<skill-name>: distractors", examples: distractorExamples }, + { "negative-activation": negativeActivation }, + { passingScore: 1.0 } + ); + ``` + +3. **Run locally**: + + ```bash + # Anthropic (preferred) + ANTHROPIC_API_KEY=sk-ant-... npm run test:evals + + # OpenAI / LiteLLM proxy + OPENAI_API_KEY=sk-... LITELLM_BASE_URL=https://... npm run test:evals + + # Local Ollama (zero-cost smoke run; tool-calling quality varies by model) + # + # Pick a model that meets BOTH of these requirements: + # (1) ≥14B parameters — anything smaller (e.g. llama3.1:8b, qwen3:8b) + # falls below the threshold where tool-calling decisions become + # useful signal rather than noise; sub-14B "passes" are coincidence, + # not a result. + # (2) Exposes /v1/chat/completions — required by this harness. A few + # Ollama tags expose /generate only and return + # "does not support chat" (notably qwen2.5:32b-instruct-q4_K_M as + # of Ollama 0.3.x). + # + # Verified candidates: `qwen2.5:14b-instruct`, `qwen3:14b`, `mistral-small:24b`, + # `qwen2.5:32b-instruct` (non-q4_K_M tags). `ollama pull <model>` first. 
+ OPENAI_API_KEY=ollama \ + LITELLM_BASE_URL=http://localhost:11434/v1 \ + OPENAI_MODEL=qwen2.5:14b-instruct \ + npm run test:evals + ``` + + `createEvalServer` stubs all Elastic-cluster calls, so no `CLUSTERS_JSON` + is needed when running skill-routing evaluators (`skill-activation`, + `tool-selection`, `negative-activation`, `trajectory`, `criteria`). + +4. **Trigger in CI**: open a PR and add the `evals` label (requires write access). + +--- + +## Host system prompt (`HostLoopOptions.systemPrompt`) + +Real MCP hosts (Claude Desktop, Cursor) inject a host-level system +prompt that constrains tool selection, response shape, and confirmation +flow. Without one, the harness measures raw model-vs-tools behavior — +which can over- or under-report activation depending on the model +family. Use `HostLoopOptions.systemPrompt` to pin behavior to what +production will instruct, or to swap in a `SKILL.md` body when testing +skill-driven flows. + +```typescript +import { runMcpHostLoop } from "./runMcpHostLoop.js"; +import skillBody from "../skills/automatic-migration/SKILL.md?raw"; + +const trajectory = await runMcpHostLoop(example.input, { + server: createEvalServer(), + systemPrompt: skillBody, // copy SKILL.md verbatim, like the real host +}); +``` + +Provider handling: + +- **OpenAI / LiteLLM** — `role: "system"` message is the first entry in + the `messages` array, per the Chat Completions schema. +- **Anthropic** — the adapter strips system-roled messages out of the + array and passes their concatenated content via the top-level + `system` parameter on `messages.create` (the only place Anthropic + accepts a system prompt). +- **Empty / whitespace-only string** — treated identically to omitting + the option (no system message is injected, no top-level parameter is + sent). This keeps "absence of system prompt" observable in evals. 
+ +--- + +## CI gating + +### Workflow: `.github/workflows/evals.yml` + +| Trigger | When | +|---|---| +| `workflow_dispatch` | Manual run from Actions UI | +| `schedule` | Nightly at 02:00 UTC | +| `pull_request_target` | When `evals` label is added to a PR | + +The concurrency group `evals-` cancels superseded runs to avoid wasting +LLM quota on stale pushes. + +### Required secrets + +| Secret | Purpose | +|---|---| +| `EVAL_ANTHROPIC_API_KEY` | Anthropic API key (priority provider) | +| `EVAL_OPENAI_API_KEY` | OpenAI / LiteLLM API key (fallback) | +| `EVAL_LITELLM_BASE_URL` | Optional LiteLLM proxy base URL | +| `EVAL_CLUSTERS_JSON` | Elastic cluster credentials for the MCP server | + +### Passing thresholds (recommended defaults) + +| Evaluator | Positives | Distractors | +|---|---|---| +| `skill-activation` | ≥ 0.8 | — | +| `negative-activation` | — | = 1.0 | +| `tool-selection` | ≥ 0.8 | — | +| `trajectory` | ≥ 0.7 | — | +| `criteria` | ≥ 0.7 | — | + +Results are posted as a Markdown table to the GitHub Actions job summary +(`$GITHUB_STEP_SUMMARY`) after every run. diff --git a/docs/features.md b/docs/features.md index 027b5c9..c2c4620 100644 --- a/docs/features.md +++ b/docs/features.md @@ -79,3 +79,17 @@ Rule management dashboard: Generate ECS-compliant security events: - Windows Credential Theft, AWS Privilege Escalation, Okta Identity Takeover, Ransomware Kill Chain - All data tagged for safe cleanup + +## SIEM Migration + +Guided workbench for migrating detection rules from Splunk (QRadar and Sentinel-One support coming) to Elastic Security. 
Triggered by the `automatic-migration` skill (`migrate-rules` tool): + +- **Vendor selector**: Splunk active; QRadar and Sentinel-One shown as "Coming soon" — re-enabling a vendor is a one-line flag flip +- **Upload step**: drag-and-drop a JSON export file, use the file picker, or paste a rules array directly +- **AI translation**: Kibana's SIEM migrations service converts SPL to Elastic detection rule JSON; a live progress bar polls every 3 seconds +- **Three-column review**: original SPL / generated rule (read-only) / editable rule side-by-side for every translated rule +- **Per-rule drawer**: structured form for key rule fields (name, description, type, query, language, severity, risk score); "Re-validate" saves as `partial`, "Save" uses the selected translation result +- **Resources drawer**: lists all unresolved macros and lookups auto-expanded; each row has an individual Save button; resolved definitions collapsible +- **Translation statuses**: `full` (production-ready), `partial` (needs tuning), `untranslatable` (skipped at install) +- **Install step**: one-click install of all translatable rules into Elastic Security as disabled; "Back to review" available before confirming +- **Done summary**: installed/failed tile counts; "Start another migration" resets the workbench diff --git a/eslint.config.js b/eslint.config.js index 382ca72..cde436c 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -16,6 +16,7 @@ export default tseslint.config( files: [ 'src/**/*.ts', 'src/**/*.tsx', + 'evals/**/*.ts', '*.ts', 'scripts/**/*.js', '*.mjs', diff --git a/evals/automatic-migration.eval.test.ts b/evals/automatic-migration.eval.test.ts new file mode 100644 index 0000000..438bb71 --- /dev/null +++ b/evals/automatic-migration.eval.test.ts @@ -0,0 +1,55 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** + * End-to-end eval spec for the automatic-migration skill. + * + * Proves skill-activation and boundary discrimination against the + * automatic-migration proof dataset. Run via: + * + * npm run test:evals + * + * This suite is skipped in regular `npm test` because runDataset wraps + * everything in `describe.skipIf(!process.env.RUN_LLM_EVALS)`. + * + * Gate summary: + * positives — skill-activation + tool-selection ≥ 80% + * distractors — negative-activation = 100% (any false positive is a regression) + */ + +import { runDataset } from "./runner.js"; +import { + positiveExamples, + distractorExamples, +} from "./datasets/automatic-migration.dataset.js"; +import { skillActivation } from "./evaluators/skill-activation.js"; +import { negativeActivation } from "./evaluators/negative-activation.js"; +import { toolSelection } from "./evaluators/tool-selection.js"; +import { createEvalServer } from "./helpers/evalServer.js"; + +runDataset( + { + name: "automatic-migration: positives", + examples: positiveExamples, + }, + { + "skill-activation": skillActivation, + "tool-selection": toolSelection, + }, + { passingScore: 0.8, createServer: createEvalServer } +); + +runDataset( + { + name: "automatic-migration: distractors", + examples: distractorExamples, + }, + { + "negative-activation": negativeActivation, + }, + { passingScore: 1.0, createServer: createEvalServer } // 100% — any false positive is a regression +); diff --git a/evals/datasets/automatic-migration.dataset.ts b/evals/datasets/automatic-migration.dataset.ts new file mode 100644 index 0000000..9c02ca1 --- /dev/null +++ b/evals/datasets/automatic-migration.dataset.ts @@ -0,0 +1,128 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Dataset, Example } from "../types.js"; + +/** + * The model-facing entry-point tool registered by the + * automatic-migration skill (src/tools/migration.ts). + */ +const SKILL_TOOL = "migrate-rules"; + +// --------------------------------------------------------------------------- +// Positive examples — the LLM should call migrate-rules +// --------------------------------------------------------------------------- + +export const positiveExamples: Example[] = [ + { + id: "am-pos-01", + input: "I want to migrate my Splunk rules to Elastic Security", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, + { + id: "am-pos-02", + input: "Help me upload my SPL bundle and convert the detections", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, + { + id: "am-pos-03", + input: "We're onboarding from Splunk — how do I bring our detection rules over?", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, + { + id: "am-pos-04", + input: "Start a SIEM migration for our 200 Splunk correlation rules", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, + { + id: "am-pos-05", + input: "Convert our detection rules from Splunk to Elastic format", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, + { + id: "am-pos-06", + input: "Install the translated rules from my last migration run", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, +]; + +// --------------------------------------------------------------------------- +// Distractor examples — the LLM should NOT call migrate-rules +// --------------------------------------------------------------------------- + +export const distractorExamples: Example[] = [ + { + id: "am-neg-01", + input: "Show me which detection rules are generating the most false positives", + expected: { + // skill 
is set so negativeActivation knows which tool to check for absence + skill: SKILL_TOOL, + }, + }, + { + id: "am-neg-02", + input: "Triage the open critical alerts from the last 24 hours", + expected: { + skill: SKILL_TOOL, + }, + }, + { + id: "am-neg-03", + input: "Create a threat hunt for lateral movement via PsExec", + expected: { + skill: SKILL_TOOL, + }, + }, + { + id: "am-neg-04", + input: "Open a new case for the ransomware incident on host SRVWIN04", + expected: { + skill: SKILL_TOOL, + }, + }, + { + id: "am-neg-05", + input: "Run an ES|QL query to find brute-force login attempts in the last hour", + expected: { + skill: SKILL_TOOL, + }, + }, + { + id: "am-neg-06", + input: "Generate sample endpoint data so I can test my detection rules", + expected: { + skill: SKILL_TOOL, + }, + }, +]; + +// --------------------------------------------------------------------------- +// Export the full dataset for reference / cross-dataset tooling +// --------------------------------------------------------------------------- + +export const automaticMigrationDataset: Dataset = { + name: "automatic-migration", + examples: [...positiveExamples, ...distractorExamples], +}; diff --git a/evals/datasets/detection-rule-management.dataset.ts b/evals/datasets/detection-rule-management.dataset.ts new file mode 100644 index 0000000..a1e2a2c --- /dev/null +++ b/evals/datasets/detection-rule-management.dataset.ts @@ -0,0 +1,99 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Dataset, Example } from "../types.js"; + +/** + * The model-facing entry-point tool registered by the + * detection-rule-management skill (src/tools/detection-rules.ts). 
+ */ +const SKILL_TOOL = "manage-rules"; + +// --------------------------------------------------------------------------- +// Positive examples — the LLM should call manage-rules +// --------------------------------------------------------------------------- + +export const positiveExamples: Example[] = [ + { + id: "drm-pos-01", + input: "Show me my noisy rules — which detection rules are generating the most alerts?", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, + { + id: "drm-pos-02", + input: "List all my currently enabled detection rules", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, + { + id: "drm-pos-03", + input: "Find high severity detection rules related to PowerShell execution", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, + { + id: "drm-pos-04", + input: "What detection rules do I have covering initial access tactics?", + expected: { + skill: SKILL_TOOL, + tools: [SKILL_TOOL], + }, + }, +]; + +// --------------------------------------------------------------------------- +// Distractor examples — the LLM should NOT call manage-rules +// --------------------------------------------------------------------------- + +export const distractorExamples: Example[] = [ + { + id: "drm-neg-01", + input: "Create a new case for a ransomware incident I'm currently investigating", + expected: { + // skill is set so negativeActivation knows which tool to check for absence + skill: SKILL_TOOL, + }, + }, + { + id: "drm-neg-02", + input: "Show me all critical alerts that fired in the last hour", + expected: { + skill: SKILL_TOOL, + }, + }, + { + id: "drm-neg-03", + input: "Run an ES|QL query to find failed SSH login attempts on my Linux hosts", + expected: { + skill: SKILL_TOOL, + }, + }, + { + id: "drm-neg-04", + input: "A process on host web-01 just spawned cmd.exe — help me investigate", + expected: { + skill: SKILL_TOOL, + }, + }, +]; + +// 
--------------------------------------------------------------------------- +// Export the full dataset for reference / cross-dataset tooling +// --------------------------------------------------------------------------- + +export const detectionRuleManagementDataset: Dataset = { + name: "detection-rule-management", + examples: [...positiveExamples, ...distractorExamples], +}; + diff --git a/evals/detection-rule-management.eval.test.ts b/evals/detection-rule-management.eval.test.ts new file mode 100644 index 0000000..23d14f2 --- /dev/null +++ b/evals/detection-rule-management.eval.test.ts @@ -0,0 +1,55 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** + * End-to-end eval spec for the detection-rule-management skill. + * + * Proves the eval harness (runner → runMcpHostLoop → evaluators) works + * against a real registered skill using the proof dataset. Run via: + * + * npm run test:evals + * + * This suite is skipped in regular `npm test` because runDataset wraps + * everything in `describe.skipIf(!process.env.RUN_LLM_EVALS)`. 
+ * + * Gate summary: + * positives — skill-activation + tool-selection ≥ 80% + * distractors — negative-activation = 100% (any false positive is a regression) + */ + +import { runDataset } from "./runner.js"; +import { + positiveExamples, + distractorExamples, +} from "./datasets/detection-rule-management.dataset.js"; +import { skillActivation } from "./evaluators/skill-activation.js"; +import { negativeActivation } from "./evaluators/negative-activation.js"; +import { toolSelection } from "./evaluators/tool-selection.js"; +import { createEvalServer } from "./helpers/evalServer.js"; + +runDataset( + { + name: "detection-rule-management: positives", + examples: positiveExamples, + }, + { + "skill-activation": skillActivation, + "tool-selection": toolSelection, + }, + { passingScore: 0.8, createServer: createEvalServer } +); + +runDataset( + { + name: "detection-rule-management: distractors", + examples: distractorExamples, + }, + { + "negative-activation": negativeActivation, + }, + { passingScore: 1.0, createServer: createEvalServer } // 100% — any false positive is a regression +); diff --git a/evals/evaluators/criteria.ts b/evals/evaluators/criteria.ts new file mode 100644 index 0000000..1994eac --- /dev/null +++ b/evals/evaluators/criteria.ts @@ -0,0 +1,142 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js"; +import type { LlmProvider } from "../llm/types.js"; + +/** + * LLM-as-judge evaluator: asks an LLM to score the trajectory against + * the natural-language assertions in `expected.criteria`. + * + * Returns `'N/A'` when `expected.criteria` is absent or empty so datasets + * that rely only on structural evaluators don't incur extra LLM calls. 
+ * + * Usage: + * import { createCriteriaEvaluator } from "./criteria.js"; + * import { createDefaultLlmProvider } from "../llm/index.js"; + * + * runDataset(dataset, { + * criteria: createCriteriaEvaluator(createDefaultLlmProvider()), + * }); + * + * The factory pattern is necessary because the `Evaluator` type is a plain + * function — the LLM provider is closed over rather than passed as an arg. + */ +export function createCriteriaEvaluator(llm: LlmProvider): Evaluator { + return async ( + trajectory: Trajectory, + expected: ExpectedBehavior + ): Promise<EvaluatorResult> => { + if (!expected.criteria || expected.criteria.length === 0) { + return { score: "N/A" }; + } + + const prompt = buildJudgePrompt(trajectory, expected.criteria); + const response = await llm.chat([{ role: "user", content: prompt }], []); + const text = response.content ?? ""; + + return parseJudgeResponse(text); + }; +} + +/** + * Builds the rubric prompt sent to the judge LLM. + * + * Asks for a JSON object with `score` (0–1) and `reasoning` (string) so + * parsing is deterministic. The trajectory is serialised as an indented JSON + * array of `{tool, args}` pairs — `result` is omitted to avoid token bloat + * from large tool outputs. + */ +function buildJudgePrompt(trajectory: Trajectory, criteria: string[]): string { + const trajectoryStr = JSON.stringify( + trajectory.map(({ tool, args }) => ({ tool, args })), + null, + 2 + ); + + const criteriaList = criteria + .map((c, i) => `${i + 1}. ${c}`) + .join("\n"); + + return `You are an impartial evaluator assessing the quality of an AI assistant's tool-calling behaviour. 
+ +## Trajectory (tools the assistant called, in order) + +\`\`\`json +${trajectoryStr} +\`\`\` + +## Evaluation criteria + +${criteriaList} + +## Task + +Score how well the trajectory satisfies ALL of the criteria above on a scale from 0.0 to 1.0: +- 1.0 All criteria fully satisfied +- 0.75 Most criteria satisfied with minor gaps +- 0.5 About half the criteria satisfied +- 0.25 Most criteria unmet with only minor satisfaction +- 0.0 No criteria satisfied at all + +Respond with a single JSON object — no markdown fences, no extra text: +{"score": <number between 0.0 and 1.0>, "reasoning": "<one-sentence justification>"}`; +} + +/** + * Parses the judge LLM's response into an EvaluatorResult. + * + * Tries JSON.parse first. Falls back to a regex that extracts a bare number + * from the text in case the model wraps the response in prose. + */ +function parseJudgeResponse(text: string): EvaluatorResult { + const trimmed = text.trim(); + + // Primary: take the span from the first "{" to the last "}" (greedy match) + const jsonMatch = trimmed.match(/\{[\s\S]*\}/); + if (jsonMatch) { + try { + const parsed = JSON.parse(jsonMatch[0]) as unknown; + if ( + typeof parsed === "object" && + parsed !== null && + "score" in parsed && + typeof (parsed as Record<string, unknown>).score === "number" + ) { + const { score, reasoning } = parsed as { + score: number; + reasoning?: unknown; + }; + const clampedScore = Math.min(1, Math.max(0, score)); + return { + score: clampedScore, + reason: + typeof reasoning === "string" + ? 
reasoning + : `raw judge response: ${trimmed}`, + }; + } + } catch { + // fall through to regex fallback + } + } + + // Fallback: look for a bare decimal / integer in [0, 1] + const numMatch = trimmed.match(/\b(1(?:\.0+)?|0(?:\.\d+)?)\b/); + if (numMatch) { + const score = parseFloat(numMatch[1]); + return { + score, + reason: `score parsed from prose; raw response: ${trimmed.slice(0, 200)}`, + }; + } + + return { + score: 0, + reason: `judge response could not be parsed; raw response: ${trimmed.slice(0, 200)}`, + }; +} diff --git a/evals/evaluators/negative-activation.ts b/evals/evaluators/negative-activation.ts new file mode 100644 index 0000000..e08d315 --- /dev/null +++ b/evals/evaluators/negative-activation.ts @@ -0,0 +1,46 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Evaluator, EvaluatorResult, ExpectedBehavior, Trajectory } from "../types.js"; + +/** + * Binary evaluator for distractor examples: did the LLM correctly avoid + * calling the skill's entry-point tool? + * + * This is the complement of `skillActivation`. Use it on examples where the + * user query should NOT trigger the skill — e.g. a migration skill dataset + * includes unrelated queries (case management, threat hunting) to confirm the + * LLM doesn't call `migrate-rules` for everything. + * + * Score semantics (binary): + * 1 — skill tool absent from trajectory (correct — not distracted) + * 0 — skill tool present in trajectory (false positive — skill over-triggered) + * + * Returns `'N/A'` when `expected.skill` is absent, consistent with how + * `skillActivation` handles missing skill declarations. 
/**
 * Binary evaluator for distractor examples: checks that the skill's
 * entry-point tool named by `expected.skill` does NOT appear in the trajectory.
 *
 * Score semantics:
 *   1     — no call in the trajectory targets `expected.skill`
 *   0     — at least one call targets `expected.skill` (false positive)
 *   'N/A' — `expected.skill` is absent / empty
 */
export const negativeActivation: Evaluator = (
  trajectory: Trajectory,
  expected: ExpectedBehavior
): EvaluatorResult => {
  const skillTool = expected.skill;
  if (!skillTool) {
    return { score: "N/A" };
  }

  // Stop at the first offending call; one hit is enough to fail the example.
  for (const call of trajectory) {
    if (call.tool === skillTool) {
      return {
        score: 0,
        reason: `Tool "${skillTool}" was called but should not have been (false positive)`,
      };
    }
  }

  return {
    score: 1,
    reason: `Tool "${skillTool}" was correctly absent from the trajectory`,
  };
};
/**
 * Binary evaluator: did the trajectory include at least one call to the
 * skill's entry-point tool named by `expected.skill`?
 *
 * Score semantics:
 *   1     — the tool appears in the trajectory
 *   0     — the tool never appears; the reason lists the tools that were called
 *   'N/A' — `expected.skill` is absent / empty
 */
export const skillActivation: Evaluator = (
  trajectory: Trajectory,
  expected: ExpectedBehavior
): EvaluatorResult => {
  const wanted = expected.skill;
  if (!wanted) {
    return { score: "N/A" };
  }

  const calledNames = trajectory.map((call) => call.tool);
  if (calledNames.includes(wanted)) {
    return { score: 1, reason: `Tool "${wanted}" was called` };
  }

  // An empty join yields "", which is replaced by the literal word "empty".
  const listing = calledNames.join(", ") || "empty";
  return {
    score: 0,
    reason: `Tool "${wanted}" was never called (trajectory: [${listing}])`,
  };
};
/**
 * Set-based tool-selection evaluator.
 *
 * Both the called tools and `expected.tools` are deduplicated, then compared:
 *
 *   precision = |called ∩ expected| / |called|
 *   recall    = |called ∩ expected| / |expected|
 *   score     = F1 = 2·P·R / (P + R)
 *
 * Returns `'N/A'` when `expected.tools` is absent, and 1 when both sets are
 * empty. The reason string reports F1 / precision / recall and, when
 * non-empty, the missed and extra tool names.
 */
export const toolSelection: Evaluator = (
  trajectory: Trajectory,
  expected: ExpectedBehavior
): EvaluatorResult => {
  if (!expected.tools) {
    return { score: "N/A" };
  }

  const wanted = new Set(expected.tools);
  const called = new Set(trajectory.map((call) => call.tool));

  if (wanted.size === 0 && called.size === 0) {
    return { score: 1, reason: "No tools expected and none called" };
  }

  // One pass over the called tools yields both the hit count and the extras.
  let hits = 0;
  const extra = [];
  for (const name of called) {
    if (wanted.has(name)) {
      hits += 1;
    } else {
      extra.push(name);
    }
  }

  const missed = [];
  for (const name of wanted) {
    if (!called.has(name)) {
      missed.push(name);
    }
  }

  const precision = called.size === 0 ? 0 : hits / called.size;
  const recall = wanted.size === 0 ? 0 : hits / wanted.size;
  const f1 =
    precision + recall > 0
      ? (2 * precision * recall) / (precision + recall)
      : 0;

  let reason = `F1=${f1.toFixed(2)} (precision=${precision.toFixed(2)}, recall=${recall.toFixed(2)})`;
  if (missed.length > 0) {
    reason += ` | missed: [${missed.join(", ")}]`;
  }
  if (extra.length > 0) {
    reason += ` | extra: [${extra.join(", ")}]`;
  }

  return { score: f1, reason };
};
/**
 * Sequence-aware evaluator: compares the ordered tool calls in the trajectory
 * with `expected.tools`.
 *
 *   score = lcs(actual, expected) / max(|actual|, |expected|)
 *
 * Returns `'N/A'` when `expected.tools` is absent, and 1 when both sequences
 * are empty. When the score is below 1 the reason also prints both sequences.
 */
export const trajectoryScore: Evaluator = (
  trajectory: Trajectory,
  expected: ExpectedBehavior
): EvaluatorResult => {
  const wanted = expected.tools;
  if (!wanted) {
    return { score: "N/A" };
  }

  const actual = trajectory.map((call) => call.tool);
  if (actual.length === 0 && wanted.length === 0) {
    return { score: 1, reason: "Both actual and expected sequences are empty" };
  }

  const common = lcs(actual, wanted);
  // Normalising by the longer sequence penalises both missing and extra calls.
  const longest = Math.max(actual.length, wanted.length);
  const score = common / longest;

  let reason = `LCS=${common} / max(|actual|=${actual.length}, |expected|=${wanted.length})=${longest} → score=${score.toFixed(2)}`;
  if (score < 1) {
    reason += ` | actual=[${actual.join(", ")}] expected=[${wanted.join(", ")}]`;
  }

  return { score, reason };
};
/**
 * Length of the Longest Common Subsequence of two string sequences.
 *
 * Standard dynamic programme over prefixes of `a` and `b`, keeping only the
 * previous and current rows of the table. Elements are compared with `===`.
 */
function lcs(a: string[], b: string[]): number {
  const width = b.length + 1;
  let previous: number[] = new Array(width).fill(0);
  let current: number[] = new Array(width).fill(0);

  for (const left of a) {
    // Column 0 (empty prefix of b) is never written and therefore stays 0.
    for (let col = 1; col < width; col++) {
      current[col] =
        left === b[col - 1]
          ? previous[col - 1] + 1
          : Math.max(previous[col], current[col - 1]);
    }
    // The finished row becomes "previous"; the old row is reused as scratch.
    [previous, current] = [current, previous];
  }

  return previous[b.length];
}
/**
 * Builds a mock LLM that requests `toolName` exactly once — on the first
 * `chat()` call in which that tool is offered — and answers with the plain
 * text "Done." on every other call.
 */
function makeActivatingLlm(toolName: string): LlmProvider {
  let pendingActivations = 1;

  const chat: LlmProvider["chat"] = async (_messages, tools) => {
    if (pendingActivations === 0 || !tools.some((t) => t.name === toolName)) {
      return { role: "assistant", content: "Done." };
    }

    pendingActivations -= 1;
    return {
      role: "assistant",
      content: null,
      tool_calls: [
        {
          id: "call_mock_0",
          type: "function" as const,
          function: { name: toolName, arguments: "{}" },
        },
      ],
    };
  };

  return { chat };
}

/** Mock LLM that always replies with plain text and never requests a tool. */
const passiveLlm: LlmProvider = {
  chat: async (): Promise<AssistantMessage> => ({
    role: "assistant",
    content: "I can help with that directly without additional tools.",
  }),
};
describe("eval harness: systemPrompt propagation", () => {
  /**
   * Builds a provider that records a shallow copy of every `messages` array
   * passed to `chat()` and always answers with plain text, so each test can
   * assert exactly what the harness handed to the LLM.
   */
  function makeRecordingLlm(): {
    provider: LlmProvider;
    calls: LlmMessage[][];
  } {
    const calls: LlmMessage[][] = [];
    const provider: LlmProvider = {
      async chat(messages): Promise<AssistantMessage> {
        calls.push([...messages]);
        return { role: "assistant", content: "Done." };
      },
    };
    return { provider, calls };
  }

  it("prepends a system message when systemPrompt is provided", async () => {
    const { provider, calls } = makeRecordingLlm();
    await runMcpHostLoop("Find me my noisy rules", {
      server: createEvalServer(),
      llm: provider,
      systemPrompt: "You are a security analyst. Always call a tool before answering.",
    });

    expect(calls.length).toBeGreaterThanOrEqual(1);
    const firstTurn = calls[0]!;
    expect(firstTurn[0]).toEqual({
      role: "system",
      content: "You are a security analyst. Always call a tool before answering.",
    });
    expect(firstTurn[1]).toEqual({
      role: "user",
      content: "Find me my noisy rules",
    });
  });

  it("does not inject a system message when systemPrompt is omitted", async () => {
    const { provider, calls } = makeRecordingLlm();
    await runMcpHostLoop("Find me my noisy rules", {
      server: createEvalServer(),
      llm: provider,
    });

    expect(calls.length).toBeGreaterThanOrEqual(1);
    const firstTurn = calls[0]!;
    expect(firstTurn[0]?.role).toBe("user");
    expect(firstTurn.some((m) => m.role === "system")).toBe(false);
  });

  it("treats empty / whitespace-only systemPrompt as omitted", async () => {
    for (const prompt of ["", " ", "\n\t"]) {
      const { provider, calls } = makeRecordingLlm();
      await runMcpHostLoop("Find me my noisy rules", {
        server: createEvalServer(),
        llm: provider,
        systemPrompt: prompt,
      });

      // Same guard as the two tests above: without it, a harness that never
      // reaches chat() would fail with a TypeError on `calls[0]` instead of a
      // readable assertion message.
      expect(
        calls.length,
        `chat() was never called for systemPrompt ${JSON.stringify(prompt)}`
      ).toBeGreaterThanOrEqual(1);
      const firstTurn = calls[0]!;
      expect(
        firstTurn.some((m) => m.role === "system"),
        `empty-string systemPrompt (${JSON.stringify(prompt)}) should not inject a system message`
      ).toBe(false);
    }
  });
});
index 0000000..2d1412f --- /dev/null +++ b/evals/helpers/evalServer.ts @@ -0,0 +1,98 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { vi } from "vitest"; +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { registerAlertTriageTools } from "../../src/tools/alert-triage.js"; +import { registerAttackDiscoveryTools } from "../../src/tools/attack-discovery.js"; +import { registerCaseManagementTools } from "../../src/tools/case-management.js"; +import { registerDetectionRuleTools } from "../../src/tools/detection-rules.js"; +import { registerMigrationTools } from "../../src/tools/migration.js"; +import { registerSampleDataTools } from "../../src/tools/sample-data.js"; +import { registerThreatHuntTools } from "../../src/tools/threat-hunt.js"; +import type { AlertsService } from "../../src/elastic/service/alertsService.js"; +import type { AttackDiscoveryService } from "../../src/elastic/service/attackDiscoveryService.js"; +import type { CasesService } from "../../src/elastic/service/casesService.js"; +import type { EntityDetailService } from "../../src/elastic/service/entityDetailService.js"; +import type { EsqlService } from "../../src/elastic/service/esqlService.js"; +import type { IndicesService } from "../../src/elastic/service/indicesService.js"; +import type { InvestigateService } from "../../src/elastic/service/investigateService.js"; +import type { MigrationsService } from "../../src/elastic/service/migrationsService.js"; +import type { RulesService } from "../../src/elastic/service/rulesService.js"; +import type { SampleDataService } from "../../src/elastic/service/sampleDataService.js"; + +/** + * Stubs every service used by the seven tool groups registered on the live + * MCP server. 
Methods invoked by model-facing entry tools resolve to + * realistic-shaped empty payloads; other methods are bare `vi.fn()` because + * skill-routing evaluators only inspect which tools the LLM called, not + * what those tools returned. + * + * Mirrors `src/server.ts` exactly: the LLM that drives the eval host loop + * must see the same tool surface a real MCP host (Claude Desktop, Cursor) + * exposes — otherwise we measure skill-selection against an artificially + * narrow distractor set and over-state activation rates for small models. + */ +export function createEvalServer(): McpServer { + const server = new McpServer({ name: "eval-server", version: "0.0.0" }); + + const alertsService = { + searchAlerts: vi.fn().mockResolvedValue({ alerts: [], total: 0 }), + findAlertById: vi.fn().mockResolvedValue(null), + } as unknown as AlertsService; + + const attackDiscoveryService = { + listAttackDiscoveries: vi.fn().mockResolvedValue([]), + } as unknown as AttackDiscoveryService; + + const casesService = { + findCases: vi.fn().mockResolvedValue({ cases: [], total: 0 }), + } as unknown as CasesService; + + const entityDetailService = { + getEntityDetail: vi.fn().mockResolvedValue(null), + } as unknown as EntityDetailService; + + const esqlService = { + executeQuery: vi.fn().mockResolvedValue({ columns: [], values: [] }), + } as unknown as EsqlService; + + const indicesService = { + listIndices: vi.fn().mockResolvedValue([]), + } as unknown as IndicesService; + + const investigateService = { + getRelatedAlerts: vi.fn().mockResolvedValue([]), + } as unknown as InvestigateService; + + const migrationsService = { + listMigrations: vi.fn().mockResolvedValue([]), + } as unknown as MigrationsService; + + const rulesService = { + findRules: vi.fn().mockResolvedValue({ data: [], total: 0 }), + } as unknown as RulesService; + + const sampleDataService = { + listScenarios: vi.fn().mockResolvedValue([]), + } as unknown as SampleDataService; + + registerAlertTriageTools(server, { 
/**
 * LlmProvider adapter backed by the Anthropic SDK client.
 *
 * `chat()` accepts the harness's OpenAI-shaped message history, sends it via
 * `messages.create`, and converts the response's text / tool_use blocks back
 * into an OpenAI-shaped assistant message.
 */
export class AnthropicProvider implements LlmProvider {
  private readonly client: Anthropic;
  private readonly model: string;

  constructor(options: AnthropicProviderOptions = {}) {
    const { model = DEFAULT_MODEL, apiKey } = options;
    this.model = model;
    // Only pass apiKey when the caller supplied one; otherwise the SDK applies
    // its own default (see AnthropicProviderOptions.apiKey).
    this.client = new Anthropic(apiKey === undefined ? {} : { apiKey });
  }

  async chat(
    messages: LlmMessage[],
    tools: LlmToolDefinition[]
  ): Promise<AssistantMessage> {
    // System-roled messages go into the top-level `system` parameter (joined
    // with blank lines); everything else is converted as chat history.
    const systemParts: string[] = [];
    const history: Exclude<LlmMessage, { role: "system" }>[] = [];
    for (const message of messages) {
      if (message.role === "system") {
        systemParts.push(message.content);
      } else {
        history.push(message);
      }
    }
    const system = systemParts.join("\n\n");

    const response = await this.client.messages.create({
      model: this.model,
      max_tokens: MAX_TOKENS,
      ...(system.length > 0 ? { system } : {}),
      messages: toAnthropicMessages(history),
      ...(tools.length > 0 ? { tools: tools.map(toAnthropicTool) } : {}),
    });

    // Walk the response once: concatenate text blocks, collect tool_use blocks.
    let text = "";
    const toolCalls: NonNullable<AssistantMessage["tool_calls"]> = [];
    for (const block of response.content) {
      if (block.type === "text") {
        text += block.text;
      } else if (block.type === "tool_use") {
        toolCalls.push({
          id: block.id,
          type: "function" as const,
          function: {
            name: block.name,
            // The SDK returns parsed input; the harness shape carries
            // arguments as a JSON-encoded string.
            arguments: JSON.stringify(block.input),
          },
        });
      }
    }

    const reply: AssistantMessage = { role: "assistant", content: text || null };
    if (toolCalls.length > 0) {
      reply.tool_calls = toolCalls;
    }
    return reply;
  }
}
/**
 * Converts OpenAI-style (non-system) LlmMessage[] to Anthropic MessageParam[].
 *
 * - `user` messages pass through with string content.
 * - `assistant` messages become an array of text / tool_use blocks. Tool-call
 *   arguments are JSON-decoded; anything that is not a JSON object (invalid
 *   JSON, `null`, arrays, primitives) is replaced by `{}` so the emitted
 *   `tool_use.input` is always an object.
 * - `tool` messages become `tool_result` blocks inside a `user` message;
 *   consecutive tool results are merged into one user message so two adjacent
 *   user turns are never produced for them.
 *
 * @param messages History with system messages already removed by the caller.
 */
function toAnthropicMessages(
  messages: Exclude<LlmMessage, { role: "system" }>[]
): Anthropic.MessageParam[] {
  const result: Anthropic.MessageParam[] = [];

  for (const msg of messages) {
    if (msg.role === "user") {
      result.push({ role: "user", content: msg.content });
    } else if (msg.role === "assistant") {
      const content: Anthropic.ContentBlockParam[] = [];
      if (msg.content) {
        content.push({ type: "text", text: msg.content });
      }
      for (const tc of msg.tool_calls ?? []) {
        content.push({
          type: "tool_use",
          id: tc.id,
          name: tc.function.name,
          input: parseToolInput(tc.function.arguments),
        });
      }
      result.push({ role: "assistant", content });
    } else {
      // msg.role === "tool"
      const block: Anthropic.ToolResultBlockParam = {
        type: "tool_result",
        tool_use_id: msg.tool_call_id,
        content: msg.content,
      };

      // Merge into the preceding user message when it already holds
      // tool_result blocks.
      const prev = result[result.length - 1];
      if (
        prev?.role === "user" &&
        Array.isArray(prev.content) &&
        (prev.content as Anthropic.ContentBlockParam[])[0]?.type ===
          "tool_result"
      ) {
        (prev.content as Anthropic.ContentBlockParam[]).push(block);
      } else {
        result.push({ role: "user", content: [block] });
      }
    }
  }

  return result;
}

/**
 * Decodes a JSON-encoded tool-argument string into a plain object.
 * Returns `{}` when the string is not valid JSON or decodes to anything other
 * than a non-null, non-array object.
 */
function parseToolInput(raw: string): Record<string, unknown> {
  try {
    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
      return parsed as Record<string, unknown>;
    }
  } catch {
    // Invalid JSON — fall through to the empty-object default.
  }
  return {};
}

/** Maps a harness tool definition onto the Anthropic tool shape. */
function toAnthropicTool(tool: LlmToolDefinition): Anthropic.Tool {
  return {
    name: tool.name,
    description: tool.description,
    input_schema: tool.parameters as Anthropic.Tool.InputSchema,
  };
}
/**
 * Returns the default LLM provider by inspecting environment variables.
 *
 * Priority order:
 *   1. ANTHROPIC_API_KEY → AnthropicProvider with its default model
 *   2. OPENAI_API_KEY    → OpenAiProvider, optionally customised by
 *      OPENAI_MODEL (chat model) and LITELLM_BASE_URL (API base URL)
 *
 * Empty-string values are treated as unset. This matters in CI: a workflow
 * that maps an undefined secret into `env:` yields an empty string, and
 * passing `""` through would override OpenAiProvider's default model or hand
 * the client an empty base URL.
 *
 * @throws Error when neither ANTHROPIC_API_KEY nor OPENAI_API_KEY is set.
 */
export function createDefaultLlmProvider(): LlmProvider {
  if (process.env.ANTHROPIC_API_KEY) {
    return new AnthropicProvider();
  }
  if (process.env.OPENAI_API_KEY) {
    return new OpenAiProvider({
      // `||` (not `??`) is deliberate: "" must collapse to undefined so the
      // provider's own defaults apply.
      model: process.env.OPENAI_MODEL?.trim() || undefined,
      baseURL: process.env.LITELLM_BASE_URL?.trim() || undefined,
    });
  }
  throw new Error(
    "No LLM provider configured. Set ANTHROPIC_API_KEY or OPENAI_API_KEY " +
      "before running evals (npm run test:evals)."
  );
}
/**
 * LlmProvider adapter backed by the OpenAI SDK chat-completions client.
 * The optional `baseURL` lets callers point the client at a compatible proxy.
 */
export class OpenAiProvider implements LlmProvider {
  private readonly client: OpenAI;
  private readonly model: string;

  constructor(options: OpenAiProviderOptions = {}) {
    const { model = DEFAULT_MODEL, baseURL, apiKey } = options;
    this.model = model;
    // Forward only the settings the caller actually supplied.
    this.client = new OpenAI({
      ...(apiKey === undefined ? {} : { apiKey }),
      ...(baseURL === undefined ? {} : { baseURL }),
    });
  }

  async chat(
    messages: LlmMessage[],
    tools: LlmToolDefinition[]
  ): Promise<AssistantMessage> {
    const response = await this.client.chat.completions.create({
      model: this.model,
      messages: messages.map(toOaiMessage),
      ...(tools.length > 0 ? { tools: tools.map(toOaiTool) } : {}),
    });

    const [firstChoice] = response.choices;
    if (!firstChoice) {
      throw new Error("OpenAI returned no choices");
    }

    const { content, tool_calls: rawCalls } = firstChoice.message;
    const reply: AssistantMessage = {
      role: "assistant",
      content: content ?? null,
    };
    if (rawCalls) {
      // Keep only function-type tool calls and copy the fields the harness uses.
      reply.tool_calls = rawCalls
        .filter(
          (tc): tc is OpenAI.ChatCompletionMessageFunctionToolCall =>
            tc.type === "function"
        )
        .map((tc) => copyFunctionCall(tc));
    }
    return reply;
  }
}

/** Copies the id / name / arguments of a function tool call into a fresh object. */
function copyFunctionCall(call: {
  id: string;
  function: { name: string; arguments: string };
}) {
  return {
    id: call.id,
    type: "function" as const,
    function: {
      name: call.function.name,
      arguments: call.function.arguments,
    },
  };
}

/** Converts one harness message into the OpenAI chat-completions message shape. */
function toOaiMessage(msg: LlmMessage): OpenAI.ChatCompletionMessageParam {
  switch (msg.role) {
    case "tool":
      return {
        role: "tool",
        content: msg.content,
        tool_call_id: msg.tool_call_id,
      };
    case "assistant": {
      const calls = msg.tool_calls;
      return {
        role: "assistant",
        content: msg.content,
        ...(calls ? { tool_calls: calls.map((tc) => copyFunctionCall(tc)) } : {}),
      };
    }
    case "user":
      return { role: "user", content: msg.content };
    case "system":
      return { role: "system", content: msg.content };
  }
}

/** Wraps a harness tool definition as an OpenAI function tool. */
function toOaiTool(tool: LlmToolDefinition): OpenAI.ChatCompletionTool {
  const { name, description, parameters } = tool;
  return {
    type: "function",
    function: { name, description, parameters },
  };
}
The adapter extracts `system`-roled messages from + * the union and passes them via that parameter — this discriminant only + * dictates the SHAPE the harness uses internally. + */ +export type LlmMessage = + | { role: "system"; content: string } + | { role: "user"; content: string } + | { + role: "assistant"; + content: string | null; + tool_calls?: LlmToolCallRequest[]; + } + | { role: "tool"; content: string; tool_call_id: string }; + +/** Narrowed assistant message — what LlmProvider.chat() must return. */ +export type AssistantMessage = Extract; + +/** + * Minimal provider contract every LLM adapter must satisfy. + * The interface is intentionally thin: give it a message history + tool + * catalogue, get back the next assistant turn (possibly with tool calls). + */ +export interface LlmProvider { + chat( + messages: LlmMessage[], + tools: LlmToolDefinition[] + ): Promise; +} diff --git a/evals/runMcpHostLoop.ts b/evals/runMcpHostLoop.ts new file mode 100644 index 0000000..b750f2f --- /dev/null +++ b/evals/runMcpHostLoop.ts @@ -0,0 +1,183 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js"; +import { Client } from "@modelcontextprotocol/sdk/client/index.js"; +import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { createServer } from "../src/server.js"; +import type { Trajectory, ToolCall } from "./types.js"; +import type { LlmProvider, LlmMessage } from "./llm/types.js"; +import { createDefaultLlmProvider } from "./llm/index.js"; + +/** Maximum LLM → tool-call turns before halting to prevent runaway evals. */ +const MAX_TURNS = 8; + +/** + * Returns true when an MCP tool should be exposed to the LLM. 
+ * + * Mirrors the MCP host visibility contract — tools marked + * `_meta.ui.visibility: ["app"]` (without `"model"`) are invoked exclusively + * by an MCP app via `app.callServerTool()`. Real hosts (Claude Desktop, + * Cursor) hide those from the LLM; the eval harness must do the same to + * match what the model actually sees in production. + */ +function isVisibleToModel(tool: { _meta?: unknown }): boolean { + const meta = tool._meta as + | { ui?: { visibility?: readonly string[] } } + | undefined; + const visibility = meta?.ui?.visibility; + if (!visibility || visibility.length === 0) return true; + if (visibility.includes("model")) return true; + return !visibility.includes("app"); +} + +export interface HostLoopOptions { + /** + * Pre-built MCP server to test against. + * + * Pass a server constructed with mocked services for dataset-level evals + * that don't need a live cluster. Omit to use `createServer()`, which reads + * CLUSTERS_JSON / CLUSTERS_FILE and requires a real Elastic cluster. + * + * Each call to `runMcpHostLoop` should receive a **fresh** server instance; + * reusing a connected server across calls is not supported. + */ + server?: McpServer; + /** + * LLM provider used to simulate the MCP host making tool-call decisions. + * Defaults to auto-selecting from ANTHROPIC_API_KEY / OPENAI_API_KEY. + */ + llm?: LlmProvider; + /** + * Maximum number of LLM→tool-call turns per run. + * Defaults to MAX_TURNS (8). + */ + maxTurns?: number; + /** + * Optional system prompt prepended to the message history. + * + * Real MCP hosts (Claude Desktop, Cursor) inject a host-level system prompt + * that constrains tool selection, response shape, and confirmation flow. + * Without one, the harness measures raw model-vs-tools behavior, which can + * over- or under-report activation depending on the model family. Use this + * to pin behavior to what the production host will instruct, or to swap in + * a SKILL.md body when testing skill-driven flows. 
+ * + * Pass a non-empty string. Empty strings are ignored to keep behavior + * identical to omitting the option. + */ + systemPrompt?: string; +} + +/** + * Simulates one MCP host loop run entirely in-process. + * + * Architecture: + * LLM ↔ Client ↔─InMemoryTransport─↔ McpServer ↔ (ES / Kibana clients) + * + * The function: + * 1. Wires a fresh Client to the server via InMemoryTransport. + * 2. Lists available MCP tools and hands them to the LLM as tool definitions. + * 3. Loops up to `maxTurns` times: + * a. Asks the LLM for the next assistant turn. + * b. If the LLM emits tool calls, executes each via client.callTool(). + * c. Records every call in the trajectory. + * d. Feeds results back into the message history. + * e. Breaks when the LLM emits no tool calls (task complete). + * 4. Closes the client and returns the trajectory. + */ +export async function runMcpHostLoop( + input: string, + { + server, + llm, + maxTurns = MAX_TURNS, + systemPrompt, + }: HostLoopOptions = {} +): Promise { + const resolvedServer = server ?? createServer(); + const resolvedLlm = llm ?? createDefaultLlmProvider(); + + const [clientTransport, serverTransport] = InMemoryTransport.createLinkedPair(); + await resolvedServer.connect(serverTransport); + + const client = new Client({ name: "eval-host", version: "1.0.0" }); + await client.connect(clientTransport); + + try { + const { tools: mcpTools } = await client.listTools(); + // Strip app-only tools — they're invoked by the React workbench via + // `app.callServerTool()` and a real MCP host (Claude Desktop, Cursor) + // hides them from the LLM by inspecting `_meta.ui.visibility`. Without + // this filter the model sees `find-rules`, `start-translation`, + // `install-rules`, etc. as alternatives to the model-facing entry + // points and the activation rate collapses on smaller models. + const toolDefs = mcpTools.filter(isVisibleToModel).map((t) => ({ + name: t.name, + description: t.description ?? 
"", + parameters: t.inputSchema as Record, + })); + + const messages: LlmMessage[] = []; + if (systemPrompt && systemPrompt.trim().length > 0) { + messages.push({ role: "system", content: systemPrompt }); + } + messages.push({ role: "user", content: input }); + const trajectory: Trajectory = []; + + for (let turn = 0; turn < maxTurns; turn++) { + const response = await resolvedLlm.chat(messages, toolDefs); + messages.push(response); + + if (!response.tool_calls || response.tool_calls.length === 0) { + // LLM chose not to call a tool — simulation complete. + break; + } + + for (const toolCall of response.tool_calls) { + const toolName = toolCall.function.name; + let toolArgs: Record; + try { + toolArgs = JSON.parse(toolCall.function.arguments) as Record< + string, + unknown + >; + } catch { + // Malformed JSON from the LLM; record the call with empty args + // so the trajectory evaluator can detect the failure. + toolArgs = {}; + } + + const result = await client.callTool({ + name: toolName, + arguments: toolArgs, + }); + + const record: ToolCall = { + tool: toolName, + args: toolArgs, + result: result.content, + }; + trajectory.push(record); + + // Feed the tool result back so the LLM can reason about it. + messages.push({ + role: "tool", + content: JSON.stringify(result.content), + tool_call_id: toolCall.id, + }); + } + } + + return trajectory; + } finally { + // Closing the client also closes clientTransport, which triggers + // serverTransport.onclose() — the InMemoryTransport linked pair + // tears down cleanly without needing an explicit server.close(). + await client.close(); + } +} diff --git a/evals/runner.ts b/evals/runner.ts new file mode 100644 index 0000000..f92a338 --- /dev/null +++ b/evals/runner.ts @@ -0,0 +1,128 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { describe, it, expect, afterAll } from "vitest"; +import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import type { Dataset, EvalResult, EvaluatorResult, Evaluator } from "./types.js"; +import { runMcpHostLoop } from "./runMcpHostLoop.js"; + +export interface RunnerOptions { + /** Minimum numeric score [0–1] for a test to pass. Defaults to 0.5. */ + passingScore?: number; + /** + * Factory that produces a fresh McpServer for each example. + * + * A fresh instance is required per-run because InMemoryTransport is torn + * down after each `runMcpHostLoop` call. When omitted, `runMcpHostLoop` + * falls back to `createServer()`, which requires `CLUSTERS_JSON`. + * + * Pass `createEvalServer` from `evals/helpers/evalServer.ts` to run eval + * suites without a live Elastic cluster (only API keys are needed). + */ + createServer?: () => McpServer; +} + +/** + * Registers a Vitest suite for every example in `dataset`. + * + * The entire suite is skipped unless `RUN_LLM_EVALS=1` is set in the + * environment, so regular `npm test` incurs zero LLM cost. + * + * Each example becomes one `it` that: + * 1. Runs the in-process MCP host loop to collect a trajectory. + * 2. Passes the trajectory to every evaluator. + * 3. Asserts that numeric scores meet `passingScore`. + * + * After all examples complete, a Markdown summary is written to stdout so + * the GitHub Actions job summary (>> $GITHUB_STEP_SUMMARY) can capture it. 
+ */ +export function runDataset( + dataset: Dataset, + evaluators: Record, + options: RunnerOptions = {} +): void { + const { passingScore = 0.5, createServer } = options; + + const hasLlmProvider = + !!process.env.ANTHROPIC_API_KEY || !!process.env.OPENAI_API_KEY; + describe.skipIf(!process.env.RUN_LLM_EVALS || !hasLlmProvider)(dataset.name, () => { + const results: EvalResult[] = []; + + for (const example of dataset.examples) { + it(example.id, async () => { + const trajectory = await runMcpHostLoop(example.input, { + server: createServer?.(), + }); + + const evalResults: Record = {}; + for (const [name, evaluator] of Object.entries(evaluators)) { + evalResults[name] = await evaluator(trajectory, example.expected); + } + + const result: EvalResult = { + exampleId: example.id, + input: example.input, + trajectory, + evaluators: evalResults, + }; + results.push(result); + + for (const [name, evalResult] of Object.entries(evalResults)) { + if (evalResult.score !== "N/A") { + expect( + evalResult.score, + `[${name}] score ${evalResult.score.toFixed(2)} < ${passingScore}` + + (evalResult.reason ? 
`: ${evalResult.reason}` : "") + ).toBeGreaterThanOrEqual(passingScore); + } + } + }); + } + + afterAll(() => { + process.stdout.write(buildMarkdownSummary(dataset.name, results) + "\n"); + }); + }); +} + +function buildMarkdownSummary(datasetName: string, results: EvalResult[]): string { + if (results.length === 0) { + return `## Eval results: ${datasetName}\n\n_No examples ran._\n`; + } + + const evaluatorNames = Array.from( + new Set(results.flatMap((r) => Object.keys(r.evaluators))) + ); + + const headers = ["id", "input", ...evaluatorNames]; + const separator = headers.map(() => "---"); + + const rows = results.map((r) => { + const scoreCells = evaluatorNames.map((name) => { + const e = r.evaluators[name]; + if (!e) return "—"; + if (e.score === "N/A") return "N/A"; + return `${(e.score * 100).toFixed(0)}%`; + }); + return [r.exampleId, truncate(r.input, 60), ...scoreCells]; + }); + + const lines = [ + `## Eval results: ${datasetName}`, + "", + `| ${headers.join(" | ")} |`, + `| ${separator.join(" | ")} |`, + ...rows.map((row) => `| ${row.join(" | ")} |`), + "", + ]; + + return lines.join("\n"); +} + +function truncate(s: string, maxLen: number): string { + return s.length <= maxLen ? s : `${s.slice(0, maxLen - 1)}…`; +} diff --git a/evals/types.ts b/evals/types.ts new file mode 100644 index 0000000..4722075 --- /dev/null +++ b/evals/types.ts @@ -0,0 +1,75 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** A single tool invocation captured during an MCP host loop run. */ +export interface ToolCall { + tool: string; + args: Record; + result?: unknown; +} + +/** Ordered sequence of tool calls produced by one eval run. */ +export type Trajectory = ToolCall[]; + +/** + * What a passing run should look like. 
+ * `tools` and `criteria` are both optional — evaluators that depend on them + * return `'N/A'` when the field is absent, so a dataset can omit whichever + * dimension is irrelevant for a given example. + */ +export interface ExpectedBehavior { + /** Ordered list of tool names the host should call. Used by trajectory / tool-selection evaluators. */ + tools?: string[]; + /** Natural-language assertions checked by the criteria (LLM-as-judge) evaluator. */ + criteria?: string[]; + /** Skill ID that should be activated. Used by the skill-activation evaluator. */ + skill?: string; +} + +/** One test case inside a dataset. */ +export interface Example { + /** Stable identifier — used as a key in result tables and CI summaries. */ + id: string; + /** The user message sent to the LLM host at the start of the simulation. */ + input: string; + expected: ExpectedBehavior; +} + +/** A named collection of examples that can be loaded by the runner. */ +export interface Dataset { + name: string; + examples: Example[]; +} + +/** + * Output of a single evaluator for one example. + * `score` is a value in [0, 1] when the evaluator ran, or `'N/A'` when the + * evaluator skipped (e.g. `expected.tools` was absent for trajectory evaluator). + */ +export interface EvaluatorResult { + score: number | 'N/A'; + /** Human-readable explanation of the score, required when score is numeric. */ + reason?: string; +} + +/** Aggregate result for one example after all evaluators have run. */ +export interface EvalResult { + exampleId: string; + input: string; + trajectory: Trajectory; + /** Keys are evaluator names (e.g. `'skill-activation'`, `'trajectory'`). */ + evaluators: Record; +} + +/** + * Contract every evaluator module must satisfy. + * Async to accommodate LLM-as-judge evaluators that call an LLM provider. 
+ */ +export type Evaluator = ( + trajectory: Trajectory, + expected: ExpectedBehavior +) => EvaluatorResult | Promise; diff --git a/evals/vitest.config.ts b/evals/vitest.config.ts new file mode 100644 index 0000000..9b363f4 --- /dev/null +++ b/evals/vitest.config.ts @@ -0,0 +1,24 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { defineConfig } from "vitest/config"; + +/** + * Vitest config for LLM eval suites. Intentionally separate from the main + * vitest.config.ts so `npm test` never picks up eval files (and thus never + * makes LLM calls or requires API keys in a regular dev/CI run). + * + * Run via: npm run test:evals + */ +export default defineConfig({ + test: { + environment: "node", + globals: true, + include: ["evals/**/*.{test,spec,eval}.ts"], + testTimeout: 120_000, + }, +}); diff --git a/manifest.json b/manifest.json index b0694a4..a27d689 100644 --- a/manifest.json +++ b/manifest.json @@ -2,7 +2,7 @@ "manifest_version": "0.3", "name": "elastic-security-mcp-app", "display_name": "Elastic Security", - "version": "1.0.2", + "version": "1.1.0", "description": "Interactive blue-team security operations for Elastic Security — alert triage, attack discovery, case management, detection rules, threat hunting, and sample data generation.", "long_description": "An MCP App server that brings interactive blue-team security operations directly into Claude Desktop. 
Provides six rich React-based UIs that render inline in the conversation: alert triage with AI verdicts, AI-powered attack discovery with confidence scoring and MITRE mapping, case management with the Kibana Cases API, detection rule browsing and tuning, an ES|QL threat-hunting workbench with a D3 investigation graph, and an ECS sample-data generator for demos.", "author": { @@ -57,6 +57,10 @@ { "name": "generate-sample-data", "description": "Generate ECS-compliant security events for demos" + }, + { + "name": "migrate-rules", + "description": "Migrate detection rules from Splunk (and other SIEMs) to Elastic Security" } ], "tools_generated": true, diff --git a/package-lock.json b/package-lock.json index 08e9dde..156fc31 100644 --- a/package-lock.json +++ b/package-lock.json @@ -33,6 +33,7 @@ "elastic-security-mcp-app": "dist/main.js" }, "devDependencies": { + "@anthropic-ai/sdk": "^0.96.0", "@tailwindcss/vite": "^4.2.2", "@testing-library/jest-dom": "^6.9.1", "@testing-library/react": "^16.3.2", @@ -54,6 +55,7 @@ "husky": "^9.1.7", "jsdom": "^29.1.1", "lint-staged": "^16.4.0", + "openai": "^6.37.0", "tailwindcss": "^4.2.2", "tsx": "^4.21.0", "typescript": "^6.0.2", @@ -73,6 +75,28 @@ "dev": true, "license": "MIT" }, + "node_modules/@anthropic-ai/sdk": { + "version": "0.96.0", + "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.96.0.tgz", + "integrity": "sha512-KlCsODtTyb17bLUVCSDC2HtSvAbJf60sEiPEax9dInF+aDF92vS4TZJ5XD7YCQXNb1/5icYaw8Y7wMjPlIV9Zg==", + "dev": true, + "license": "MIT", + "dependencies": { + "json-schema-to-ts": "^3.1.1", + "standardwebhooks": "^1.0.0" + }, + "bin": { + "anthropic-ai-sdk": "bin/cli" + }, + "peerDependencies": { + "zod": "^3.25.0 || ^4.0.0" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, "node_modules/@asamuzakjp/css-color": { "version": "5.1.11", "resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-5.1.11.tgz", @@ -1859,6 +1883,13 @@ ], "peer": true }, + 
"node_modules/@stablelib/base64": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@stablelib/base64/-/base64-1.0.1.tgz", + "integrity": "sha512-1bnPQqSxSuc3Ii6MhBysoWCg58j97aUjuCSZrGSmDxNqtytIi0k8utUenAwTZN4V5mXXYGsVUI9zeBqy+jBOSQ==", + "dev": true, + "license": "MIT" + }, "node_modules/@standard-schema/spec": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz", @@ -4239,6 +4270,13 @@ "dev": true, "license": "MIT" }, + "node_modules/fast-sha256": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/fast-sha256/-/fast-sha256-1.3.0.tgz", + "integrity": "sha512-n11RGP/lrWEFI/bWdygLxhI+pVeo1ZYIVwvvPkW7azl/rOy+F3HYRZ2K5zeE9mmkhQppyv9sQFx0JM9UabnpPQ==", + "dev": true, + "license": "Unlicense" + }, "node_modules/fast-uri": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz", @@ -4944,6 +4982,20 @@ "dev": true, "license": "MIT" }, + "node_modules/json-schema-to-ts": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/json-schema-to-ts/-/json-schema-to-ts-3.1.1.tgz", + "integrity": "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/runtime": "^7.18.3", + "ts-algebra": "^2.0.0" + }, + "engines": { + "node": ">=16" + } + }, "node_modules/json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", @@ -5750,14 +5802,6 @@ "node": ">= 18" } }, - "node_modules/monaco-promql": { - "version": "1.8.0", - "resolved": "https://registry.npmjs.org/monaco-promql/-/monaco-promql-1.8.0.tgz", - "integrity": "sha512-XdgRojBzEe/rKtrJaHbSfoMFOMD5TXymDHIitTngmBT6XEjtAirnA7Rb2YJAO1SZrJfgvAo4LFCzJ71fH7+WOw==", - "license": "MIT", - "optional": true, - "peer": true - }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", @@ 
-5868,6 +5912,28 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/openai": { + "version": "6.37.0", + "resolved": "https://registry.npmjs.org/openai/-/openai-6.37.0.tgz", + "integrity": "sha512-0H5dEGFmmLv6KSd0W1w2nyL8WsLkX6yoLeQpU+dZAOuGcany5qkYQMmj35ZrKgb6yiyYqpUzFOpR8mZQkgqeEQ==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.25 || ^4.0" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, "node_modules/optionator": { "version": "0.9.4", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", @@ -6632,6 +6698,17 @@ "dev": true, "license": "MIT" }, + "node_modules/standardwebhooks": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/standardwebhooks/-/standardwebhooks-1.0.0.tgz", + "integrity": "sha512-BbHGOQK9olHPMvQNHWul6MYlrRTAOKn03rOe4A8O3CLWhNf4YHBqq2HJKKC+sfqpxiBY52pNeesD6jIiLDz8jg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@stablelib/base64": "^1.0.0", + "fast-sha256": "^1.3.0" + } + }, "node_modules/state-local": { "version": "1.0.7", "resolved": "https://registry.npmjs.org/state-local/-/state-local-1.0.7.tgz", @@ -6871,6 +6948,13 @@ "tree-kill": "cli.js" } }, + "node_modules/ts-algebra": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz", + "integrity": "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==", + "dev": true, + "license": "MIT" + }, "node_modules/ts-api-utils": { "version": "2.5.0", "resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.5.0.tgz", diff --git a/package.json b/package.json index 983e3ca..2308b39 100644 --- a/package.json +++ b/package.json @@ -47,6 +47,7 @@ "test": "vitest", "test:run": "vitest run", "test:coverage": "vitest run --coverage", + "test:evals": "cross-env RUN_LLM_EVALS=1 vitest run 
--config evals/vitest.config.ts --reporter=verbose", "prepublishOnly": "npm run build", "prepare": "husky", "version": "node -e \"const m=JSON.parse(require('fs').readFileSync('manifest.json','utf8'));m.version=require('./package.json').version;require('fs').writeFileSync('manifest.json',JSON.stringify(m,null,2)+'\\n')\" && git add manifest.json" @@ -81,6 +82,7 @@ "react-dom": "^19.2.4" }, "devDependencies": { + "@anthropic-ai/sdk": "^0.96.0", "@tailwindcss/vite": "^4.2.2", "@testing-library/jest-dom": "^6.9.1", "@testing-library/react": "^16.3.2", @@ -102,6 +104,7 @@ "husky": "^9.1.7", "jsdom": "^29.1.1", "lint-staged": "^16.4.0", + "openai": "^6.37.0", "tailwindcss": "^4.2.2", "tsx": "^4.21.0", "typescript": "^6.0.2", diff --git a/skills/automatic-migration/SKILL.md b/skills/automatic-migration/SKILL.md new file mode 100644 index 0000000..ce51d85 --- /dev/null +++ b/skills/automatic-migration/SKILL.md @@ -0,0 +1,101 @@ +--- +name: automatic-migration +description: > + Migrate detection rules from Splunk (or other SIEMs) to Elastic Security. Use for + "migrate my Splunk rules", "import SPL", "onboard from Splunk", "SIEM migration", + "convert detection rules", "translate SPL to EQL", or any request to move security + rules from a third-party platform into Elastic. Vendor support: Splunk (active), + QRadar / Sentinel-One (coming soon). +--- + +# Automatic Migration + +Migrate third-party SIEM detection rules into Elastic Security using the `elastic-security` +MCP connector. Call `migrate-rules` ONCE — it opens an interactive workbench that guides +the SOC engineer through every stage of the migration. Do NOT attempt to drive the process +step-by-step through prose or individual tool calls; the workbench handles all state +transitions internally. + +## Tools + +| Tool | Caller | Purpose | +|------|--------|---------| +| `migrate-rules` | Model | **Entry point.** Opens the interactive migration workbench. No parameters required. 
| +| `list-migrations` | Workbench | List all existing SIEM migrations | +| `get-migration` | Workbench | Get status and rule counts for a specific migration | +| `get-translated-rules` | Workbench | Fetch translated rules (paginated, filterable) | +| `start-translation` | Workbench | Trigger AI translation of uploaded rules | +| `stop-translation` | Workbench | Cancel an in-progress translation | +| `update-translated-rule` | Workbench | Save manual edits to a translated rule | +| `get-resources` | Workbench | List macro/lookup resources referenced by translated rules | +| `upsert-resource` | Workbench | Create or update a macro or lookup definition | +| `install-rules` | Workbench | Install translated rules into Elastic Security (installed as disabled) | +| `get-stats` | Workbench | Get translation progress counts for a migration | + +Only `migrate-rules` is model-facing. All other tools are called by the workbench via its +back-channel. Do not call them directly in conversation. + +## Workbench Lifecycle + +| Stage | What the user does | Completion signal | +|-------|--------------------|-------------------| +| **vendor-select** | Picks the source SIEM (Splunk active; QRadar / Sentinel-One coming soon) | Vendor button clicked | +| **upload** | Drops a JSON export file, uses the file picker, or pastes a rules array | "Upload & start translation" clicked | +| **translating** | Waits while the AI translator processes rules; live progress bar | Migration status reaches `finished` or `error` | +| **review** | Reviews each rule's three-column diff (original SPL / generated / editable) | "Install N rules" clicked | +| **fix-rule-drawer** | Edits key fields of a single rule (name, query, language, severity, risk score) via structured form; "Re-validate" marks it `partial`, "Save" uses the selected result | Drawer closed | +| **fix-resources-drawer** | Provides definitions for unresolved macros and lookups; each row has an individual Save button calling `upsert-resource` 
| "Done" in the drawer | +| **install** | Confirms installation of all translatable rules; "Back to review" is available | "Confirm install" clicked | +| **done** | Views the installed / failed summary | — | + +## Correction Strategy + +If the user wants to revisit or undo a step: + +- **Start over at any step**: the "Start over" button in the header resets to vendor-select. +- **Back from install confirmation**: click "Back to review" to return without installing. +- **Re-edit a specific rule**: re-open the rule drawer from the review list and save again; + each save calls `update-translated-rule` and refreshes the list in-place. +- **Re-edit a resource**: re-open the resources drawer; each per-row "Save" calls + `upsert-resource` and re-fetches the resources list without closing the drawer. +- **Restart translation**: use "Start over", re-upload the rules, then re-trigger translation. + +The workbench never permanently deletes data. Translation results and rule edits are persisted +in Kibana; re-opening the workbench via `migrate-rules` will show all prior migrations. + +## Common Gotchas + +**Vendor not supported.** QRadar and Sentinel-One show as "Coming soon" — their vendor-select +buttons are disabled. If the user asks to migrate from a non-Splunk platform, explain that +only Splunk is currently supported and suggest they check the Elastic roadmap for updates. + +**Calling app-only tools directly.** Do not call `start-translation`, `get-translated-rules`, +`install-rules`, or any other app-only tool manually. They are wired to the workbench +back-channel and will return raw JSON with no useful context in a prose conversation. Always +call `migrate-rules` once and let the workbench drive everything else. + +**Upload format.** The upload step expects a JSON array of Splunk rule objects as exported from +the Splunk Enterprise Security Rules page. Each object must include a `search` field containing +the raw SPL query. 
Other formats (YAML, CSV, Splunk `.conf` files) are not supported and will +fail silently. + +**Partial translations.** Rules marked `partial` were AI-translated but may need tuning before +they match the customer's data. They can be installed, but Elastic Security will show them as +disabled; the SOC engineer should review and enable them manually. Rules marked `untranslatable` +are skipped during installation entirely. + +**Macro and lookup references.** Splunk rules that reference custom macros or lookups will +translate with placeholder references. The fix-resources-drawer lists all detected unresolved +references and auto-expands them. Fill in each definition before installing — installed rules +that reference undefined macros will not fire correctly. + +**Large rule sets.** Translation is asynchronous. For large exports (hundreds of rules), the +translating stage may run for several minutes. The progress bar polls every 3 seconds +automatically. Do not suggest calling `stop-translation` unless the user explicitly wants to +cancel and discard in-progress results. + +**Re-opening an existing migration.** Calling `migrate-rules` when one or more migrations +already exist will show them in the response JSON. The workbench starts at vendor-select each +time — there is no "resume" flow yet. To continue working on an existing migration, the user +must navigate through the workbench stages again; prior translations are preserved on the +server and will reappear in the review step after re-triggering translation. 
diff --git a/src/elastic/service/index.ts b/src/elastic/service/index.ts index 38671ee..3c6e574 100644 --- a/src/elastic/service/index.ts +++ b/src/elastic/service/index.ts @@ -19,3 +19,18 @@ export type { ScenarioRuleDef, } from "./sampleDataService.js"; export { SampleDataService, SCENARIO_NAMES, SCENARIO_RULES } from "./sampleDataService.js"; +export type { + SiemMigration, + TranslatedRule, + MigrationResource, + MigrationStats, + ListTranslatedRulesOptions, + ListTranslatedRulesResult, + InstallRulesOptions, + InstallRulesResult, +} from "./migrationsService.js"; +export { + MigrationApiError, + MigrationsService, + SIEM_MIGRATIONS_API_BASE, +} from "./migrationsService.js"; diff --git a/src/elastic/service/migrationsService.test.ts b/src/elastic/service/migrationsService.test.ts new file mode 100644 index 0000000..0c184e7 --- /dev/null +++ b/src/elastic/service/migrationsService.test.ts @@ -0,0 +1,329 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { describe, it, expect, beforeEach } from "vitest"; +import { + MigrationsService, + MigrationApiError, + SIEM_MIGRATIONS_API_BASE, +} from "./migrationsService.js"; +import type { KibanaClient } from "../kibana-client/index.js"; +import { + createMockKibanaClient, + dataEnvelope, + type MockHttpClient, +} from "../../test/helpers/mockHttpClient.js"; +import type { SiemMigration, TranslatedRule, MigrationResource } from "./migrationsService.js"; + +const BASE = SIEM_MIGRATIONS_API_BASE; +const HEADERS = { headers: { "elastic-api-version": "2023-10-31" } }; + +const MIGRATION_ID = "migration-1"; +const RULE_ID = "rule-1"; + +const fakeMigration: SiemMigration = { + id: MIGRATION_ID, + name: "test-migration", + status: "ready", + created_at: "2026-01-01T00:00:00Z", + last_updated_at: "2026-01-01T00:00:00Z", + rules: { + total: 0, pending: 0, processing: 0, completed: 0, failed: 0, + installable: 0, installed: 0, partially_translated: 0, untranslatable: 0, + }, +}; + +const fakeRule: TranslatedRule = { + id: RULE_ID, + migration_id: MIGRATION_ID, + status: "completed", + translation_result: "full", + original_rule: { name: "splunk-rule" }, +}; + +const fakeResource: MigrationResource = { + type: "macro", + name: "my_macro", + content: "| where true", +}; + +describe("MigrationsService", () => { + let kibanaClient: KibanaClient & MockHttpClient; + let service: MigrationsService; + + beforeEach(() => { + kibanaClient = createMockKibanaClient(); + service = new MigrationsService({ kibanaClient }); + }); + + // ── Migration lifecycle ──────────────────────────────────────────────────── + + describe("createMigration", () => { + it("POSTs to /rules with the migration name and returns migration_id", async () => { + kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ migration_id: MIGRATION_ID })); + + const result = await service.createMigration("My Migration"); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules`, + { name: "My Migration" 
}, + HEADERS + ); + expect(result).toEqual({ migration_id: MIGRATION_ID }); + }); + }); + + describe("listMigrations", () => { + it("GETs /rules and returns the array", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope([fakeMigration])); + + const result = await service.listMigrations(); + + expect(kibanaClient.get).toHaveBeenCalledWith(`${BASE}/rules`, HEADERS); + expect(result).toEqual([fakeMigration]); + }); + }); + + describe("getMigration", () => { + it("GETs /rules/:migrationId and returns the migration", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope(fakeMigration)); + + const result = await service.getMigration(MIGRATION_ID); + + expect(kibanaClient.get).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}`, + HEADERS + ); + expect(result).toEqual(fakeMigration); + }); + }); + + describe("deleteMigration", () => { + it("DELETEs /rules/:migrationId", async () => { + await service.deleteMigration(MIGRATION_ID); + + expect(kibanaClient.delete).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}`, + HEADERS + ); + }); + }); + + // ── Rule upload ──────────────────────────────────────────────────────────── + + describe("uploadRules", () => { + it("POSTs rules array to /rules/:migrationId/rules and returns totals", async () => { + kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ total: 5 })); + const splunkRules = [{ search: "index=main" }, { search: "index=security" }]; + + const result = await service.uploadRules(MIGRATION_ID, splunkRules); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/rules`, + splunkRules, + HEADERS + ); + expect(result).toEqual({ total: 5 }); + }); + }); + + // ── Translated rules ─────────────────────────────────────────────────────── + + describe("getTranslatedRules", () => { + it("GETs /rules/:migrationId/rules with default pagination", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope({ data: [fakeRule], total: 1 })); + + const 
result = await service.getTranslatedRules(MIGRATION_ID); + + const [path, config] = kibanaClient.get.mock.calls[0] as [string, Record]; + expect(path).toBe(`${BASE}/rules/${MIGRATION_ID}/rules`); + expect(config.params).toMatchObject({ page: "1", per_page: "20" }); + expect(result).toEqual({ data: [fakeRule], total: 1 }); + }); + + it("forwards custom page, perPage and filter params", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope({ data: [], total: 0 })); + + await service.getTranslatedRules(MIGRATION_ID, { page: 2, perPage: 50, filter: "status:completed" }); + + const [, config] = kibanaClient.get.mock.calls[0] as [string, Record]; + expect(config.params).toEqual({ page: "2", per_page: "50", filter: "status:completed" }); + }); + }); + + describe("getTranslatedRule", () => { + it("GETs /rules/:migrationId/rules/:ruleId", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope(fakeRule)); + + const result = await service.getTranslatedRule(MIGRATION_ID, RULE_ID); + + expect(kibanaClient.get).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/rules/${RULE_ID}`, + HEADERS + ); + expect(result).toEqual(fakeRule); + }); + }); + + describe("updateTranslatedRule", () => { + it("PUTs updates to /rules/:migrationId/rules/:ruleId and returns the updated rule", async () => { + const updated = { ...fakeRule, translation_result: "partial" as const }; + kibanaClient.put.mockResolvedValueOnce(dataEnvelope(updated)); + + const result = await service.updateTranslatedRule(MIGRATION_ID, RULE_ID, { + translation_result: "partial", + }); + + expect(kibanaClient.put).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/rules/${RULE_ID}`, + { translation_result: "partial" }, + HEADERS + ); + expect(result).toEqual(updated); + }); + }); + + // ── Translation control ──────────────────────────────────────────────────── + + describe("startTranslation", () => { + it("POSTs to /rules/:migrationId/start", async () => { + await 
service.startTranslation(MIGRATION_ID); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/start`, + {}, + HEADERS + ); + }); + }); + + describe("stopTranslation", () => { + it("POSTs to /rules/:migrationId/stop", async () => { + await service.stopTranslation(MIGRATION_ID); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/stop`, + {}, + HEADERS + ); + }); + }); + + // ── Resources ────────────────────────────────────────────────────────────── + + describe("getResources", () => { + it("GETs /resources/:migrationId and returns the array", async () => { + kibanaClient.get.mockResolvedValueOnce(dataEnvelope([fakeResource])); + + const result = await service.getResources(MIGRATION_ID); + + expect(kibanaClient.get).toHaveBeenCalledWith( + `${BASE}/resources/${MIGRATION_ID}`, + HEADERS + ); + expect(result).toEqual([fakeResource]); + }); + }); + + describe("upsertResources", () => { + it("POSTs resources array to /resources/:migrationId", async () => { + await service.upsertResources(MIGRATION_ID, [fakeResource]); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/resources/${MIGRATION_ID}`, + [fakeResource], + HEADERS + ); + }); + }); + + // ── Installation ─────────────────────────────────────────────────────────── + + describe("installRules", () => { + it("POSTs empty body to /rules/:migrationId/install when no ids given", async () => { + kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ installed: 3, failed: 0 })); + + const result = await service.installRules(MIGRATION_ID); + + expect(kibanaClient.post).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/install`, + {}, + HEADERS + ); + expect(result).toEqual({ installed: 3, failed: 0 }); + }); + + it("includes ids in the body when provided", async () => { + kibanaClient.post.mockResolvedValueOnce(dataEnvelope({ installed: 1, failed: 0 })); + + await service.installRules(MIGRATION_ID, { ids: ["r1", "r2"] }); + + const [, body] = 
kibanaClient.post.mock.calls[0] as [string, Record]; + expect(body).toEqual({ ids: ["r1", "r2"] }); + }); + }); + + // ── Stats ────────────────────────────────────────────────────────────────── + + describe("getStats", () => { + it("GETs /rules/:migrationId/stats and returns the stats", async () => { + const stats = { id: MIGRATION_ID, status: "ready" as const, rules: fakeMigration.rules }; + kibanaClient.get.mockResolvedValueOnce(dataEnvelope(stats)); + + const result = await service.getStats(MIGRATION_ID); + + expect(kibanaClient.get).toHaveBeenCalledWith( + `${BASE}/rules/${MIGRATION_ID}/stats`, + HEADERS + ); + expect(result).toEqual(stats); + }); + }); + + // ── MigrationApiError ────────────────────────────────────────────────────── + + describe("MigrationApiError", () => { + it("wraps non-2xx with status parsed from Kibana error format", async () => { + const path = `${BASE}/rules/${MIGRATION_ID}`; + kibanaClient.get.mockRejectedValue( + new Error("Kibana [test-cluster] 404: migration not found") + ); + + await expect(service.getMigration(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.getMigration(MIGRATION_ID)).rejects.toMatchObject({ + status: 404, + path, + message: expect.stringContaining(path) as string, + }); + }); + + it("sets status 0 when error message has no HTTP status code", async () => { + kibanaClient.get.mockRejectedValueOnce(new Error("network timeout")); + + const err = await service.getMigration(MIGRATION_ID).catch((e) => e as MigrationApiError); + expect(err).toBeInstanceOf(MigrationApiError); + expect(err.status).toBe(0); + }); + + it("surfaces a MigrationApiError from every mutating method", async () => { + const netErr = new Error("Kibana [test-cluster] 503: service unavailable"); + + kibanaClient.post.mockRejectedValue(netErr); + kibanaClient.put.mockRejectedValue(netErr); + kibanaClient.delete.mockRejectedValue(netErr); + + await 
expect(service.createMigration("x")).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.uploadRules(MIGRATION_ID, [])).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.startTranslation(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.stopTranslation(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.upsertResources(MIGRATION_ID, [])).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.installRules(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.updateTranslatedRule(MIGRATION_ID, RULE_ID, {})).rejects.toBeInstanceOf(MigrationApiError); + await expect(service.deleteMigration(MIGRATION_ID)).rejects.toBeInstanceOf(MigrationApiError); + }); + }); +}); diff --git a/src/elastic/service/migrationsService.ts b/src/elastic/service/migrationsService.ts new file mode 100644 index 0000000..ffd0dd4 --- /dev/null +++ b/src/elastic/service/migrationsService.ts @@ -0,0 +1,361 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { KibanaClient } from "../kibana-client/index.js"; + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +export const SIEM_MIGRATIONS_API_BASE = "/internal/siem_migrations"; + +/** + * Per-request headers required by the Kibana internal SIEM migrations API. + * `x-elastic-internal-origin: Kibana` is pre-baked into `KibanaClient`; + * only the versioning header needs to be added on each call. 
/**
 * Per-request headers sent with every SIEM migrations call.
 *
 * Only the API versioning header is listed here; the file's own notes state
 * that `x-elastic-internal-origin: Kibana` is already supplied by
 * `KibanaClient`.
 */
const MIGRATION_HEADERS = {
  "elastic-api-version": "2023-10-31",
} as const;

// ---------------------------------------------------------------------------
// Domain types
// ---------------------------------------------------------------------------

/** A rule migration as returned by the migrations API. */
export interface SiemMigration {
  id: string;
  name: string;
  /** Lifecycle status of the migration. */
  status: "ready" | "running" | "finished" | "error";
  // NOTE(review): presumably ISO-8601 timestamps (tests use "2026-01-01T00:00:00Z") — confirm.
  created_at: string;
  last_updated_at: string;
  /** Rule counters for this migration. NOTE(review): presumably one count per rule state — confirm against the API. */
  rules: {
    total: number;
    pending: number;
    processing: number;
    completed: number;
    failed: number;
    installable: number;
    installed: number;
    partially_translated: number;
    untranslatable: number;
  };
}

/** A single rule inside a migration, together with its translation outcome. */
export interface TranslatedRule {
  id: string;
  migration_id: string;
  status: "pending" | "processing" | "completed" | "failed";
  translation_result?: "full" | "partial" | "untranslatable";
  /** Translated rule payload; its schema is not defined in this module. */
  elastic_rule?: Record<string, unknown>;
  /** Source (vendor) rule payload; its schema is not defined in this module. */
  original_rule: Record<string, unknown>;
  comments?: string[];
}

/** A supporting resource attached to a migration. */
export interface MigrationResource {
  type: "macro" | "lookup";
  name: string;
  content: string;
}

/** Status and rule counters of one migration, reusing the {@link SiemMigration} shapes. */
export interface MigrationStats {
  id: string;
  status: SiemMigration["status"];
  rules: SiemMigration["rules"];
}

/** Paging and filtering options for listing translated rules. */
export interface ListTranslatedRulesOptions {
  readonly page?: number;
  readonly perPage?: number;
  readonly filter?: string;
}

/** One page of translated rules plus the reported total. */
export interface ListTranslatedRulesResult {
  data: TranslatedRule[];
  total: number;
}

export interface InstallRulesOptions {
  /** Specific rule IDs to install; omit to install all installable rules. */
  ids?: string[];
}

/** Outcome counters returned by the install route. */
export interface InstallRulesResult {
  installed: number;
  failed: number;
}

// ---------------------------------------------------------------------------
// Typed error
// ---------------------------------------------------------------------------
/**
 * Matches the message format produced by the Kibana client:
 * `Kibana [<name>] <status>: <detail>`. Anchoring on that shape keeps digits
 * inside the cluster name or the detail text from being read as the status.
 */
const KIBANA_STATUS_PATTERN = /^Kibana \[.*?\]\s+([1-5]\d{2})\b/;

/**
 * Fallback for messages that spell the status out explicitly, e.g. an
 * unformatted HTTP-client message such as "Request failed with status code 404".
 */
const STATUS_CODE_PATTERN = /\bstatus code\s+([1-5]\d{2})\b/i;

/**
 * Error thrown when a SIEM migrations request fails.
 *
 * The failure is wrapped rather than replaced: the original error is kept on
 * `cause`, and its message is appended to this error's message.
 *
 * - `path` is the request path that failed.
 * - `status` is the HTTP status parsed from the cause's message, or `0` when
 *   the message carries no recognisable status (for example a network error).
 */
export class MigrationApiError extends Error {
  readonly status: number;
  readonly path: string;

  /**
   * @param path  Request path that failed; echoed in the message and on `path`.
   * @param cause The underlying failure. Non-`Error` values are stringified.
   */
  constructor(path: string, cause: unknown) {
    const causeMsg = cause instanceof Error ? cause.message : String(cause);
    // Prefer the documented Kibana client format; only then accept explicit
    // "status code NNN" wording. A bare three-digit number is NOT a status.
    const statusMatch =
      KIBANA_STATUS_PATTERN.exec(causeMsg) ?? STATUS_CODE_PATTERN.exec(causeMsg);

    super(`SIEM Migrations API error on ${path}: ${causeMsg}`);
    this.name = "MigrationApiError";
    this.status = Number(statusMatch?.[1] ?? 0);
    this.path = path;
    if (cause instanceof Error) {
      this.cause = cause;
    }
  }
}

// ---------------------------------------------------------------------------
// Service
// ---------------------------------------------------------------------------

/** Constructor options for the migrations service. */
interface MigrationsServiceOptions {
  readonly kibanaClient: KibanaClient;
}
/**
 * Thin wrapper over the `/internal/siem_migrations/*` Kibana routes (14 methods).
 *
 * Every request carries `MIGRATION_HEADERS`. Any failure raised while issuing
 * a request or reading its response is re-thrown as {@link MigrationApiError}
 * carrying the request path.
 *
 * Caller-supplied identifiers are percent-encoded before being placed in the
 * URL path, so an id containing `/`, `..`, `?` or `#` cannot re-target the
 * request at a different route. Ordinary ids are unaffected.
 */
export class MigrationsService {
  private readonly client: KibanaClient;

  constructor(options: MigrationsServiceOptions) {
    this.client = options.kibanaClient;
  }

  // ── Internal helpers ─────────────────────────────────────────────────────

  /** Builds a route under the API base, percent-encoding every segment. */
  private static route(...segments: string[]): string {
    const encoded = segments.map((segment) => encodeURIComponent(segment));
    return [SIEM_MIGRATIONS_API_BASE, ...encoded].join("/");
  }

  /** Runs `call`, converting anything it throws into a {@link MigrationApiError} for `path`. */
  private async guard<T>(path: string, call: () => Promise<T>): Promise<T> {
    try {
      return await call();
    } catch (err) {
      throw new MigrationApiError(path, err);
    }
  }

  /**
   * Like {@link guard}, but also unwraps the response's `data` field.
   * The unwrap happens inside the guarded region so a malformed response is
   * reported as a {@link MigrationApiError} too.
   */
  private fetchData<T>(path: string, call: () => Promise<{ data: T }>): Promise<T> {
    return this.guard(path, async () => {
      const { data } = await call();
      return data;
    });
  }

  // ── Migration lifecycle ──────────────────────────────────────────────────

  /** POST /internal/siem_migrations/rules */
  async createMigration(name: string): Promise<{ migration_id: string }> {
    const path = MigrationsService.route("rules");
    return this.fetchData(path, () =>
      this.client.post<{ migration_id: string }>(path, { name }, { headers: MIGRATION_HEADERS })
    );
  }

  /** GET /internal/siem_migrations/rules */
  async listMigrations(): Promise<SiemMigration[]> {
    const path = MigrationsService.route("rules");
    return this.fetchData(path, () =>
      this.client.get<SiemMigration[]>(path, { headers: MIGRATION_HEADERS })
    );
  }

  /** GET /internal/siem_migrations/rules/:migrationId */
  async getMigration(migrationId: string): Promise<SiemMigration> {
    const path = MigrationsService.route("rules", migrationId);
    return this.fetchData(path, () =>
      this.client.get<SiemMigration>(path, { headers: MIGRATION_HEADERS })
    );
  }

  /** DELETE /internal/siem_migrations/rules/:migrationId */
  async deleteMigration(migrationId: string): Promise<void> {
    const path = MigrationsService.route("rules", migrationId);
    await this.guard(path, () => this.client.delete(path, { headers: MIGRATION_HEADERS }));
  }

  // ── Splunk rule upload ───────────────────────────────────────────────────

  /** POST /internal/siem_migrations/rules/:migrationId/rules */
  async uploadRules(
    migrationId: string,
    rules: Record<string, unknown>[]
  ): Promise<{ total: number }> {
    const path = MigrationsService.route("rules", migrationId, "rules");
    return this.fetchData(path, () =>
      this.client.post<{ total: number }>(path, rules, { headers: MIGRATION_HEADERS })
    );
  }

  // ── Translated rules ─────────────────────────────────────────────────────

  /**
   * GET /internal/siem_migrations/rules/:migrationId/rules
   *
   * Sends `page` (default 1) and `per_page` (default 20) as strings; `filter`
   * is only sent when it is a non-empty string.
   */
  async getTranslatedRules(
    migrationId: string,
    options: ListTranslatedRulesOptions = {}
  ): Promise<ListTranslatedRulesResult> {
    const path = MigrationsService.route("rules", migrationId, "rules");
    const params: Record<string, string> = {
      page: String(options.page ?? 1),
      per_page: String(options.perPage ?? 20),
    };
    if (options.filter) params.filter = options.filter;

    return this.fetchData(path, () =>
      this.client.get<ListTranslatedRulesResult>(path, { params, headers: MIGRATION_HEADERS })
    );
  }

  /** GET /internal/siem_migrations/rules/:migrationId/rules/:ruleId */
  async getTranslatedRule(migrationId: string, ruleId: string): Promise<TranslatedRule> {
    const path = MigrationsService.route("rules", migrationId, "rules", ruleId);
    return this.fetchData(path, () =>
      this.client.get<TranslatedRule>(path, { headers: MIGRATION_HEADERS })
    );
  }

  /** PUT /internal/siem_migrations/rules/:migrationId/rules/:ruleId */
  async updateTranslatedRule(
    migrationId: string,
    ruleId: string,
    updates: Partial<TranslatedRule>
  ): Promise<TranslatedRule> {
    const path = MigrationsService.route("rules", migrationId, "rules", ruleId);
    return this.fetchData(path, () =>
      this.client.put<TranslatedRule>(path, updates, { headers: MIGRATION_HEADERS })
    );
  }

  // ── Translation control ──────────────────────────────────────────────────

  /** POST /internal/siem_migrations/rules/:migrationId/start */
  async startTranslation(migrationId: string): Promise<void> {
    const path = MigrationsService.route("rules", migrationId, "start");
    await this.guard(path, () => this.client.post(path, {}, { headers: MIGRATION_HEADERS }));
  }

  /** POST /internal/siem_migrations/rules/:migrationId/stop */
  async stopTranslation(migrationId: string): Promise<void> {
    const path = MigrationsService.route("rules", migrationId, "stop");
    await this.guard(path, () => this.client.post(path, {}, { headers: MIGRATION_HEADERS }));
  }

  // ── Resources ────────────────────────────────────────────────────────────

  /** GET /internal/siem_migrations/resources/:migrationId */
  async getResources(migrationId: string): Promise<MigrationResource[]> {
    const path = MigrationsService.route("resources", migrationId);
    return this.fetchData(path, () =>
      this.client.get<MigrationResource[]>(path, { headers: MIGRATION_HEADERS })
    );
  }

  /** POST /internal/siem_migrations/resources/:migrationId */
  async upsertResources(migrationId: string, resources: MigrationResource[]): Promise<void> {
    const path = MigrationsService.route("resources", migrationId);
    await this.guard(path, () =>
      this.client.post(path, resources, { headers: MIGRATION_HEADERS })
    );
  }

  // ── Installation ─────────────────────────────────────────────────────────

  /**
   * POST /internal/siem_migrations/rules/:migrationId/install
   *
   * Sends `{ ids }` when ids are provided and an empty body otherwise.
   */
  async installRules(
    migrationId: string,
    options: InstallRulesOptions = {}
  ): Promise<InstallRulesResult> {
    const path = MigrationsService.route("rules", migrationId, "install");
    const body = options.ids ? { ids: options.ids } : {};
    return this.fetchData(path, () =>
      this.client.post<InstallRulesResult>(path, body, { headers: MIGRATION_HEADERS })
    );
  }

  // ── Stats ────────────────────────────────────────────────────────────────

  /** GET /internal/siem_migrations/rules/:migrationId/stats */
  async getStats(migrationId: string): Promise<MigrationStats> {
    const path = MigrationsService.route("rules", migrationId, "stats");
    return this.fetchData(path, () =>
      this.client.get<MigrationStats>(path, { headers: MIGRATION_HEADERS })
    );
  }
}
a/src/test/helpers/mockHttpClient.ts +++ b/src/test/helpers/mockHttpClient.ts @@ -17,6 +17,7 @@ import type { KibanaClient } from "../../elastic/kibana-client/kibana-client.js" export interface MockHttpClient { get: Mock; post: Mock; + put: Mock; patch: Mock; delete: Mock; clusterName: string; @@ -48,6 +49,7 @@ function makeMock(clusterName: string): MockHttpClient { return { get: vi.fn().mockResolvedValue({ data: undefined }), post: vi.fn().mockResolvedValue({ data: undefined }), + put: vi.fn().mockResolvedValue({ data: undefined }), patch: vi.fn().mockResolvedValue({ data: undefined }), delete: vi.fn().mockResolvedValue({ data: undefined }), clusterName, diff --git a/src/test/helpers/mockServices.ts b/src/test/helpers/mockServices.ts index bb77c48..819e95c 100644 --- a/src/test/helpers/mockServices.ts +++ b/src/test/helpers/mockServices.ts @@ -13,6 +13,7 @@ import type { EntityDetailService } from "../../elastic/service/entityDetailServ import type { EsqlService } from "../../elastic/service/esqlService.js"; import type { IndicesService } from "../../elastic/service/indicesService.js"; import type { InvestigateService } from "../../elastic/service/investigateService.js"; +import type { MigrationsService } from "../../elastic/service/migrationsService.js"; import type { RulesService } from "../../elastic/service/rulesService.js"; import type { SampleDataService } from "../../elastic/service/sampleDataService.js"; @@ -99,6 +100,25 @@ export function createMockRulesService(): RulesService { ]); } +export function createMockMigrationsService(): MigrationsService { + return mockService([ + "createMigration", + "listMigrations", + "getMigration", + "deleteMigration", + "uploadRules", + "getTranslatedRules", + "getTranslatedRule", + "updateTranslatedRule", + "startTranslation", + "stopTranslation", + "getResources", + "upsertResources", + "installRules", + "getStats", + ]); +} + export function createMockSampleDataService(): SampleDataService { return mockService([ 
"generateSampleData", diff --git a/src/test/integration/server.integration.test.ts b/src/test/integration/server.integration.test.ts index eb26b9e..1771fac 100644 --- a/src/test/integration/server.integration.test.ts +++ b/src/test/integration/server.integration.test.ts @@ -139,6 +139,18 @@ describe("MCP server integration (in-process Client + Server)", () => { "generate-attack-discovery", "get-generation-status", "list-ai-connectors", + // automatic-migration + "migrate-rules", + "list-migrations", + "get-migration", + "get-translated-rules", + "start-translation", + "stop-translation", + "update-translated-rule", + "get-resources", + "upsert-resource", + "install-rules", + "get-stats", ].sort() ); } finally { @@ -159,6 +171,7 @@ describe("MCP server integration (in-process Client + Server)", () => { "ui://threat-hunt/mcp-app.html", "ui://generate-sample-data/mcp-app.html", "ui://triage-attack-discoveries/mcp-app.html", + "ui://migrate-rules/mcp-app.html", ].sort() ); } finally { diff --git a/src/tools/migration.test.ts b/src/tools/migration.test.ts new file mode 100644 index 0000000..7193075 --- /dev/null +++ b/src/tools/migration.test.ts @@ -0,0 +1,408 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { describe, it, expect, vi, beforeEach } from "vitest"; +import fs from "fs"; +import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; + +import { registerMigrationTools } from "./migration.js"; +import { + createMockMcpServer, + parseToolText, + type MockMcpServer, +} from "../test/helpers/mockMcpServer.js"; +import { createMockMigrationsService } from "../test/helpers/mockServices.js"; +import type { MigrationsService } from "../elastic/service/index.js"; + +const RESOURCE_URI = "ui://migrate-rules/mcp-app.html"; +const MIGRATION_ID = "m-1"; +const RULE_ID = "r-1"; + +function setup() { + const server = createMockMcpServer(); + const migrationsService = createMockMigrationsService(); + vi.spyOn(fs, "existsSync").mockReturnValue(false); + vi.spyOn(fs, "readFileSync").mockReturnValue("migration"); + registerMigrationTools(server as unknown as McpServer, { migrationsService }); + return { server, migrationsService }; +} + +describe("registerMigrationTools", () => { + let server: MockMcpServer; + let migrationsService: MigrationsService; + + beforeEach(() => { + ({ server, migrationsService } = setup()); + }); + + // ── Registration ─────────────────────────────────────────────────────────── + + it("registers all 11 tools and the HTML resource", () => { + expect([...server.tools.keys()].sort()).toEqual( + [ + "migrate-rules", + "list-migrations", + "get-migration", + "get-translated-rules", + "start-translation", + "stop-translation", + "update-translated-rule", + "get-resources", + "upsert-resource", + "install-rules", + "get-stats", + ].sort() + ); + expect([...server.resources.keys()]).toEqual([RESOURCE_URI]); + }); + + // ── migrate-rules (model-facing) ─────────────────────────────────────────── + + describe("migrate-rules", () => { + it("returns a compact migration list for the LLM to see", async () => { + vi.mocked(migrationsService.listMigrations).mockResolvedValueOnce([ + { + id: MIGRATION_ID, + name: "Splunk prod", + status: 
"ready", + created_at: "2026-01-01T00:00:00Z", + last_updated_at: "2026-01-01T00:00:00Z", + rules: { + total: 10, pending: 5, processing: 0, completed: 5, failed: 0, + installable: 5, installed: 0, partially_translated: 0, untranslatable: 0, + }, + }, + ]); + + const out = parseToolText<{ message: string; migrations: unknown[] }>( + await server.tool("migrate-rules").callback({}) + ); + + expect(out.message).toContain("workbench"); + expect(out.migrations).toHaveLength(1); + expect(out.migrations[0]).toMatchObject({ id: MIGRATION_ID, name: "Splunk prod" }); + }); + }); + + // ── list-migrations ──────────────────────────────────────────────────────── + + describe("list-migrations", () => { + it("delegates to migrationsService.listMigrations and returns the array", async () => { + vi.mocked(migrationsService.listMigrations).mockResolvedValueOnce([]); + + const out = parseToolText( + await server.tool("list-migrations").callback({}) + ); + + expect(migrationsService.listMigrations).toHaveBeenCalledTimes(1); + expect(out).toEqual([]); + }); + }); + + // ── get-migration ────────────────────────────────────────────────────────── + + describe("get-migration", () => { + it("calls getMigration with the provided ID", async () => { + vi.mocked(migrationsService.getMigration).mockResolvedValueOnce({ + id: MIGRATION_ID, + name: "test", + status: "ready", + created_at: "", + last_updated_at: "", + rules: { + total: 0, pending: 0, processing: 0, completed: 0, failed: 0, + installable: 0, installed: 0, partially_translated: 0, untranslatable: 0, + }, + }); + + await server.tool("get-migration").callback({ migrationId: MIGRATION_ID }); + + expect(migrationsService.getMigration).toHaveBeenCalledWith(MIGRATION_ID); + }); + }); + + // ── get-translated-rules ─────────────────────────────────────────────────── + + describe("get-translated-rules", () => { + it("forwards pagination params to getTranslatedRules", async () => { + 
vi.mocked(migrationsService.getTranslatedRules).mockResolvedValueOnce({ + data: [], + total: 0, + }); + + await server.tool("get-translated-rules").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + page: 2, + perPage: 50, + filter: "status:completed", + }); + + expect(migrationsService.getTranslatedRules).toHaveBeenCalledWith( + MIGRATION_ID, + { page: 2, perPage: 50, filter: "status:completed" } + ); + }); + + it("returns vendorNotSupported for a non-Splunk vendor", async () => { + const out = parseToolText<{ error: string; vendor: string }>( + await server.tool("get-translated-rules").callback({ + migrationId: MIGRATION_ID, + vendor: "qradar", + }) + ); + + expect(out).toEqual({ error: "vendorNotSupported", vendor: "qradar" }); + expect(migrationsService.getTranslatedRules).not.toHaveBeenCalled(); + }); + }); + + // ── start-translation ────────────────────────────────────────────────────── + + describe("start-translation", () => { + it("calls startTranslation and returns { status: 'started' }", async () => { + vi.mocked(migrationsService.startTranslation).mockResolvedValueOnce(undefined); + + const out = parseToolText<{ status: string }>( + await server.tool("start-translation").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + }) + ); + + expect(migrationsService.startTranslation).toHaveBeenCalledWith(MIGRATION_ID); + expect(out.status).toBe("started"); + }); + + it("returns vendorNotSupported for sentinel-one", async () => { + const out = parseToolText<{ error: string; vendor: string }>( + await server.tool("start-translation").callback({ + migrationId: MIGRATION_ID, + vendor: "sentinel-one", + }) + ); + + expect(out).toEqual({ error: "vendorNotSupported", vendor: "sentinel-one" }); + expect(migrationsService.startTranslation).not.toHaveBeenCalled(); + }); + }); + + // ── stop-translation ─────────────────────────────────────────────────────── + + describe("stop-translation", () => { + it("calls stopTranslation and returns { status: 
'stopped' }", async () => { + vi.mocked(migrationsService.stopTranslation).mockResolvedValueOnce(undefined); + + const out = parseToolText<{ status: string }>( + await server.tool("stop-translation").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + }) + ); + + expect(migrationsService.stopTranslation).toHaveBeenCalledWith(MIGRATION_ID); + expect(out.status).toBe("stopped"); + }); + + it("returns vendorNotSupported for an unknown vendor", async () => { + const out = parseToolText<{ error: string }>( + await server.tool("stop-translation").callback({ + migrationId: MIGRATION_ID, + vendor: "unknown-siem", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + expect(migrationsService.stopTranslation).not.toHaveBeenCalled(); + }); + }); + + // ── update-translated-rule ───────────────────────────────────────────────── + + describe("update-translated-rule", () => { + it("parses elasticRule JSON and passes updates to service", async () => { + vi.mocked(migrationsService.updateTranslatedRule).mockResolvedValueOnce({ + id: RULE_ID, + migration_id: MIGRATION_ID, + status: "completed", + translation_result: "partial", + original_rule: {}, + }); + const elasticRule = { name: "Fixed rule", type: "query" }; + + await server.tool("update-translated-rule").callback({ + migrationId: MIGRATION_ID, + ruleId: RULE_ID, + vendor: "splunk", + elasticRule: JSON.stringify(elasticRule), + translationResult: "partial", + }); + + expect(migrationsService.updateTranslatedRule).toHaveBeenCalledWith( + MIGRATION_ID, + RULE_ID, + expect.objectContaining({ + elastic_rule: elasticRule, + translation_result: "partial", + }) + ); + }); + + it("returns vendorNotSupported without calling service", async () => { + const out = parseToolText<{ error: string }>( + await server.tool("update-translated-rule").callback({ + migrationId: MIGRATION_ID, + ruleId: RULE_ID, + vendor: "qradar", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + 
expect(migrationsService.updateTranslatedRule).not.toHaveBeenCalled(); + }); + }); + + // ── get-resources ────────────────────────────────────────────────────────── + + describe("get-resources", () => { + it("calls getResources with migrationId", async () => { + vi.mocked(migrationsService.getResources).mockResolvedValueOnce([ + { type: "macro", name: "my_macro", content: "| where true" }, + ]); + + const out = parseToolText( + await server.tool("get-resources").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + }) + ); + + expect(migrationsService.getResources).toHaveBeenCalledWith(MIGRATION_ID); + expect(out).toHaveLength(1); + }); + + it("returns vendorNotSupported for non-Splunk", async () => { + const out = parseToolText<{ error: string }>( + await server.tool("get-resources").callback({ + migrationId: MIGRATION_ID, + vendor: "qradar", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + }); + }); + + // ── upsert-resource ──────────────────────────────────────────────────────── + + describe("upsert-resource", () => { + it("calls upsertResources with a single-element array", async () => { + vi.mocked(migrationsService.upsertResources).mockResolvedValueOnce(undefined); + + await server.tool("upsert-resource").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + type: "macro", + name: "splunk_macro", + content: "| eval x=1", + }); + + expect(migrationsService.upsertResources).toHaveBeenCalledWith( + MIGRATION_ID, + [{ type: "macro", name: "splunk_macro", content: "| eval x=1" }] + ); + }); + + it("returns vendorNotSupported for non-Splunk", async () => { + const out = parseToolText<{ error: string }>( + await server.tool("upsert-resource").callback({ + migrationId: MIGRATION_ID, + vendor: "sentinel-one", + type: "macro", + name: "m", + content: "", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + expect(migrationsService.upsertResources).not.toHaveBeenCalled(); + }); + }); + + // ── install-rules 
────────────────────────────────────────────────────────── + + describe("install-rules", () => { + it("passes ids array to installRules", async () => { + vi.mocked(migrationsService.installRules).mockResolvedValueOnce({ + installed: 2, + failed: 0, + }); + + const out = parseToolText<{ installed: number; failed: number }>( + await server.tool("install-rules").callback({ + migrationId: MIGRATION_ID, + vendor: "splunk", + ids: ["r-1", "r-2"], + }) + ); + + expect(migrationsService.installRules).toHaveBeenCalledWith( + MIGRATION_ID, + { ids: ["r-1", "r-2"] } + ); + expect(out).toEqual({ installed: 2, failed: 0 }); + }); + + it("returns vendorNotSupported for non-Splunk", async () => { + const out = parseToolText<{ error: string }>( + await server.tool("install-rules").callback({ + migrationId: MIGRATION_ID, + vendor: "qradar", + }) + ); + + expect(out.error).toBe("vendorNotSupported"); + expect(migrationsService.installRules).not.toHaveBeenCalled(); + }); + }); + + // ── get-stats ────────────────────────────────────────────────────────────── + + describe("get-stats", () => { + it("calls getStats and returns the result (no vendor gate)", async () => { + const stats = { + id: MIGRATION_ID, + status: "ready" as const, + rules: { + total: 5, pending: 5, processing: 0, completed: 0, failed: 0, + installable: 0, installed: 0, partially_translated: 0, untranslatable: 0, + }, + }; + vi.mocked(migrationsService.getStats).mockResolvedValueOnce(stats); + + const out = parseToolText( + await server.tool("get-stats").callback({ migrationId: MIGRATION_ID }) + ); + + expect(migrationsService.getStats).toHaveBeenCalledWith(MIGRATION_ID); + expect(out).toEqual(stats); + }); + }); + + // ── Vendor gate: undefined vendor is allowed ─────────────────────────────── + + it("proceeds when vendor parameter is absent (defaults to Splunk path)", async () => { + vi.mocked(migrationsService.startTranslation).mockResolvedValueOnce(undefined); + + const out = parseToolText<{ status: string }>( + 
await server.tool("start-translation").callback({ migrationId: MIGRATION_ID }) + ); + + expect(out.status).toBe("started"); + expect(migrationsService.startTranslation).toHaveBeenCalled(); + }); +}); diff --git a/src/tools/migration.ts b/src/tools/migration.ts new file mode 100644 index 0000000..5502bd2 --- /dev/null +++ b/src/tools/migration.ts @@ -0,0 +1,353 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { + registerAppTool, + registerAppResource, + RESOURCE_MIME_TYPE, +} from "@modelcontextprotocol/ext-apps/server"; +import { z } from "zod"; +import fs from "fs"; +import type { MigrationsService } from "../elastic/service/index.js"; +import { resolveViewPath } from "./view-path.js"; + +const RESOURCE_URI = "ui://migrate-rules/mcp-app.html"; + +/** + * Vendors for which the Kibana SIEM migrations translator is production-ready. + * Re-enabling a vendor is a one-line change to this array once the translator + * matures — QRadar and Sentinel-One are the next candidates. + */ +const SUPPORTED_VENDORS: readonly string[] = ["splunk"]; + +export interface MigrationToolDeps { + readonly migrationsService: MigrationsService; +} + +/** Returns a vendor-gate error response for app-only tools. */ +function vendorNotSupportedResponse(vendor: string) { + return { + content: [ + { + type: "text" as const, + text: JSON.stringify({ error: "vendorNotSupported", vendor }), + }, + ], + }; +} + +/** Returns true when `vendor` is explicitly provided but not in SUPPORTED_VENDORS. 
*/
+function isUnsupportedVendor(vendor: string | undefined): vendor is string {
+  // An absent vendor is deliberately let through; only an explicit, unlisted value is rejected.
+  return vendor !== undefined && !SUPPORTED_VENDORS.includes(vendor);
+}
+
+/** Wraps a payload as the single JSON text content item every migration tool returns. */
+function jsonTextResponse(payload: unknown) {
+  return {
+    content: [{ type: "text" as const, text: JSON.stringify(payload) }],
+  };
+}
+
+/**
+ * Parses the JSON-encoded `elasticRule` tool argument.
+ *
+ * @returns the decoded object, or `null` when the text is not valid JSON or
+ *   does not decode to a plain (non-array) object.
+ */
+function parseElasticRuleJson(raw: string): Record<string, unknown> | null {
+  try {
+    const parsed: unknown = JSON.parse(raw);
+    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return null;
+    return parsed as Record<string, unknown>;
+  } catch {
+    // JSON.parse throws SyntaxError on malformed input; report it as "not a rule".
+    return null;
+  }
+}
+
+/**
+ * Registers the SIEM rule-migration tools and the workbench HTML resource.
+ *
+ * `migrate-rules` is the model-facing entry point that opens the workbench UI;
+ * every other tool is registered with `visibility: ["app"]` and delegates to
+ * `deps.migrationsService`. Tools that accept a `vendor` argument return a
+ * `vendorNotSupported` payload (without calling the service) when the vendor
+ * is given but not listed in SUPPORTED_VENDORS.
+ *
+ * @param server MCP server the tools and resource are registered on.
+ * @param deps   Service dependencies; only `migrationsService` is used.
+ */
+export function registerMigrationTools(
+  server: McpServer,
+  deps: MigrationToolDeps
+) {
+  const { migrationsService } = deps;
+
+  // Shared schema for the optional vendor gate argument (same text on every gated tool).
+  const vendorParam = z
+    .string()
+    .optional()
+    .describe("Source vendor (e.g. 'splunk'). Non-Splunk returns an error.");
+  const migrationIdParam = z.string().describe("Migration ID");
+  const appOnlyMeta = { ui: { visibility: ["app"] } };
+
+  // ── Model-facing entry-point ───────────────────────────────────────────────
+
+  registerAppTool(
+    server,
+    "migrate-rules",
+    {
+      title: "Migrate Rules",
+      description:
+        "Migrate detection rules from Splunk (and other SIEMs) to Elastic Security. " +
+        "Opens an interactive migration workbench for uploading, translating, reviewing, " +
+        "and installing rules. Vendor support: Splunk (active), QRadar / Sentinel-One (coming soon).",
+      inputSchema: {},
+      _meta: { ui: { resourceUri: RESOURCE_URI } },
+    },
+    async () => {
+      const migrations = await migrationsService.listMigrations();
+      // Only the summary fields are returned to the model; the UI fetches details itself.
+      return jsonTextResponse({
+        message: "Opening SIEM migration workbench",
+        migrations: migrations.map(({ id, name, status }) => ({ id, name, status })),
+      });
+    }
+  );
+
+  // ── App-only tools ─────────────────────────────────────────────────────────
+
+  registerAppTool(
+    server,
+    "list-migrations",
+    {
+      title: "List Migrations",
+      description: "List all SIEM rule migrations.",
+      inputSchema: {},
+      _meta: appOnlyMeta,
+    },
+    async () => jsonTextResponse(await migrationsService.listMigrations())
+  );
+
+  registerAppTool(
+    server,
+    "get-migration",
+    {
+      title: "Get Migration",
+      description: "Get details for a specific SIEM migration.",
+      inputSchema: {
+        migrationId: migrationIdParam,
+      },
+      _meta: appOnlyMeta,
+    },
+    async ({ migrationId }) => jsonTextResponse(await migrationsService.getMigration(migrationId))
+  );
+
+  registerAppTool(
+    server,
+    "get-translated-rules",
+    {
+      title: "Get Translated Rules",
+      description: "Get translated rules for a SIEM migration.",
+      inputSchema: {
+        migrationId: migrationIdParam,
+        vendor: vendorParam,
+        page: z.number().optional(),
+        perPage: z.number().optional(),
+        filter: z.string().optional(),
+      },
+      _meta: appOnlyMeta,
+    },
+    async ({ migrationId, vendor, page, perPage, filter }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      const result = await migrationsService.getTranslatedRules(migrationId, {
+        page,
+        perPage,
+        filter,
+      });
+      return jsonTextResponse(result);
+    }
+  );
+
+  registerAppTool(
+    server,
+    "start-translation",
+    {
+      title: "Start Translation",
+      description: "Start the AI translation process for a SIEM migration.",
+      inputSchema: {
+        migrationId: migrationIdParam,
+        vendor: vendorParam,
+      },
+      _meta: appOnlyMeta,
+    },
+    async ({ migrationId, vendor }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      await migrationsService.startTranslation(migrationId);
+      return jsonTextResponse({ status: "started" });
+    }
+  );
+
+  registerAppTool(
+    server,
+    "stop-translation",
+    {
+      title: "Stop Translation",
+      description: "Stop the AI translation process for a SIEM migration.",
+      inputSchema: {
+        migrationId: migrationIdParam,
+        vendor: vendorParam,
+      },
+      _meta: appOnlyMeta,
+    },
+    async ({ migrationId, vendor }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      await migrationsService.stopTranslation(migrationId);
+      return jsonTextResponse({ status: "stopped" });
+    }
+  );
+
+  registerAppTool(
+    server,
+    "update-translated-rule",
+    {
+      title: "Update Translated Rule",
+      description: "Update a translated rule in a SIEM migration (e.g. fix its Elastic rule JSON).",
+      inputSchema: {
+        migrationId: migrationIdParam,
+        ruleId: z.string().describe("Translated rule ID"),
+        vendor: vendorParam,
+        elasticRule: z
+          .string()
+          .optional()
+          .describe("JSON-encoded Elastic rule updates"),
+        translationResult: z
+          .enum(["full", "partial", "untranslatable"])
+          .optional(),
+        comments: z.array(z.string()).optional(),
+      },
+      _meta: appOnlyMeta,
+    },
+    async ({ migrationId, ruleId, vendor, elasticRule, translationResult, comments }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      // Only the fields the caller actually supplied are forwarded to the service.
+      const updates: Record<string, unknown> = {};
+      if (elasticRule !== undefined) {
+        const parsedRule = parseElasticRuleJson(elasticRule);
+        // Malformed rule JSON is reported like the vendor gate: a structured
+        // error payload, and the service is never called.
+        if (parsedRule === null) return jsonTextResponse({ error: "invalidElasticRule" });
+        updates.elastic_rule = parsedRule;
+      }
+      if (translationResult !== undefined) updates.translation_result = translationResult;
+      if (comments !== undefined) updates.comments = comments;
+      const result = await migrationsService.updateTranslatedRule(migrationId, ruleId, updates);
+      return jsonTextResponse(result);
+    }
+  );
+
+  registerAppTool(
+    server,
+    "get-resources",
+    {
+      title: "Get Resources",
+      description: "Get macro/lookup resources for a SIEM migration.",
+      inputSchema: {
+        migrationId: migrationIdParam,
+        vendor: vendorParam,
+      },
+      _meta: appOnlyMeta,
+    },
+    async ({ migrationId, vendor }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      return jsonTextResponse(await migrationsService.getResources(migrationId));
+    }
+  );
+
+  registerAppTool(
+    server,
+    "upsert-resource",
+    {
+      title: "Upsert Resource",
+      description: "Create or update a macro/lookup resource in a SIEM migration.",
+      inputSchema: {
+        migrationId: migrationIdParam,
+        vendor: vendorParam,
+        type: z.enum(["macro", "lookup"]).describe("Resource type"),
+        name: z.string().describe("Resource name"),
+        content: z.string().describe("Resource content"),
+      },
+      _meta: appOnlyMeta,
+    },
+    async ({ migrationId, vendor, type, name, content }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      // The service takes a batch; this tool always sends a single-element one.
+      await migrationsService.upsertResources(migrationId, [{ type, name, content }]);
+      return jsonTextResponse({ status: "ok" });
+    }
+  );
+
+  registerAppTool(
+    server,
+    "install-rules",
+    {
+      title: "Install Rules",
+      description: "Install translated rules from a SIEM migration into Elastic Security.",
+      inputSchema: {
+        migrationId: migrationIdParam,
+        vendor: vendorParam,
+        ids: z
+          .array(z.string())
+          .optional()
+          .describe("Specific rule IDs to install. Omit to install all installable rules."),
+      },
+      _meta: appOnlyMeta,
+    },
+    async ({ migrationId, vendor, ids }) => {
+      if (isUnsupportedVendor(vendor)) return vendorNotSupportedResponse(vendor);
+      return jsonTextResponse(await migrationsService.installRules(migrationId, { ids }));
+    }
+  );
+
+  registerAppTool(
+    server,
+    "get-stats",
+    {
+      title: "Get Stats",
+      description: "Get translation and installation statistics for a SIEM migration.",
+      inputSchema: {
+        migrationId: migrationIdParam,
+      },
+      _meta: appOnlyMeta,
+    },
+    // No vendor gate: this tool takes no vendor argument.
+    async ({ migrationId }) => jsonTextResponse(await migrationsService.getStats(migrationId))
+  );
+
+  // ── App resource (HTML workbench) ──────────────────────────────────────────
+
+  const viewPath = resolveViewPath("migration");
+  registerAppResource(
+    server,
+    RESOURCE_URI,
+    RESOURCE_URI,
+    { mimeType: RESOURCE_MIME_TYPE },
+    async () => {
+      // Read on every request rather than once at registration time.
+      const html = fs.readFileSync(viewPath, "utf-8");
+      return {
+        contents: [{ uri: RESOURCE_URI, mimeType: RESOURCE_MIME_TYPE, text: html }],
+      };
+    }
+  );
+}
diff --git a/src/views/migration/App.tsx b/src/views/migration/App.tsx
new file mode 100644
index 0000000..badcc8d
--- /dev/null
+++ b/src/views/migration/App.tsx
@@ -0,0 +1,1383 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */ + +import React, { useState, useCallback, useEffect, useMemo, useRef } from "react"; +import type { App as McpApp } from "@modelcontextprotocol/ext-apps"; +import { extractCallResult } from "../../shared/extract-tool-text"; +import { + AppHeader, + AppShell, + BackButton, + EmptyState, + KpiStrip, + KpiTile, + LoadingState, +} from "../../shared/components"; +import { useFullscreen } from "../../shared/hooks/useFullscreen"; +import { useMcpApp } from "../../shared/hooks/useMcpApp"; +import "./styles.css"; + +// --------------------------------------------------------------------------- +// Local domain types (shapes returned by the app-only migration tools) +// --------------------------------------------------------------------------- + +interface MigrationStats { + id: string; + name?: string; + /** Lifecycle status returned by get-migration. */ + status: "ready" | "running" | "finished" | "error" | string; + rules: { + total: number; + pending: number; + processing: number; + completed: number; + failed: number; + installable: number; + installed: number; + partially_translated: number; + untranslatable: number; + }; +} + +interface TranslatedRule { + id: string; + status: string; + translation_result?: "full" | "partial" | "untranslatable"; + original_rule: Record; + elastic_rule?: Record; + comments?: string[]; +} + +interface MigrationResource { + type: "macro" | "lookup"; + name: string; + content: string; +} + +interface InstallResult { + installed: number; + failed: number; +} + +// --------------------------------------------------------------------------- +// WorkbenchState discriminated union +// +// Each stage carries exactly the data it needs and no more. Transitions +// always move forward through the pipeline — no implicit shared state. 
+// --------------------------------------------------------------------------- + +export type WorkbenchState = + | { + stage: "vendor-select"; + } + | { + stage: "upload"; + vendor: string; + migrationId: string; + } + | { + stage: "translating"; + vendor: string; + migrationId: string; + stats: MigrationStats | null; + } + | { + stage: "review"; + vendor: string; + migrationId: string; + translations: TranslatedRule[]; + resources: MigrationResource[]; + } + | { + stage: "fix-rule-drawer"; + vendor: string; + migrationId: string; + translations: TranslatedRule[]; + resources: MigrationResource[]; + selectedRule: TranslatedRule; + } + | { + stage: "fix-resources-drawer"; + vendor: string; + migrationId: string; + translations: TranslatedRule[]; + resources: MigrationResource[]; + } + | { + stage: "install"; + vendor: string; + migrationId: string; + translations: TranslatedRule[]; + resources: MigrationResource[]; + } + | { + stage: "done"; + installed: number; + failed: number; + }; + +// --------------------------------------------------------------------------- +// Vendor catalogue — re-enabling a vendor is a one-line change here +// --------------------------------------------------------------------------- + +const SUPPORTED_VENDORS: readonly string[] = ["splunk"]; + +const VENDOR_CATALOGUE = [ + { id: "splunk", label: "Splunk" }, + { id: "qradar", label: "IBM QRadar" }, + { id: "sentinel-one", label: "Sentinel One" }, +] as const; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +async function callTool( + app: McpApp, + name: string, + args: Record +): Promise { + try { + const result = await app.callServerTool({ name, arguments: args }); + const text = extractCallResult(result); + if (!text) return null; + return JSON.parse(text) as T; + } catch (e) { + console.error(`[migration] ${name} failed:`, e); + return null; + } +} + +// 
--------------------------------------------------------------------------- +// App +// --------------------------------------------------------------------------- + +export function App() { + const [state, setState] = useState({ stage: "vendor-select" }); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(null); + + // For the translating stage: poll stats until translation completes + const pollTimerRef = useRef | null>(null); + + const clearPoll = useCallback(() => { + if (pollTimerRef.current !== null) { + clearTimeout(pollTimerRef.current); + pollTimerRef.current = null; + } + }, []); + + useEffect(() => () => clearPoll(), [clearPoll]); + + const { connected, getApp } = useMcpApp({ + name: "migration", + version: "1.0.0", + onConnect: (_app, _gotResult) => { + // No initial data load needed — the workbench starts at vendor-select. + }, + }); + + const fullscreen = useFullscreen(getApp); + + // ── Stage transitions ────────────────────────────────────────────────────── + + const selectVendor = useCallback( + async (vendor: string) => { + const app = getApp(); + if (!app) return; + setLoading(true); + setError(null); + try { + const res = await callTool<{ migration_id: string }>(app, "create-migration", { + name: `Migration ${new Date().toISOString().slice(0, 10)}`, + }); + if (!res?.migration_id) throw new Error("Failed to create migration"); + setState({ stage: "upload", vendor, migrationId: res.migration_id }); + } catch (e) { + setError(e instanceof Error ? 
e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp]
+  );
+
+  /**
+   * Polls get-migration every 3 s while the workbench is in the "translating"
+   * stage, then loads translations and resources and moves to "review" once
+   * the migration status is "finished" or "error".
+   *
+   * Declared before uploadRules so that callback can list it in its deps.
+   */
+  const schedulePoll = useCallback(
+    (app: McpApp, vendor: string, migrationId: string) => {
+      clearPoll();
+      const timerId: ReturnType<typeof setTimeout> = setTimeout(async () => {
+        // clearPoll() (reset / unmount) nulls pollTimerRef and a newer
+        // schedulePoll() overwrites it, so a mismatch after an await means this
+        // poll was cancelled mid-flight and must neither reschedule nor set state.
+        const isCancelled = () => pollTimerRef.current !== timerId;
+
+        // Use get-migration (not get-stats) so we get the strongly-typed status
+        // field ("ready" | "running" | "finished" | "error") alongside the rule counts.
+        const migration = await callTool<MigrationStats>(app, "get-migration", { migrationId });
+        if (isCancelled()) return;
+        setState((prev) => {
+          if (prev.stage !== "translating") return prev;
+          return { ...prev, stats: migration ?? prev.stats };
+        });
+
+        // Translation is complete when Kibana sets status to "finished" or "error".
+        const translationDone =
+          migration !== null && (migration.status === "finished" || migration.status === "error");
+        if (!translationDone) {
+          // A failed poll (null) is treated as transient and retried.
+          schedulePoll(app, vendor, migrationId);
+          return;
+        }
+
+        const translationsRes = await callTool<{ data: TranslatedRule[] }>(
+          app, "get-translated-rules", { migrationId, vendor, perPage: 500 }
+        );
+        const resources =
+          (await callTool<MigrationResource[]>(app, "get-resources", { migrationId, vendor })) ?? [];
+        if (isCancelled()) return;
+        pollTimerRef.current = null;
+        // Only leave "translating"; never clobber a stage the user moved to meanwhile.
+        setState((prev) =>
+          prev.stage === "translating" && prev.migrationId === migrationId
+            ? {
+                stage: "review",
+                vendor,
+                migrationId,
+                translations: translationsRes?.data ?? [],
+                resources,
+              }
+            : prev
+        );
+      }, 3000);
+      pollTimerRef.current = timerId;
+    },
+    [clearPoll]
+  );
+
+  /** Uploads the pasted/dropped rules, starts translation and begins polling. */
+  const uploadRules = useCallback(
+    async (rulesJson: string) => {
+      const app = getApp();
+      if (!app || state.stage !== "upload") return;
+      const { vendor, migrationId } = state;
+      setLoading(true);
+      setError(null);
+      try {
+        const rules: unknown = JSON.parse(rulesJson);
+        if (!Array.isArray(rules)) throw new Error("Rules JSON must be an array of rule objects");
+        await callTool(app, "upload-rules", { migrationId, vendor, rules });
+        // start-translation answers { status: "started" } on success and an
+        // { error: "vendorNotSupported" } payload otherwise; callTool yields null on failure.
+        const started = await callTool<{ status?: string }>(app, "start-translation", {
+          migrationId,
+          vendor,
+        });
+        if (started?.status !== "started") throw new Error("Failed to start translation");
+        const stats = await callTool<MigrationStats>(app, "get-stats", { migrationId });
+        setState({ stage: "translating", vendor, migrationId, stats: stats ?? null });
+        schedulePoll(app, vendor, migrationId);
+      } catch (e) {
+        setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp, state, schedulePoll]
+  );
+
+  /** Opens the fix-rule drawer for `rule`; only valid from the review stage. */
+  const openRuleDrawer = useCallback((rule: TranslatedRule) => {
+    setState((prev) => {
+      if (prev.stage !== "review") return prev;
+      return { ...prev, stage: "fix-rule-drawer", selectedRule: rule };
+    });
+  }, []);
+
+  /** Saves the drawer's edited rule and returns to review with the updated row. */
+  const saveRuleFix = useCallback(
+    async (elasticRuleJson: string, translationResult: "full" | "partial" | "untranslatable") => {
+      const app = getApp();
+      if (!app || state.stage !== "fix-rule-drawer") return;
+      const { vendor, migrationId, translations, resources, selectedRule } = state;
+      setLoading(true);
+      setError(null);
+      try {
+        const updated = await callTool<TranslatedRule>(
+          app,
+          "update-translated-rule",
+          { migrationId, ruleId: selectedRule.id, vendor, elasticRule: elasticRuleJson, translationResult }
+        );
+        setState({
+          stage: "review",
+          vendor,
+          migrationId,
+          resources,
+          // Keep the existing row when the tool returned nothing.
+          translations: translations.map((t) =>
+            t.id === selectedRule.id ? (updated ?? t) : t
+          ),
+        });
+      } catch (e) {
+        setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp, state]
+  );
+
+  /** Saves an inline edit made directly in the review table (no drawer). */
+  const saveRuleInline = useCallback(
+    async (
+      ruleId: string,
+      elasticRuleJson: string,
+      translationResult: "full" | "partial" | "untranslatable"
+    ) => {
+      const app = getApp();
+      if (!app || state.stage !== "review") return;
+      const { vendor, migrationId, translations, resources } = state;
+      setLoading(true);
+      setError(null);
+      try {
+        const updated = await callTool<TranslatedRule>(app, "update-translated-rule", {
+          migrationId,
+          ruleId,
+          vendor,
+          elasticRule: elasticRuleJson,
+          translationResult,
+        });
+        setState({
+          stage: "review",
+          vendor,
+          migrationId,
+          resources,
+          translations: translations.map((t) => (t.id === ruleId ? (updated ?? t) : t)),
+        });
+      } catch (e) {
+        setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp, state]
+  );
+
+  /** Opens the resources drawer; only valid from the review stage. */
+  const openResourcesDrawer = useCallback(() => {
+    setState((prev) => {
+      if (prev.stage !== "review") return prev;
+      return { ...prev, stage: "fix-resources-drawer" };
+    });
+  }, []);
+
+  /** Upserts one macro/lookup, then reloads the resource list into the drawer. */
+  const saveResources = useCallback(
+    async (resource: MigrationResource) => {
+      const app = getApp();
+      if (!app || state.stage !== "fix-resources-drawer") return;
+      const { vendor, migrationId, translations } = state;
+      setLoading(true);
+      setError(null);
+      try {
+        await callTool(app, "upsert-resource", { migrationId, vendor, ...resource });
+        const resources =
+          (await callTool<MigrationResource[]>(app, "get-resources", { migrationId, vendor })) ?? [];
+        setState({ stage: "fix-resources-drawer", vendor, migrationId, translations, resources });
+      } catch (e) {
+        setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setLoading(false);
+      }
+    },
+    [getApp, state]
+  );
+
+  /** Returns from either drawer, or from the install confirmation, to review. */
+  const closeDrawer = useCallback(() => {
+    setState((prev) => {
+      if (
+        prev.stage === "fix-rule-drawer" ||
+        prev.stage === "fix-resources-drawer" ||
+        prev.stage === "install"
+      ) {
+        // Rebuild the review state field by field so drawer-only data
+        // (selectedRule) is not carried into it.
+        const { vendor, migrationId, translations, resources } = prev;
+        return { stage: "review", vendor, migrationId, translations, resources };
+      }
+      return prev;
+    });
+  }, []);
+
+  /** Moves from review to the install confirmation stage. */
+  const startInstall = useCallback(() => {
+    setState((prev) => {
+      if (prev.stage !== "review") return prev;
+      return { stage: "install", vendor: prev.vendor, migrationId: prev.migrationId, translations: prev.translations, resources: prev.resources };
+    });
+  }, []);
+
+  const confirmInstall = useCallback(async () => {
+    const app = getApp();
+    if (!app || state.stage !== "install") return;
+ const { vendor, migrationId } = state; + setLoading(true); + setError(null); + try { + const result = await callTool(app, "install-rules", { migrationId, vendor }); + setState({ stage: "done", installed: result?.installed ?? 0, failed: result?.failed ?? 0 }); + } catch (e) { + setError(e instanceof Error ? e.message : String(e)); + } finally { + setLoading(false); + } + }, [getApp, state]); + + const reset = useCallback(() => { + clearPoll(); + setState({ stage: "vendor-select" }); + setError(null); + }, [clearPoll]); + + // ── Render ───────────────────────────────────────────────────────────────── + + // AppHeader expects { isFullscreen, onToggle } — useFullscreen returns { isFullscreen, toggle } + const fullscreenProp = { isFullscreen: fullscreen.isFullscreen, onToggle: fullscreen.toggle }; + + if (!connected) { + return ( + + + Connecting to Elastic Security… + + ); + } + + return ( + + + ) : undefined + } + /> + + {error && ( +
+ {error} + +
+ )} + + {loading && Working…} + + {!loading && renderStage(state, { + selectVendor, + uploadRules, + openRuleDrawer, + saveRuleFix, + saveRuleInline, + openResourcesDrawer, + saveResources, + closeDrawer, + startInstall, + confirmInstall, + reset, + })} +
+ ); +} + +// --------------------------------------------------------------------------- +// Per-stage renderers (extracted to keep App() readable) +// --------------------------------------------------------------------------- + +interface StageHandlers { + selectVendor: (vendor: string) => void; + uploadRules: (json: string) => void; + openRuleDrawer: (rule: TranslatedRule) => void; + saveRuleFix: (json: string, result: "full" | "partial" | "untranslatable") => void; + saveRuleInline: (id: string, json: string, result: "full" | "partial" | "untranslatable") => void; + openResourcesDrawer: () => void; + saveResources: (resource: MigrationResource) => void; + closeDrawer: () => void; + startInstall: () => void; + confirmInstall: () => void; + reset: () => void; +} + +function renderStage(state: WorkbenchState, h: StageHandlers): React.ReactNode { + switch (state.stage) { + case "vendor-select": + return ; + + case "upload": + return ; + + case "translating": + return ; + + case "review": + return ( + + ); + + case "fix-rule-drawer": + return ( + <> + + + + ); + + case "fix-resources-drawer": + return ( + <> + + + + ); + + case "install": + return ( + t.translation_result !== "untranslatable").length} + onConfirm={h.confirmInstall} + onBack={h.closeDrawer} + /> + ); + + case "done": + return ; + } +} + +// --------------------------------------------------------------------------- +// Stage components +// --------------------------------------------------------------------------- + +function VendorSelect({ onSelect }: { onSelect: (vendor: string) => void }) { + return ( +
+

Select your source SIEM

+

+ Choose the platform you are migrating detection rules from. +

+
+ {VENDOR_CATALOGUE.map(({ id, label }) => { + // ≤5-LOC client-side gate: only Splunk is production-ready. + // Add a vendor to SUPPORTED_VENDORS to re-enable it. + const active = SUPPORTED_VENDORS.includes(id); + return ( + + ); + })} +
+
+ ); +} + +function Upload({ vendor, onUpload }: { vendor: string; onUpload: (json: string) => void }) { + const [text, setText] = useState(""); + const [dragOver, setDragOver] = useState(false); + const fileInputRef = React.useRef(null); + + const readFile = (file: File) => { + const reader = new FileReader(); + reader.onload = (e) => setText((e.target?.result as string | null) ?? ""); + reader.readAsText(file); + }; + + const handleDrop = (e: React.DragEvent) => { + e.preventDefault(); + setDragOver(false); + const file = e.dataTransfer.files[0]; + if (file) readFile(file); + }; + + return ( +
+

Upload {vendor} rules

+

+ Drop a JSON export file, use the file picker, or paste the rules array directly. +

+ + {/* Hidden file input wired to the drop zone button */} + { + const file = e.target.files?.[0]; + if (file) readFile(file); + e.target.value = ""; + }} + /> + +
{ e.preventDefault(); setDragOver(true); }} + onDragLeave={() => setDragOver(false)} + onDrop={handleDrop} + > + +

or drop a .json file here, or paste below

+