From a335e6ff2a99859fc3364907be83c5f7a5637cf9 Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Mon, 30 Mar 2026 12:20:44 +0200 Subject: [PATCH] fix(evals): capture tool calls in eval runner and improve canhelp evals The eval runner now uses stream-json to capture tool calls during execution, giving the judge visibility into which scripts were actually run. Also parses allowed-tools from skill frontmatter so skills that require Bash scripts (like canhelp) can execute them during evals. Canhelp eval improvements: - Use obscure canisters (Neutrinite) instead of well-known ones (ICP Ledger, NNS Governance) to prevent Claude answering from training data instead of running the scripts - Use a canister with wasm but no candid:service metadata (OpenChat SNS canister r2pvs-tyaaa-aaaar-ajcwq-cai) for the missing metadata eval instead of one with no wasm installed - Fix local canister eval to match skill behavior (mainnet-only guidance) instead of expecting a fetch attempt - Remove redundant Large interface summarization eval that duplicated Lookup by name and Output format evals --- evaluations/canhelp.json | 32 ++++------- scripts/evaluate-skills.js | 108 ++++++++++++++++++++++++++++++------- 2 files changed, 100 insertions(+), 40 deletions(-) diff --git a/evaluations/canhelp.json b/evaluations/canhelp.json index 613c535..ac87dfd 100644 --- a/evaluations/canhelp.json +++ b/evaluations/canhelp.json @@ -5,10 +5,10 @@ "output_evals": [ { "name": "Lookup by canister ID", - "prompt": "What can canister ryjl3-tyaaa-aaaaa-aaaba-cai do?", + "prompt": "What can canister f54if-eqaaa-aaaaq-aacea-cai do?", "expected_behaviors": [ "Runs resolve-canister-id.sh with the provided principal", - "Runs fetch-candid.sh with the resolved canister ID", + "Runs fetch-candid.sh with the canister ID", "Reads the downloaded .did file", "Groups methods into Query and Update sections", "Sorts methods alphabetically within each group", @@ -17,7 +17,7 @@ }, { "name": "Lookup by human-readable name", - "prompt": "Show me the interface for the NNS governance canister", + "prompt": "Show me the interface for the Neutrinite Governance canister", "expected_behaviors": [ "Runs resolve-canister-id.sh with the name (not a hardcoded canister ID)", "Displays the resolved canister ID before proceeding", @@ -35,21 +35,9 @@ "Does NOT arbitrarily pick one and proceed without asking" ] }, - { - "name": "Large interface summarization", - "prompt": "Show me what the NNS governance canister can do", - "expected_behaviors": [ - "Resolves 'NNS governance' to a canister ID", - "Fetches and reads the Candid interface", - "Groups methods by query vs update", - "Sorts methods alphabetically within each group", - "Shows method signatures with argument types and return types", - "Lists key types like Neuron, ProposalInfo, ManageNeuronRequest" - ] - }, { "name": "Output format structure", - "prompt": "Tell me about canister rrkah-fqaaa-aaaaa-aaaaq-cai", + "prompt": "Tell me about canister eqsml-lyaaa-aaaaq-aacdq-cai", "expected_behaviors": [ "Starts the summary with 'Canister '", "Has a Query methods section", @@ -60,11 +48,11 @@ ] }, { - "name": "Empty Candid interface", - "prompt": "What methods does canister uf6dk-hyaaa-aaaaq-qaaaq-cai expose?", + "name": "Missing Candid metadata", + "prompt": "What methods does canister r2pvs-tyaaa-aaaar-ajcwq-cai expose?", "expected_behaviors": [ "Runs fetch-candid.sh with the canister ID", - "If the Candid interface is empty, explains that the canister is not exposing its Candid interface in the wasm metadata", + "Explains that the canister does not expose its Candid interface in the wasm metadata", "Does NOT fabricate or hallucinate methods" ] }, @@ -72,8 +60,8 @@ "name": "Adversarial: local canister", "prompt": "Show me the interface for my local canister bkyz2-fmaaa-aaaaa-qaaaq-cai", "expected_behaviors": [ - "Attempts to resolve and fetch the canister", - "If the fetch fails, suggests verifying the canister ID and that icp is installed", + "Recognizes that this is a local canister or that the skill is mainnet-only", + "Suggests reading the local .did file from the project directory instead", "Does NOT hallucinate a Candid interface" ] }, @@ -113,4 +101,4 @@ "How do I test my canister locally?" ] } -} \ No newline at end of file +} diff --git a/scripts/evaluate-skills.js b/scripts/evaluate-skills.js index ce32086..0c2d0fd 100644 --- a/scripts/evaluate-skills.js +++ b/scripts/evaluate-skills.js @@ -26,7 +26,7 @@ import { readFileSync, writeFileSync, mkdirSync } from "fs"; import { execFileSync } from "child_process"; import { join } from "path"; -import { readAllSkills } from "./lib/parse-skill.js"; +import { readAllSkills, parseFrontmatter } from "./lib/parse-skill.js"; const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, ""); @@ -51,6 +51,8 @@ const listEvals = args.includes("--list"); // --------------------------------------------------------------------------- const skillDir = join(ROOT, "skills", skillName); const skillContent = readFileSync(join(skillDir, "SKILL.md"), "utf-8"); +const skillMeta = parseFrontmatter(skillContent); +const skillAllowedTools = skillMeta?.["allowed-tools"] || "Read"; const evalsFile = join(ROOT, "evaluations", `${skillName}.json`); const evals = JSON.parse(readFileSync(evalsFile, "utf-8")); @@ -76,56 +78,113 @@ if (evalFilter) { // --------------------------------------------------------------------------- /** - * Run a prompt through claude CLI and return the output text. + * Run a prompt through claude CLI and return the output text (and optionally tool calls). * @param {string} prompt - The user prompt * @param {string|null} systemPrompt - Optional system prompt (skill content) * @param {object} [options] - Optional settings * @param {string} [options.cwd] - Working directory (defaults to /tmp) - * @param {boolean} [options.allowRead] - Allow the Read tool so the agent can fetch reference files + * @param {string} [options.allowedTools] - Comma-separated tool patterns to allow (e.g. from allowed-tools frontmatter) + * @param {boolean} [options.captureToolCalls] - If true, use stream-json to capture tool calls + * @returns {string|{text: string, toolCalls: string[]}} - Plain text or object with tool calls */ function runClaude(prompt, systemPrompt, options = {}) { // Use execFileSync with input option to avoid shell expansion issues. // Shell expansion of $VAR and $(...) in skill content (e.g., $ICP_WASM_OUTPUT_PATH) // would corrupt the system prompt when passed via "$(cat ...)". + const useStreamJson = options.captureToolCalls; const args = ["-p", "--model", "sonnet"]; + if (useStreamJson) { + args.push("--output-format", "stream-json", "--verbose"); + } if (systemPrompt) { args.push("--system-prompt", systemPrompt); } - if (options.allowRead) { - args.push("--allowedTools", "Read"); + if (options.allowedTools) { + args.push("--allowedTools", options.allowedTools); } const cwd = options.cwd || "/tmp"; + let raw; try { - return execFileSync("claude", args, { + raw = execFileSync("claude", args, { input: prompt, encoding: "utf-8", - maxBuffer: 1024 * 1024, + maxBuffer: 5 * 1024 * 1024, timeout: 120_000, cwd, }).trim(); } catch (e) { - return `[ERROR] ${e.message}`; + const errText = `[ERROR] ${e.message}`; + return useStreamJson ? { text: errText, toolCalls: [] } : errText; + } + + if (!useStreamJson) return raw; + + // Parse stream-json lines to extract tool calls and final result + const toolCalls = []; + let resultText = ""; + for (const line of raw.split("\n")) { + let msg; + try { msg = JSON.parse(line); } catch { continue; } + + if (msg.type === "assistant" && msg.message?.content) { + for (const block of msg.message.content) { + if (block.type === "tool_use") { + const input = block.input || {}; + const summary = block.name === "Bash" + ? `Bash: ${input.command || ""}` + : block.name === "Read" + ? `Read: ${input.file_path || ""}` + : `${block.name}: ${JSON.stringify(input).slice(0, 200)}`; + toolCalls.push(summary); + } + } + } + if (msg.type === "result") { + resultText = msg.result || ""; + } } + + return { text: resultText, toolCalls }; } -/** Ask claude to judge an output against expected behaviors. */ +/** + * Ask claude to judge an output against expected behaviors. + * @param {object} evalCase - The eval case with prompt and expected_behaviors + * @param {string|{text: string, toolCalls: string[]}} output - Plain text or structured output + * @param {string} label - Label for logging + */ function judge(evalCase, output, label) { const behaviors = evalCase.expected_behaviors .map((b, i) => `${i + 1}. ${b}`) .join("\n"); + // Build the output section, including tool calls if available + const isStructured = typeof output === "object" && output.toolCalls; + let outputSection; + if (isStructured && output.toolCalls.length > 0) { + const toolList = output.toolCalls.map((t, i) => `${i + 1}. ${t}`).join("\n"); + outputSection = ` +The assistant made the following tool calls during execution: +${toolList} + + + +${output.text} +`; + } else { + outputSection = ` +${isStructured ? output.text : output} +`; + } + const judgePrompt = `You are an evaluation judge. A coding assistant was given this task: ${evalCase.prompt} -The assistant produced this output: - - -${output} - +${outputSection} Score each expected behavior as PASS or FAIL. Be strict — the behavior must be clearly present, not just vaguely implied. Return ONLY a JSON array of objects with "behavior", "pass" (boolean), and "reason" (one sentence). @@ -231,14 +290,23 @@ if (!triggersOnly && outputCases.length > 0) { for (const evalCase of outputCases) { console.log(`━━━ ${evalCase.name} ━━━\n`); - // Run WITH skill — from the skill directory with Read access so the - // agent can fetch reference files on demand, matching real usage. + // Run WITH skill — from the skill directory with tools declared in the + // skill's allowed-tools frontmatter, matching real usage. Use stream-json + // to capture tool calls so the judge can verify script execution. console.log(" Running WITH skill..."); const withOutput = runClaude(evalCase.prompt, skillContent, { cwd: skillDir, - allowRead: true, + allowedTools: skillAllowedTools, + captureToolCalls: true, }); + if (withOutput.toolCalls?.length > 0) { + console.log(` Tool calls: ${withOutput.toolCalls.length}`); + for (const tc of withOutput.toolCalls) { + console.log(` → ${tc}`); + } + } + // Run WITHOUT skill (baseline) — no tools, no skill context let withoutOutput = null; if (!skipBaseline) { @@ -277,9 +345,13 @@ if (!triggersOnly && outputCases.length > 0) { } } + // Store text output (not the full structured object) in results + const withOutputText = typeof withOutput === "object" ? withOutput.text : withOutput; + const withToolCalls = typeof withOutput === "object" ? withOutput.toolCalls : []; + allResults.output_evals.push({ name: evalCase.name, - with_skill: { output: withOutput, judgment: withJudgment }, + with_skill: { output: withOutputText, tool_calls: withToolCalls, judgment: withJudgment }, without_skill: withoutOutput ? { output: withoutOutput, judgment: withoutJudgment } : null,