From 5667c9ceab27233e115d8b3c381b7cace7c53eb0 Mon Sep 17 00:00:00 2001 From: Christopher Date: Thu, 26 Mar 2026 07:40:15 +0000 Subject: [PATCH 1/5] fix(core): extract pi-cli tool calls from streaming events for skill-trigger Pi CLI emits tool_execution_start/end events in JSONL output, but the provider only extracted tool calls from message content arrays. This caused the skill-trigger evaluator to miss pi's skill file reads. Now extractMessages() also scans for tool_execution_start/end events and injects reconstructed tool calls into assistant messages. Also handles tool_call (snake_case) content type variant. Closes #780 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../core/src/evaluation/providers/pi-cli.ts | 127 ++++++++-- .../providers/pi-cli-tool-extraction.test.ts | 231 ++++++++++++++++++ 2 files changed, 344 insertions(+), 14 deletions(-) create mode 100644 packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts diff --git a/packages/core/src/evaluation/providers/pi-cli.ts b/packages/core/src/evaluation/providers/pi-cli.ts index 7c0dc25e..9aa0191a 100644 --- a/packages/core/src/evaluation/providers/pi-cli.ts +++ b/packages/core/src/evaluation/providers/pi-cli.ts @@ -539,6 +539,10 @@ function summarizePiEvent(event: unknown): string | undefined { } return `message_update: ${eventType}`; } + case 'tool_execution_start': + return `tool_start: ${record.toolName}`; + case 'tool_execution_end': + return `tool_end: ${record.toolName}`; default: return type; } @@ -580,29 +584,119 @@ function parsePiJsonl(output: string): unknown[] { } function extractMessages(events: unknown[]): readonly Message[] { + let messages: Message[] | undefined; + for (let i = events.length - 1; i >= 0; i--) { const event = events[i]; if (!event || typeof event !== 'object') continue; const record = event as Record; if (record.type !== 'agent_end') continue; - const messages = record.messages; - if (!Array.isArray(messages)) continue; + const msgs = record.messages; + if (!Array.isArray(msgs)) continue; - return messages.map(convertPiMessage).filter((m): m is Message => m !== undefined); + messages = msgs.map(convertPiMessage).filter((m): m is Message => m !== undefined); + break; } - const output: Message[] = []; + if (!messages) { + messages = []; + for (const event of events) { + if (!event || typeof event !== 'object') continue; + const record = event as Record; + if (record.type === 'turn_end') { + const converted = convertPiMessage(record.message); + if (converted) messages.push(converted); + } + } + } + + // Pi CLI may emit tool_execution_start/tool_execution_end events whose tool + // calls are absent from the final agent_end messages. Reconstruct them and + // inject into the last assistant message so evaluators (e.g. skill-trigger) + // can detect them. + const eventToolCalls = extractToolCallsFromEvents(events); + if (eventToolCalls.length > 0) { + injectEventToolCalls(messages, eventToolCalls); + } + + return messages; +} + +/** + * Scan JSONL events for tool_execution_start / tool_execution_end pairs and + * reconstruct ToolCall objects from them. + */ +function extractToolCallsFromEvents(events: unknown[]): ToolCall[] { + const starts = new Map(); + const results = new Map(); + for (const event of events) { if (!event || typeof event !== 'object') continue; - const record = event as Record; - if (record.type === 'turn_end') { - const converted = convertPiMessage(record.message); - if (converted) output.push(converted); + const r = event as Record; + const type = r.type; + if (type === 'tool_execution_start' && typeof r.toolName === 'string') { + const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined; + starts.set(id ?? `anon-${starts.size}`, { tool: r.toolName, input: r.args }); + } else if (type === 'tool_execution_end') { + const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined; + if (id) results.set(id, r.result); + } + } + + const toolCalls: ToolCall[] = []; + for (const [id, { tool, input }] of starts) { + toolCalls.push({ + tool, + input: input as Record | undefined, + id: id.startsWith('anon-') ? undefined : id, + output: results.get(id), + }); + } + return toolCalls; +} + +/** + * Merge event-sourced tool calls into messages. For each tool call, if it + * already exists (by id) in some message, skip it. Otherwise, append it to + * the last assistant message (creating one if needed). + */ +function injectEventToolCalls(messages: Message[], eventToolCalls: ToolCall[]): void { + const existingIds = new Set(); + const existingTools = new Set(); + for (const msg of messages) { + if (!msg.toolCalls) continue; + for (const tc of msg.toolCalls) { + if (tc.id) existingIds.add(tc.id); + // Track tool+input combos to avoid duplicates when there's no id + existingTools.add(`${tc.tool}:${JSON.stringify(tc.input)}`); + } + } + + const missing = eventToolCalls.filter((tc) => { + if (tc.id && existingIds.has(tc.id)) return false; + if (existingTools.has(`${tc.tool}:${JSON.stringify(tc.input)}`)) return false; + return true; + }); + + if (missing.length === 0) return; + + // Find the last assistant message to attach tool calls to + let target: Message | undefined; + for (let i = messages.length - 1; i >= 0; i--) { + if (messages[i].role === 'assistant') { + target = messages[i]; + break; } } - return output; + if (target) { + const existing = target.toolCalls ?? []; + (target as { toolCalls?: readonly ToolCall[] }).toolCalls = [...existing, ...missing]; + } else { + // No assistant message — create a synthetic one + messages.push({ role: 'assistant', content: '', toolCalls: missing }); + } } function extractTokenUsage(events: unknown[]): ProviderTokenUsage | undefined { @@ -720,15 +814,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] { input: p.input, id: typeof p.id === 'string' ? p.id : undefined, }); - } - if (p.type === 'toolCall' && typeof p.name === 'string') { + } else if ((p.type === 'toolCall' || p.type === 'tool_call') && typeof p.name === 'string') { toolCalls.push({ tool: p.name, - input: p.arguments, + input: p.arguments ?? p.input, id: typeof p.id === 'string' ? p.id : undefined, }); - } - if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') { + } else if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') { const existing = toolCalls.find((tc) => tc.id === p.tool_use_id); if (existing) { const idx = toolCalls.indexOf(existing); @@ -830,3 +922,10 @@ async function defaultPiRunner(options: PiRunOptions): Promise { }); }); } + +/** @internal Exported for testing only. */ +export const _internal = { + extractMessages, + extractToolCallsFromEvents, + parsePiJsonl, +}; diff --git a/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts b/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts new file mode 100644 index 00000000..84e5d0d6 --- /dev/null +++ b/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts @@ -0,0 +1,231 @@ +import { describe, expect, it } from 'vitest'; +import { _internal } from '../../../src/evaluation/providers/pi-cli.js'; + +const { extractMessages, extractToolCallsFromEvents } = _internal; + +describe('pi-cli tool call extraction from events', () => { + it('should extract tool calls from tool_execution_start/end events', () => { + const events = [ + { type: 'agent_start' }, + { type: 'turn_start' }, + { type: 'message_start', message: { role: 'assistant' } }, + { + type: 'tool_execution_start', + toolName: 'read', + toolCallId: 'tc-1', + args: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + }, + { + type: 'tool_execution_end', + toolName: 'read', + toolCallId: 'tc-1', + result: 'skill content here', + }, + { type: 'message_end' }, + { + type: 'turn_end', + message: { role: 'assistant', content: [{ type: 'text', text: 'Done' }] }, + }, + { + type: 'agent_end', + messages: [{ role: 'assistant', content: [{ type: 'text', text: 'Done' }] }], + }, + ]; + + const toolCalls = extractToolCallsFromEvents(events); + + expect(toolCalls).toHaveLength(1); + expect(toolCalls[0].tool).toBe('read'); + expect(toolCalls[0].id).toBe('tc-1'); + expect(toolCalls[0].input).toEqual({ path: '.agents/skills/csv-analyzer/SKILL.md' }); + expect(toolCalls[0].output).toBe('skill content here'); + }); + + it('should inject event tool calls into messages when content has no tool calls', () => { + const events = [ + { + type: 'tool_execution_start', + toolName: 'read', + toolCallId: 'tc-1', + args: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + }, + { + type: 'tool_execution_end', + toolCallId: 'tc-1', + result: 'skill file contents', + }, + { + type: 'agent_end', + messages: [{ role: 'assistant', content: [{ type: 'text', text: 'I will review this.' }] }], + }, + ]; + + const messages = extractMessages(events); + + expect(messages).toHaveLength(1); + expect(messages[0].role).toBe('assistant'); + expect(messages[0].toolCalls).toBeDefined(); + expect(messages[0].toolCalls).toHaveLength(1); + expect(messages[0].toolCalls?.[0].tool).toBe('read'); + expect(messages[0].toolCalls?.[0].input).toEqual({ + path: '.agents/skills/csv-analyzer/SKILL.md', + }); + }); + + it('should not duplicate tool calls already present in messages', () => { + const events = [ + { + type: 'tool_execution_start', + toolName: 'read', + toolCallId: 'tc-1', + args: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + }, + { + type: 'tool_execution_end', + toolCallId: 'tc-1', + result: 'content', + }, + { + type: 'agent_end', + messages: [ + { + role: 'assistant', + content: [ + { type: 'text', text: 'Review' }, + { + type: 'tool_use', + name: 'read', + id: 'tc-1', + input: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + }, + ], + }, + ], + }, + ]; + + const messages = extractMessages(events); + + expect(messages).toHaveLength(1); + expect(messages[0].toolCalls).toHaveLength(1); + }); + + it('should handle multiple tool execution events', () => { + const events = [ + { + type: 'tool_execution_start', + toolName: 'read', + toolCallId: 'tc-1', + args: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + }, + { + type: 'tool_execution_end', + toolCallId: 'tc-1', + result: 'skill content', + }, + { + type: 'tool_execution_start', + toolName: 'bash', + toolCallId: 'tc-2', + args: { command: 'ls' }, + }, + { + type: 'tool_execution_end', + toolCallId: 'tc-2', + result: 'file list', + }, + { + type: 'agent_end', + messages: [{ role: 'assistant', content: [{ type: 'text', text: 'Done' }] }], + }, + ]; + + const messages = extractMessages(events); + + expect(messages[0].toolCalls).toHaveLength(2); + expect(messages[0].toolCalls?.[0].tool).toBe('read'); + expect(messages[0].toolCalls?.[1].tool).toBe('bash'); + }); + + it('should create synthetic assistant message when no assistant message exists', () => { + const events = [ + { + type: 'tool_execution_start', + toolName: 'read', + toolCallId: 'tc-1', + args: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + }, + { + type: 'tool_execution_end', + toolCallId: 'tc-1', + result: 'content', + }, + { + type: 'agent_end', + messages: [{ role: 'user', content: [{ type: 'text', text: 'Review this' }] }], + }, + ]; + + const messages = extractMessages(events); + + expect(messages).toHaveLength(2); + expect(messages[1].role).toBe('assistant'); + expect(messages[1].toolCalls).toHaveLength(1); + expect(messages[1].toolCalls?.[0].tool).toBe('read'); + }); + + it('should fall back to turn_end events and still inject tool calls', () => { + const events = [ + { + type: 'tool_execution_start', + toolName: 'read', + toolCallId: 'tc-1', + args: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + }, + { + type: 'tool_execution_end', + toolCallId: 'tc-1', + result: 'content', + }, + { + type: 'turn_end', + message: { role: 'assistant', content: [{ type: 'text', text: 'Response' }] }, + }, + ]; + + const messages = extractMessages(events); + + expect(messages).toHaveLength(1); + expect(messages[0].toolCalls).toHaveLength(1); + expect(messages[0].toolCalls?.[0].tool).toBe('read'); + }); + + it('should handle tool_call type in message content', () => { + const events = [ + { + type: 'agent_end', + messages: [ + { + role: 'assistant', + content: [ + { + type: 'tool_call', + name: 'read', + id: 'tc-1', + arguments: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + }, + ], + }, + ], + }, + ]; + + const messages = extractMessages(events); + + expect(messages[0].toolCalls).toHaveLength(1); + expect(messages[0].toolCalls?.[0].tool).toBe('read'); + expect(messages[0].toolCalls?.[0].input).toEqual({ + path: '.agents/skills/csv-analyzer/SKILL.md', + }); + }); +}); From 5e3ab55436cc805d9c41ee71eb439d11b53d8ad5 Mon Sep 17 00:00:00 2001 From: Christopher Date: Thu, 26 Mar 2026 08:02:13 +0000 Subject: [PATCH 2/5] fix(core): avoid mutating readonly Message in injectEventToolCalls Replace target message with a new object instead of casting to bypass readonly constraint, per code review feedback. Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/core/src/evaluation/providers/pi-cli.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/core/src/evaluation/providers/pi-cli.ts b/packages/core/src/evaluation/providers/pi-cli.ts index 9aa0191a..e2abcc73 100644 --- a/packages/core/src/evaluation/providers/pi-cli.ts +++ b/packages/core/src/evaluation/providers/pi-cli.ts @@ -681,18 +681,18 @@ function injectEventToolCalls(messages: Message[], eventToolCalls: ToolCall[]): if (missing.length === 0) return; - // Find the last assistant message to attach tool calls to - let target: Message | undefined; + // Find the last assistant message and replace it with an enriched copy + let targetIdx = -1; for (let i = messages.length - 1; i >= 0; i--) { if (messages[i].role === 'assistant') { - target = messages[i]; + targetIdx = i; break; } } - if (target) { - const existing = target.toolCalls ?? []; - (target as { toolCalls?: readonly ToolCall[] }).toolCalls = [...existing, ...missing]; + if (targetIdx >= 0) { + const target = messages[targetIdx]; + messages[targetIdx] = { ...target, toolCalls: [...(target.toolCalls ?? []), ...missing] }; } else { // No assistant message — create a synthetic one messages.push({ role: 'assistant', content: '', toolCalls: missing }); From 44e5415cb691fce9d78c72f74a916e0530906a1b Mon Sep 17 00:00:00 2001 From: Christopher Date: Thu, 26 Mar 2026 08:30:24 +0000 Subject: [PATCH 3/5] fix(evals): restore skill-trigger assertion for agent-plugin-review eval Re-adds the skill-trigger assertion that was removed as a workaround for #780. Now that pi-cli tool call extraction is fixed, the evaluator can detect when pi loads the agent-plugin-review skill. Co-Authored-By: Claude Opus 4.6 (1M context) --- evals/agentic-engineering/agent-plugin-review.eval.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 5e8b417d..35311681 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -20,6 +20,8 @@ tests: Review the deploy-auto plugin in this repo for completeness. Check that every skill has a corresponding eval file. assertions: + - type: skill-trigger + skill: agent-plugin-review - type: contains value: deploy-rollback - type: rubrics From 6f173247087469f695fa42e5a117cf019f5d746a Mon Sep 17 00:00:00 2001 From: Christopher Date: Thu, 26 Mar 2026 08:43:44 +0000 Subject: [PATCH 4/5] fix(evals): configure pi-cli target with model and remove workers: 1 Pi-cli target needs subprovider/model/api_key to produce meaningful output. Without them, pi uses its default which returns empty responses. Also removes workers: 1 from agent-plugin-review eval since all test cases are read-only reviews that can safely run in parallel. Co-Authored-By: Claude Opus 4.6 (1M context) --- .agentv/targets.yaml | 2 ++ evals/agentic-engineering/agent-plugin-review.eval.yaml | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 14e4b9ac..de3f4cf0 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -13,6 +13,8 @@ targets: - name: pi-cli provider: pi-cli subprovider: openrouter + model: openai/gpt-5.1-codex + api_key: ${{ OPENROUTER_API_KEY }} grader_target: gemini-flash - name: pi-coding-agent diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 35311681..88091dcd 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -3,7 +3,6 @@ description: Evaluates that the agent-plugin-review skill is triggered and catch execution: targets: - pi-cli - workers: 1 workspace: template: ./workspace-template From a9f27f5e62a54ccd7b48f3658c398767cf66ee9b Mon Sep 17 00:00:00 2001 From: Christopher Date: Thu, 26 Mar 2026 08:48:54 +0000 Subject: [PATCH 5/5] style: fix formatting in package.json files Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/package.json | 5 +---- packages/core/package.json | 5 +---- packages/eval/package.json | 5 +---- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/apps/cli/package.json b/apps/cli/package.json index e5e8edac..63c72814 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -14,10 +14,7 @@ "bin": { "agentv": "./dist/cli.js" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "scripts": { "dev": "bun src/cli.ts", "build": "tsup && bun run copy-readme", diff --git a/packages/core/package.json b/packages/core/package.json index ef541141..87e08d2f 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -38,10 +38,7 @@ "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts", "generate:schema": "bun scripts/generate-eval-schema.ts" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", diff --git a/packages/eval/package.json b/packages/eval/package.json index 5a33d4d6..725a4734 100644 --- a/packages/eval/package.json +++ b/packages/eval/package.json @@ -30,10 +30,7 @@ "fix": "biome check --write .", "test": "bun test" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "dependencies": { "zod": "^3.23.8" }