From 5667c9ceab27233e115d8b3c381b7cace7c53eb0 Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Thu, 26 Mar 2026 07:40:15 +0000
Subject: [PATCH 1/5] fix(core): extract pi-cli tool calls from streaming
 events for skill-trigger

Pi CLI emits tool_execution_start/end events in JSONL output, but the
provider only extracted tool calls from message content arrays. This
caused the skill-trigger evaluator to miss pi's skill file reads.

Now extractMessages() also scans for tool_execution_start/end events
and injects reconstructed tool calls into assistant messages. Also
handles tool_call (snake_case) content type variant.

Closes #780

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../core/src/evaluation/providers/pi-cli.ts   | 127 ++++++++--
 .../providers/pi-cli-tool-extraction.test.ts  | 231 ++++++++++++++++++
 2 files changed, 344 insertions(+), 14 deletions(-)
 create mode 100644 packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts

diff --git a/packages/core/src/evaluation/providers/pi-cli.ts b/packages/core/src/evaluation/providers/pi-cli.ts
index 7c0dc25e..9aa0191a 100644
--- a/packages/core/src/evaluation/providers/pi-cli.ts
+++ b/packages/core/src/evaluation/providers/pi-cli.ts
@@ -539,6 +539,10 @@ function summarizePiEvent(event: unknown): string | undefined {
       }
       return `message_update: ${eventType}`;
     }
+    case 'tool_execution_start':
+      return `tool_start: ${record.toolName}`;
+    case 'tool_execution_end':
+      return `tool_end: ${record.toolName}`;
     default:
       return type;
   }
@@ -580,29 +584,119 @@ function parsePiJsonl(output: string): unknown[] {
 }
 
 function extractMessages(events: unknown[]): readonly Message[] {
+  let messages: Message[] | undefined;
+
   for (let i = events.length - 1; i >= 0; i--) {
     const event = events[i];
     if (!event || typeof event !== 'object') continue;
     const record = event as Record<string, unknown>;
     if (record.type !== 'agent_end') continue;
 
-    const messages = record.messages;
-    if (!Array.isArray(messages)) continue;
+    const msgs = record.messages;
+    if (!Array.isArray(msgs)) continue;
 
-    return messages.map(convertPiMessage).filter((m): m is Message => m !== undefined);
+    messages = msgs.map(convertPiMessage).filter((m): m is Message => m !== undefined);
+    break;
   }
 
-  const output: Message[] = [];
+  if (!messages) {
+    messages = [];
+    for (const event of events) {
+      if (!event || typeof event !== 'object') continue;
+      const record = event as Record<string, unknown>;
+      if (record.type === 'turn_end') {
+        const converted = convertPiMessage(record.message);
+        if (converted) messages.push(converted);
+      }
+    }
+  }
+
+  // Pi CLI may emit tool_execution_start/tool_execution_end events whose tool
+  // calls are absent from the final agent_end messages. Reconstruct them and
+  // inject into the last assistant message so evaluators (e.g. skill-trigger)
+  // can detect them.
+  const eventToolCalls = extractToolCallsFromEvents(events);
+  if (eventToolCalls.length > 0) {
+    injectEventToolCalls(messages, eventToolCalls);
+  }
+
+  return messages;
+}
+
+/**
+ * Scan JSONL events for tool_execution_start / tool_execution_end pairs and
+ * reconstruct ToolCall objects from them.
+ */
+function extractToolCallsFromEvents(events: unknown[]): ToolCall[] {
+  const starts = new Map<string, { tool: string; input: unknown }>();
+  const results = new Map<string, unknown>();
+
   for (const event of events) {
     if (!event || typeof event !== 'object') continue;
-    const record = event as Record<string, unknown>;
-    if (record.type === 'turn_end') {
-      const converted = convertPiMessage(record.message);
-      if (converted) output.push(converted);
+    const r = event as Record<string, unknown>;
+    const type = r.type;
+    if (type === 'tool_execution_start' && typeof r.toolName === 'string') {
+      const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined;
+      starts.set(id ?? `anon-${starts.size}`, { tool: r.toolName, input: r.args });
+    } else if (type === 'tool_execution_end') {
+      const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined;
+      if (id) results.set(id, r.result);
+    }
+  }
+
+  const toolCalls: ToolCall[] = [];
+  for (const [id, { tool, input }] of starts) {
+    toolCalls.push({
+      tool,
+      input: input as Record<string, unknown> | undefined,
+      id: id.startsWith('anon-') ? undefined : id,
+      output: results.get(id),
+    });
+  }
+  return toolCalls;
+}
+
+/**
+ * Merge event-sourced tool calls into messages. For each tool call, if it
+ * already exists (by id) in some message, skip it. Otherwise, append it to
+ * the last assistant message (creating one if needed).
+ */
+function injectEventToolCalls(messages: Message[], eventToolCalls: ToolCall[]): void {
+  const existingIds = new Set<string>();
+  const existingTools = new Set<string>();
+  for (const msg of messages) {
+    if (!msg.toolCalls) continue;
+    for (const tc of msg.toolCalls) {
+      if (tc.id) existingIds.add(tc.id);
+      // Track tool+input combos to avoid duplicates when there's no id
+      existingTools.add(`${tc.tool}:${JSON.stringify(tc.input)}`);
+    }
+  }
+
+  const missing = eventToolCalls.filter((tc) => {
+    if (tc.id && existingIds.has(tc.id)) return false;
+    if (existingTools.has(`${tc.tool}:${JSON.stringify(tc.input)}`)) return false;
+    return true;
+  });
+
+  if (missing.length === 0) return;
+
+  // Find the last assistant message to attach tool calls to
+  let target: Message | undefined;
+  for (let i = messages.length - 1; i >= 0; i--) {
+    if (messages[i].role === 'assistant') {
+      target = messages[i];
+      break;
     }
   }
 
-  return output;
+  if (target) {
+    const existing = target.toolCalls ?? [];
+    (target as { toolCalls?: readonly ToolCall[] }).toolCalls = [...existing, ...missing];
+  } else {
+    // No assistant message — create a synthetic one
+    messages.push({ role: 'assistant', content: '', toolCalls: missing });
+  }
 }
 
 function extractTokenUsage(events: unknown[]): ProviderTokenUsage | undefined {
@@ -720,15 +814,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
         input: p.input,
         id: typeof p.id === 'string' ? p.id : undefined,
       });
-    }
-    if (p.type === 'toolCall' && typeof p.name === 'string') {
+    } else if ((p.type === 'toolCall' || p.type === 'tool_call') && typeof p.name === 'string') {
       toolCalls.push({
         tool: p.name,
-        input: p.arguments,
+        input: p.arguments ?? p.input,
         id: typeof p.id === 'string' ? p.id : undefined,
       });
-    }
-    if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
+    } else if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
       const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
       if (existing) {
         const idx = toolCalls.indexOf(existing);
@@ -830,3 +922,10 @@ async function defaultPiRunner(options: PiRunOptions): Promise<PiRunResult> {
     });
   });
 }
+
+/** @internal Exported for testing only. */
+export const _internal = {
+  extractMessages,
+  extractToolCallsFromEvents,
+  parsePiJsonl,
+};
diff --git a/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts b/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts
new file mode 100644
index 00000000..84e5d0d6
--- /dev/null
+++ b/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts
@@ -0,0 +1,231 @@
+import { describe, expect, it } from 'vitest';
+import { _internal } from '../../../src/evaluation/providers/pi-cli.js';
+
+const { extractMessages, extractToolCallsFromEvents } = _internal;
+
+describe('pi-cli tool call extraction from events', () => {
+  it('should extract tool calls from tool_execution_start/end events', () => {
+    const events = [
+      { type: 'agent_start' },
+      { type: 'turn_start' },
+      { type: 'message_start', message: { role: 'assistant' } },
+      {
+        type: 'tool_execution_start',
+        toolName: 'read',
+        toolCallId: 'tc-1',
+        args: { path: '.agents/skills/csv-analyzer/SKILL.md' },
+      },
+      {
+        type: 'tool_execution_end',
+        toolName: 'read',
+        toolCallId: 'tc-1',
+        result: 'skill content here',
+      },
+      { type: 'message_end' },
+      {
+        type: 'turn_end',
+        message: { role: 'assistant', content: [{ type: 'text', text: 'Done' }] },
+      },
+      {
+        type: 'agent_end',
+        messages: [{ role: 'assistant', content: [{ type: 'text', text: 'Done' }] }],
+      },
+    ];
+
+    const toolCalls = extractToolCallsFromEvents(events);
+
+    expect(toolCalls).toHaveLength(1);
+    expect(toolCalls[0].tool).toBe('read');
+    expect(toolCalls[0].id).toBe('tc-1');
+    expect(toolCalls[0].input).toEqual({ path: '.agents/skills/csv-analyzer/SKILL.md' });
+    expect(toolCalls[0].output).toBe('skill content here');
+  });
+
+  it('should inject event tool calls into messages when content has no tool calls', () => {
+    const events = [
+      {
+        type: 'tool_execution_start',
+        toolName: 'read',
+        toolCallId: 'tc-1',
+        args: { path: '.agents/skills/csv-analyzer/SKILL.md' },
+      },
+      {
+        type: 'tool_execution_end',
+        toolCallId: 'tc-1',
+        result: 'skill file contents',
+      },
+      {
+        type: 'agent_end',
+        messages: [{ role: 'assistant', content: [{ type: 'text', text: 'I will review this.' }] }],
+      },
+    ];
+
+    const messages = extractMessages(events);
+
+    expect(messages).toHaveLength(1);
+    expect(messages[0].role).toBe('assistant');
+    expect(messages[0].toolCalls).toBeDefined();
+    expect(messages[0].toolCalls).toHaveLength(1);
+    expect(messages[0].toolCalls?.[0].tool).toBe('read');
+    expect(messages[0].toolCalls?.[0].input).toEqual({
+      path: '.agents/skills/csv-analyzer/SKILL.md',
+    });
+  });
+
+  it('should not duplicate tool calls already present in messages', () => {
+    const events = [
+      {
+        type: 'tool_execution_start',
+        toolName: 'read',
+        toolCallId: 'tc-1',
+        args: { path: '.agents/skills/csv-analyzer/SKILL.md' },
+      },
+      {
+        type: 'tool_execution_end',
+        toolCallId: 'tc-1',
+        result: 'content',
+      },
+      {
+        type: 'agent_end',
+        messages: [
+          {
+            role: 'assistant',
+            content: [
+              { type: 'text', text: 'Review' },
+              {
+                type: 'tool_use',
+                name: 'read',
+                id: 'tc-1',
+                input: { path: '.agents/skills/csv-analyzer/SKILL.md' },
+              },
+            ],
+          },
+        ],
+      },
+    ];
+
+    const messages = extractMessages(events);
+
+    expect(messages).toHaveLength(1);
+    expect(messages[0].toolCalls).toHaveLength(1);
+  });
+
+  it('should handle multiple tool execution events', () => {
+    const events = [
+      {
+        type: 'tool_execution_start',
+        toolName: 'read',
+        toolCallId: 'tc-1',
+        args: { path: '.agents/skills/csv-analyzer/SKILL.md' },
+      },
+      {
+        type: 'tool_execution_end',
+        toolCallId: 'tc-1',
+        result: 'skill content',
+      },
+      {
+        type: 'tool_execution_start',
+        toolName: 'bash',
+        toolCallId: 'tc-2',
+        args: { command: 'ls' },
+      },
+      {
+        type: 'tool_execution_end',
+        toolCallId: 'tc-2',
+        result: 'file list',
+      },
+      {
+        type: 'agent_end',
+        messages: [{ role: 'assistant', content: [{ type: 'text', text: 'Done' }] }],
+      },
+    ];
+
+    const messages = extractMessages(events);
+
+    expect(messages[0].toolCalls).toHaveLength(2);
+    expect(messages[0].toolCalls?.[0].tool).toBe('read');
+    expect(messages[0].toolCalls?.[1].tool).toBe('bash');
+  });
+
+  it('should create synthetic assistant message when no assistant message exists', () => {
+    const events = [
+      {
+        type: 'tool_execution_start',
+        toolName: 'read',
+        toolCallId: 'tc-1',
+        args: { path: '.agents/skills/csv-analyzer/SKILL.md' },
+      },
+      {
+        type: 'tool_execution_end',
+        toolCallId: 'tc-1',
+        result: 'content',
+      },
+      {
+        type: 'agent_end',
+        messages: [{ role: 'user', content: [{ type: 'text', text: 'Review this' }] }],
+      },
+    ];
+
+    const messages = extractMessages(events);
+
+    expect(messages).toHaveLength(2);
+    expect(messages[1].role).toBe('assistant');
+    expect(messages[1].toolCalls).toHaveLength(1);
+    expect(messages[1].toolCalls?.[0].tool).toBe('read');
+  });
+
+  it('should fall back to turn_end events and still inject tool calls', () => {
+    const events = [
+      {
+        type: 'tool_execution_start',
+        toolName: 'read',
+        toolCallId: 'tc-1',
+        args: { path: '.agents/skills/csv-analyzer/SKILL.md' },
+      },
+      {
+        type: 'tool_execution_end',
+        toolCallId: 'tc-1',
+        result: 'content',
+      },
+      {
+        type: 'turn_end',
+        message: { role: 'assistant', content: [{ type: 'text', text: 'Response' }] },
+      },
+    ];
+
+    const messages = extractMessages(events);
+
+    expect(messages).toHaveLength(1);
+    expect(messages[0].toolCalls).toHaveLength(1);
+    expect(messages[0].toolCalls?.[0].tool).toBe('read');
+  });
+
+  it('should handle tool_call type in message content', () => {
+    const events = [
+      {
+        type: 'agent_end',
+        messages: [
+          {
+            role: 'assistant',
+            content: [
+              {
+                type: 'tool_call',
+                name: 'read',
+                id: 'tc-1',
+                arguments: { path: '.agents/skills/csv-analyzer/SKILL.md' },
+              },
+            ],
+          },
+        ],
+      },
+    ];
+
+    const messages = extractMessages(events);
+
+    expect(messages[0].toolCalls).toHaveLength(1);
+    expect(messages[0].toolCalls?.[0].tool).toBe('read');
+    expect(messages[0].toolCalls?.[0].input).toEqual({
+      path: '.agents/skills/csv-analyzer/SKILL.md',
+    });
+  });
+});

From 5e3ab55436cc805d9c41ee71eb439d11b53d8ad5 Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Thu, 26 Mar 2026 08:02:13 +0000
Subject: [PATCH 2/5] fix(core): avoid mutating readonly Message in
 injectEventToolCalls

Replace target message with a new object instead of casting to bypass
readonly constraint, per code review feedback.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 packages/core/src/evaluation/providers/pi-cli.ts | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/packages/core/src/evaluation/providers/pi-cli.ts b/packages/core/src/evaluation/providers/pi-cli.ts
index 9aa0191a..e2abcc73 100644
--- a/packages/core/src/evaluation/providers/pi-cli.ts
+++ b/packages/core/src/evaluation/providers/pi-cli.ts
@@ -681,18 +681,18 @@ function injectEventToolCalls(messages: Message[], eventToolCalls: ToolCall[]):
 
   if (missing.length === 0) return;
 
-  // Find the last assistant message to attach tool calls to
-  let target: Message | undefined;
+  // Find the last assistant message and replace it with an enriched copy
+  let targetIdx = -1;
   for (let i = messages.length - 1; i >= 0; i--) {
     if (messages[i].role === 'assistant') {
-      target = messages[i];
+      targetIdx = i;
       break;
     }
   }
 
-  if (target) {
-    const existing = target.toolCalls ?? [];
-    (target as { toolCalls?: readonly ToolCall[] }).toolCalls = [...existing, ...missing];
+  if (targetIdx >= 0) {
+    const target = messages[targetIdx];
+    messages[targetIdx] = { ...target, toolCalls: [...(target.toolCalls ?? []), ...missing] };
   } else {
     // No assistant message — create a synthetic one
     messages.push({ role: 'assistant', content: '', toolCalls: missing });

From 44e5415cb691fce9d78c72f74a916e0530906a1b Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Thu, 26 Mar 2026 08:30:24 +0000
Subject: [PATCH 3/5] fix(evals): restore skill-trigger assertion for
 agent-plugin-review eval

Re-adds the skill-trigger assertion that was removed as a workaround
for #780. Now that pi-cli tool call extraction is fixed, the evaluator
can detect when pi loads the agent-plugin-review skill.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 evals/agentic-engineering/agent-plugin-review.eval.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml
index 5e8b417d..35311681 100644
--- a/evals/agentic-engineering/agent-plugin-review.eval.yaml
+++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml
@@ -20,6 +20,8 @@ tests:
       Review the deploy-auto plugin in this repo for completeness.
       Check that every skill has a corresponding eval file.
     assertions:
+      - type: skill-trigger
+        skill: agent-plugin-review
       - type: contains
         value: deploy-rollback
       - type: rubrics

From 6f173247087469f695fa42e5a117cf019f5d746a Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Thu, 26 Mar 2026 08:43:44 +0000
Subject: [PATCH 4/5] fix(evals): configure pi-cli target with model and remove
 workers: 1

Pi-cli target needs subprovider/model/api_key to produce meaningful
output. Without them, pi uses its default which returns empty responses.

Also removes workers: 1 from agent-plugin-review eval since all test
cases are read-only reviews that can safely run in parallel.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .agentv/targets.yaml                                    | 2 ++
 evals/agentic-engineering/agent-plugin-review.eval.yaml | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
index 14e4b9ac..de3f4cf0 100644
--- a/.agentv/targets.yaml
+++ b/.agentv/targets.yaml
@@ -13,6 +13,8 @@ targets:
   - name: pi-cli
     provider: pi-cli
     subprovider: openrouter
+    model: openai/gpt-5.1-codex
+    api_key: ${{ OPENROUTER_API_KEY }}
     grader_target: gemini-flash
 
   - name: pi-coding-agent
diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml
index 35311681..88091dcd 100644
--- a/evals/agentic-engineering/agent-plugin-review.eval.yaml
+++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml
@@ -3,7 +3,6 @@ description: Evaluates that the agent-plugin-review skill is triggered and catch
 execution:
   targets:
     - pi-cli
-  workers: 1
 
 workspace:
   template: ./workspace-template

From a9f27f5e62a54ccd7b48f3658c398767cf66ee9b Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Thu, 26 Mar 2026 08:48:54 +0000
Subject: [PATCH 5/5] style: fix formatting in package.json files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/package.json      | 5 +----
 packages/core/package.json | 5 +----
 packages/eval/package.json | 5 +----
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/apps/cli/package.json b/apps/cli/package.json
index e5e8edac..63c72814 100644
--- a/apps/cli/package.json
+++ b/apps/cli/package.json
@@ -14,10 +14,7 @@
   "bin": {
     "agentv": "./dist/cli.js"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "scripts": {
     "dev": "bun src/cli.ts",
     "build": "tsup && bun run copy-readme",
diff --git a/packages/core/package.json b/packages/core/package.json
index ef541141..87e08d2f 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -38,10 +38,7 @@
     "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts",
     "generate:schema": "bun scripts/generate-eval-schema.ts"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "dependencies": {
     "@agentclientprotocol/sdk": "^0.14.1",
     "@agentv/eval": "workspace:*",
diff --git a/packages/eval/package.json b/packages/eval/package.json
index 5a33d4d6..725a4734 100644
--- a/packages/eval/package.json
+++ b/packages/eval/package.json
@@ -30,10 +30,7 @@
     "fix": "biome check --write .",
     "test": "bun test"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "dependencies": {
     "zod": "^3.23.8"
   }