Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ targets:
- name: pi-cli
provider: pi-cli
subprovider: openrouter
model: openai/gpt-5.1-codex
api_key: ${{ OPENROUTER_API_KEY }}
grader_target: gemini-flash

- name: pi-coding-agent
Expand Down
5 changes: 1 addition & 4 deletions apps/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@
"bin": {
"agentv": "./dist/cli.js"
},
"files": [
"dist",
"README.md"
],
"files": ["dist", "README.md"],
"scripts": {
"dev": "bun src/cli.ts",
"build": "tsup && bun run copy-readme",
Expand Down
3 changes: 2 additions & 1 deletion evals/agentic-engineering/agent-plugin-review.eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ description: Evaluates that the agent-plugin-review skill is triggered and catch
execution:
targets:
- pi-cli
workers: 1

workspace:
template: ./workspace-template
Expand All @@ -20,6 +19,8 @@ tests:
Review the deploy-auto plugin in this repo for completeness.
Check that every skill has a corresponding eval file.
assertions:
- type: skill-trigger
skill: agent-plugin-review
- type: contains
value: deploy-rollback
- type: rubrics
Expand Down
5 changes: 1 addition & 4 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,7 @@
"diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts",
"generate:schema": "bun scripts/generate-eval-schema.ts"
},
"files": [
"dist",
"README.md"
],
"files": ["dist", "README.md"],
"dependencies": {
"@agentclientprotocol/sdk": "^0.14.1",
"@agentv/eval": "workspace:*",
Expand Down
127 changes: 113 additions & 14 deletions packages/core/src/evaluation/providers/pi-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,10 @@ function summarizePiEvent(event: unknown): string | undefined {
}
return `message_update: ${eventType}`;
}
case 'tool_execution_start':
return `tool_start: ${record.toolName}`;
case 'tool_execution_end':
return `tool_end: ${record.toolName}`;
default:
return type;
}
Expand Down Expand Up @@ -580,29 +584,119 @@ function parsePiJsonl(output: string): unknown[] {
}

/**
 * Build the conversation transcript from parsed Pi CLI JSONL events.
 *
 * Resolution order:
 *  1. The `messages` array of the last `agent_end` event that carries one.
 *  2. Otherwise, the `message` of every `turn_end` event, in stream order.
 *
 * Tool calls that are only visible as `tool_execution_start` /
 * `tool_execution_end` events are then merged into the transcript.
 *
 * @param events Parsed JSONL events; entries that are not objects are ignored.
 * @returns The converted messages. Entries that `convertPiMessage` maps to
 *   `undefined` are dropped.
 */
function extractMessages(events: unknown[]): readonly Message[] {
  let messages: Message[] | undefined;

  // Walk backwards so the most recent agent_end wins.
  for (let i = events.length - 1; i >= 0; i--) {
    const event = events[i];
    if (!event || typeof event !== 'object') continue;
    const record = event as Record<string, unknown>;
    if (record.type !== 'agent_end') continue;

    const msgs = record.messages;
    if (!Array.isArray(msgs)) continue;

    messages = msgs.map(convertPiMessage).filter((m): m is Message => m !== undefined);
    break;
  }

  // No usable agent_end: rebuild the transcript from the per-turn messages.
  if (!messages) {
    messages = [];
    for (const event of events) {
      if (!event || typeof event !== 'object') continue;
      const record = event as Record<string, unknown>;
      if (record.type === 'turn_end') {
        const converted = convertPiMessage(record.message);
        if (converted) messages.push(converted);
      }
    }
  }

  // Pi CLI may emit tool_execution_start/tool_execution_end events whose tool
  // calls are absent from the final agent_end messages. Reconstruct them and
  // inject into the last assistant message so evaluators (e.g. skill-trigger)
  // can detect them.
  const eventToolCalls = extractToolCallsFromEvents(events);
  if (eventToolCalls.length > 0) {
    injectEventToolCalls(messages, eventToolCalls);
  }

  return messages;
}

/**
 * Scan JSONL events for tool_execution_start / tool_execution_end pairs and
 * reconstruct ToolCall objects from them.
 *
 * - Every `tool_execution_start` with a string `toolName` yields one tool call
 *   (`tool` = `toolName`, `input` = `args`), in order of first appearance.
 * - A repeated start for the same `toolCallId` replaces the earlier entry but
 *   keeps its position.
 * - A `tool_execution_end` is matched to its start by `toolCallId`; its
 *   `result` becomes the call's `output`. End events without an id are ignored.
 * - Starts without a (non-empty) string `toolCallId` are kept with
 *   `id: undefined` and no output.
 *
 * @param events Parsed JSONL events; entries that are not objects are ignored.
 * @returns The reconstructed tool calls.
 */
function extractToolCallsFromEvents(events: unknown[]): ToolCall[] {
  // Anonymous calls live in the same ordered list as identified ones; only
  // real ids go into the index, so no sentinel key can clash with a real id.
  const calls: { tool: string; input: unknown; id: string | undefined }[] = [];
  const indexById = new Map<string, number>();
  const results = new Map<string, unknown>();

  for (const event of events) {
    if (!event || typeof event !== 'object') continue;
    const r = event as Record<string, unknown>;
    const id = typeof r.toolCallId === 'string' && r.toolCallId !== '' ? r.toolCallId : undefined;

    if (r.type === 'tool_execution_start' && typeof r.toolName === 'string') {
      const call = { tool: r.toolName, input: r.args, id };
      const seenAt = id === undefined ? undefined : indexById.get(id);
      if (seenAt !== undefined) {
        calls[seenAt] = call;
      } else {
        if (id !== undefined) indexById.set(id, calls.length);
        calls.push(call);
      }
    } else if (r.type === 'tool_execution_end' && id !== undefined) {
      results.set(id, r.result);
    }
  }

  return calls.map(({ tool, input, id }) => ({
    tool,
    input: input as Record<string, unknown> | undefined,
    id,
    output: id === undefined ? undefined : results.get(id),
  }));
}

/**
 * Merge event-sourced tool calls into `messages`, mutating the array in place.
 *
 * An event tool call is skipped when a tool call already present in some
 * message has the same id, or the same tool name and JSON-serialized input
 * (the second check applies whether or not ids are present). The remaining
 * calls are appended to the last assistant message, which is replaced by an
 * enriched copy. If there is no assistant message, a synthetic one with empty
 * content is pushed.
 *
 * @param messages Transcript to enrich; modified in place.
 * @param eventToolCalls Tool calls reconstructed from execution events.
 */
function injectEventToolCalls(messages: Message[], eventToolCalls: ToolCall[]): void {
  // Tool + input fingerprint, used to recognise a call when ids do not match.
  const signatureOf = (tc: ToolCall): string => `${tc.tool}:${JSON.stringify(tc.input)}`;

  const existingIds = new Set<string>();
  const existingSignatures = new Set<string>();
  for (const msg of messages) {
    for (const tc of msg.toolCalls ?? []) {
      if (tc.id) existingIds.add(tc.id);
      existingSignatures.add(signatureOf(tc));
    }
  }

  const missing = eventToolCalls.filter(
    (tc) => !(tc.id && existingIds.has(tc.id)) && !existingSignatures.has(signatureOf(tc)),
  );
  if (missing.length === 0) return;

  // Find the last assistant message; it is replaced with an enriched copy
  // rather than mutated.
  let targetIdx = -1;
  for (let i = messages.length - 1; i >= 0; i--) {
    if (messages[i]?.role === 'assistant') {
      targetIdx = i;
      break;
    }
  }

  const target = targetIdx >= 0 ? messages[targetIdx] : undefined;
  if (target) {
    messages[targetIdx] = { ...target, toolCalls: [...(target.toolCalls ?? []), ...missing] };
  } else {
    // No assistant message — create a synthetic one
    messages.push({ role: 'assistant', content: '', toolCalls: missing });
  }
}

function extractTokenUsage(events: unknown[]): ProviderTokenUsage | undefined {
Expand Down Expand Up @@ -720,15 +814,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
input: p.input,
id: typeof p.id === 'string' ? p.id : undefined,
});
}
if (p.type === 'toolCall' && typeof p.name === 'string') {
} else if ((p.type === 'toolCall' || p.type === 'tool_call') && typeof p.name === 'string') {
toolCalls.push({
tool: p.name,
input: p.arguments,
input: p.arguments ?? p.input,
id: typeof p.id === 'string' ? p.id : undefined,
});
}
if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
} else if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
if (existing) {
const idx = toolCalls.indexOf(existing);
Expand Down Expand Up @@ -830,3 +922,10 @@ async function defaultPiRunner(options: PiRunOptions): Promise<PiRunResult> {
});
});
}

/**
 * @internal Exported for testing only.
 *
 * Groups helper functions of this module (`extractMessages`,
 * `extractToolCallsFromEvents`, `parsePiJsonl`) under a single export so they
 * can be reached from tests.
 */
export const _internal = {
extractMessages,
extractToolCallsFromEvents,
parsePiJsonl,
};
Loading
Loading