Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 6 additions & 18 deletions evals/agentic-engineering/agent-plugin-review.eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,15 @@ description: Evaluates that the agent-plugin-review skill is triggered and catch
execution:
targets:
- pi-cli
workers: 1

workspace:
template: ./workspace-template
hooks:
before_all:
command:
- node
- "{{workspace_path}}/scripts/setup.mjs"

tests:
- id: detect-missing-eval
Expand All @@ -14,8 +20,6 @@ tests:
Review the deploy-auto plugin in this repo for completeness.
Check that every skill has a corresponding eval file.
assertions:
- type: skill-trigger
value: agent-plugin-review
- type: contains
value: deploy-rollback
- type: rubrics
Expand All @@ -28,8 +32,6 @@ tests:
input: |
Review the eval files under evals/deploy-auto/ for naming convention issues.
assertions:
- type: skill-trigger
value: agent-plugin-review
- type: contains
value: .eval.yaml
- type: rubrics
Expand All @@ -44,8 +46,6 @@ tests:
Review evals/deploy-auto/deploy-plan.yaml for eval quality issues.
Check assertion coverage and expected_output format.
assertions:
- type: skill-trigger
value: agent-plugin-review
- type: rubrics
criteria:
- Flags that no assertions are defined in deploy-plan.yaml
Expand All @@ -57,8 +57,6 @@ tests:
input: |
Review evals/deploy-auto/deploy-plan.yaml for file path formatting issues.
assertions:
- type: skill-trigger
value: agent-plugin-review
- type: rubrics
criteria:
- Flags that file paths are missing a leading slash
Expand All @@ -70,8 +68,6 @@ tests:
Review evals/deploy-auto/deploy-plan.yaml for structural improvements.
Look at how inputs are organized across test cases.
assertions:
- type: skill-trigger
value: agent-plugin-review
- type: rubrics
criteria:
- Identifies the repeated SKILL.md file input across all 3 tests
Expand All @@ -83,8 +79,6 @@ tests:
Review the deploy-auto plugin's workflow architecture.
Check whether phases enforce prerequisites before proceeding.
assertions:
- type: skill-trigger
value: agent-plugin-review
- type: rubrics
criteria:
- Flags that deploy-execute does not check for deploy-plan.md before starting
Expand All @@ -97,8 +91,6 @@ tests:
Review evals/deploy-auto/deploy-execute.eval.yaml for factual accuracy.
Cross-check expected outputs against what the skills actually document.
assertions:
- type: skill-trigger
value: agent-plugin-review
- type: rubrics
criteria:
- Flags the contradiction between pytest (skill) and python -m unittest (eval)
Expand All @@ -110,8 +102,6 @@ tests:
Review plugins/deploy-auto/skills/deploy-plan/SKILL.md for cross-reference issues.
Check that referenced commands and skills actually exist.
assertions:
- type: skill-trigger
value: agent-plugin-review
- type: rubrics
criteria:
- Flags that /deploy-execute is referenced but does not exist as a slash command
Expand All @@ -123,8 +113,6 @@ tests:
input: |
Review plugins/deploy-auto/skills/deploy-execute/SKILL.md for portability issues.
assertions:
- type: skill-trigger
value: agent-plugin-review
- type: rubrics
criteria:
- Flags the hardcoded path C:\Users\admin\.kube\config
Expand Down
59 changes: 59 additions & 0 deletions evals/agentic-engineering/workspace-template/scripts/setup.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env node
/**
* Workspace before_all hook: copy skills into the workspace for agent discovery.
* Receives workspace_path via stdin JSON from the AgentV orchestrator.
* Runs with cwd = eval file directory (which is inside the repo).
*/

import { cpSync, mkdirSync, readdirSync, readFileSync } from 'node:fs';
import { join } from 'node:path';
import { execSync } from 'node:child_process';

// Read workspace_path from stdin (provided by AgentV orchestrator).
// Falls back to the current working directory when stdin cannot be read, is
// not valid JSON, or does not carry a non-empty string `workspace_path`
// (previously a missing field left the path undefined and crashed in join()).
let workspacePath = process.cwd();
try {
  const context = JSON.parse(readFileSync(0, 'utf8'));
  const candidate = context?.workspace_path;
  if (typeof candidate === 'string' && candidate.length > 0) {
    workspacePath = candidate;
  }
} catch {
  // Best-effort: keep the cwd fallback when stdin is absent or malformed.
}

// Resolve repo root from cwd (eval dir is inside the repo)
let repoRoot;
try {
  repoRoot = execSync('git rev-parse --show-toplevel', { encoding: 'utf8' }).trim();
} catch {
  console.error('Failed to resolve repo root from cwd:', process.cwd());
  process.exit(1);
}

console.log(`Workspace: ${workspacePath}`);
console.log(`Repo root: ${repoRoot}`);

// Skill discovery directories inside the workspace; every skill is copied
// into each of them.
const skillDirs = [
  join(workspacePath, '.agents', 'skills'),
  join(workspacePath, '.pi', 'skills'),
];
for (const dir of skillDirs) {
  mkdirSync(dir, { recursive: true });
}

// [plugin, skill] pairs. Each source lives at
// <repoRoot>/plugins/<plugin>/skills/<skill>; the skill name doubles as the
// destination directory name, so no path parsing is needed to recover it.
const skills = [
  ['agentic-engineering', 'agent-plugin-review'],
  ['agentic-engineering', 'agent-architecture-design'],
  ['agentv-dev', 'agentv-eval-review'],
];

for (const [plugin, name] of skills) {
  const src = join(repoRoot, 'plugins', plugin, 'skills', name);
  try {
    for (const dir of skillDirs) {
      cpSync(src, join(dir, name), { recursive: true });
    }
  } catch (err) {
    // Report which skill failed instead of surfacing a bare stack trace;
    // the hook still fails with a non-zero exit status.
    console.error(
      `Failed to copy skill ${name} from ${src}:`,
      err instanceof Error ? err.message : err,
    );
    process.exit(1);
  }
  console.log(`Copied ${name}`);
}

// Log the final contents of each discovery directory for debugging.
for (const dir of skillDirs) {
  console.log(`Skills in ${dir}: ${readdirSync(dir).join(', ')}`);
}
24 changes: 16 additions & 8 deletions packages/core/src/evaluation/providers/pi-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,18 @@ export class PiCliProvider implements Provider {
const startTime = new Date().toISOString();
const startMs = Date.now();

const workspaceRoot = await this.createWorkspace();
// Use eval-materialized workspace (request.cwd) when available, consistent with copilot-cli.
// Only create a temp workspace when no cwd is provided.
const hasExternalCwd = !!(request.cwd || this.config.cwd);
const workspaceRoot = hasExternalCwd ? undefined : await this.createWorkspace();
const cwd = this.resolveCwd(workspaceRoot, request.cwd);
const logger = await this.createStreamLogger(request).catch(() => undefined);
try {
// Save prompt to file for debugging/logging
const promptFile = path.join(workspaceRoot, PROMPT_FILENAME);
const promptFile = path.join(cwd, PROMPT_FILENAME);
await writeFile(promptFile, request.question, 'utf8');

const args = this.buildPiArgs(request.question, inputFiles);
const cwd = this.resolveCwd(workspaceRoot, request.cwd);

const result = await this.executePi(args, cwd, request.signal, logger);

Expand Down Expand Up @@ -136,7 +139,7 @@ export class PiCliProvider implements Provider {
args,
executable: this.config.executable,
promptFile,
workspace: workspaceRoot,
workspace: workspaceRoot ?? cwd,
inputFiles,
logFile: logger?.filePath,
},
Expand All @@ -148,18 +151,23 @@ export class PiCliProvider implements Provider {
};
} finally {
await logger?.close();
await this.cleanupWorkspace(workspaceRoot);
if (workspaceRoot) {
await this.cleanupWorkspace(workspaceRoot);
}
}
}

private resolveCwd(workspaceRoot: string, cwdOverride?: string): string {
private resolveCwd(workspaceRoot: string | undefined, cwdOverride?: string): string {
if (cwdOverride) {
return path.resolve(cwdOverride);
}
if (!this.config.cwd) {
if (this.config.cwd) {
return path.resolve(this.config.cwd);
}
if (workspaceRoot) {
return workspaceRoot;
}
return path.resolve(this.config.cwd);
return process.cwd();
}

private buildPiArgs(prompt: string, inputFiles: readonly string[] | undefined): string[] {
Expand Down
Loading