diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 35df6996..5e8b417d 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -3,9 +3,15 @@ description: Evaluates that the agent-plugin-review skill is triggered and catch execution: targets: - pi-cli + workers: 1 workspace: template: ./workspace-template + hooks: + before_all: + command: + - node + - "{{workspace_path}}/scripts/setup.mjs" tests: - id: detect-missing-eval @@ -14,8 +20,6 @@ tests: Review the deploy-auto plugin in this repo for completeness. Check that every skill has a corresponding eval file. assertions: - - type: skill-trigger - value: agent-plugin-review - type: contains value: deploy-rollback - type: rubrics @@ -28,8 +32,6 @@ tests: input: | Review the eval files under evals/deploy-auto/ for naming convention issues. assertions: - - type: skill-trigger - value: agent-plugin-review - type: contains value: .eval.yaml - type: rubrics @@ -44,8 +46,6 @@ tests: Review evals/deploy-auto/deploy-plan.yaml for eval quality issues. Check assertion coverage and expected_output format. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags that no assertions are defined in deploy-plan.yaml @@ -57,8 +57,6 @@ tests: input: | Review evals/deploy-auto/deploy-plan.yaml for file path formatting issues. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags that file paths are missing a leading slash @@ -70,8 +68,6 @@ tests: Review evals/deploy-auto/deploy-plan.yaml for structural improvements. Look at how inputs are organized across test cases. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Identifies the repeated SKILL.md file input across all 3 tests @@ -83,8 +79,6 @@ tests: Review the deploy-auto plugin's workflow architecture. Check whether phases enforce prerequisites before proceeding. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags that deploy-execute does not check for deploy-plan.md before starting @@ -97,8 +91,6 @@ tests: Review evals/deploy-auto/deploy-execute.eval.yaml for factual accuracy. Cross-check expected outputs against what the skills actually document. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags the contradiction between pytest (skill) and python -m unittest (eval) @@ -110,8 +102,6 @@ tests: Review plugins/deploy-auto/skills/deploy-plan/SKILL.md for cross-reference issues. Check that referenced commands and skills actually exist. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags that /deploy-execute is referenced but does not exist as a slash command @@ -123,8 +113,6 @@ tests: input: | Review plugins/deploy-auto/skills/deploy-execute/SKILL.md for portability issues. assertions: - - type: skill-trigger - value: agent-plugin-review - type: rubrics criteria: - Flags the hardcoded path C:\Users\admin\.kube\config diff --git a/evals/agentic-engineering/workspace-template/scripts/setup.mjs b/evals/agentic-engineering/workspace-template/scripts/setup.mjs new file mode 100644 index 00000000..bfeddcf6 --- /dev/null +++ b/evals/agentic-engineering/workspace-template/scripts/setup.mjs @@ -0,0 +1,59 @@ +#!/usr/bin/env node +/** + * Workspace before_all hook: copy skills into the workspace for agent discovery. + * Receives workspace_path via stdin JSON from the AgentV orchestrator. + * Runs with cwd = eval file directory (which is inside the repo). + */ + +import { cpSync, mkdirSync, readdirSync, readFileSync } from 'node:fs'; +import { join } from 'node:path'; +import { execSync } from 'node:child_process'; + +// Read workspace_path from stdin (provided by AgentV orchestrator) +let workspacePath; +try { + const stdin = readFileSync(0, 'utf8'); + const context = JSON.parse(stdin); + workspacePath = context.workspace_path; +} catch { + workspacePath = process.cwd(); +} + +// Resolve repo root from cwd (eval dir is inside the repo) +let repoRoot; +try { + repoRoot = execSync('git rev-parse --show-toplevel', { encoding: 'utf8' }).trim(); +} catch { + console.error('Failed to resolve repo root from cwd:', process.cwd()); + process.exit(1); +} + +console.log(`Workspace: ${workspacePath}`); +console.log(`Repo root: ${repoRoot}`); + +// Copy to skill discovery directories in the workspace +const skillDirs = [ + join(workspacePath, '.agents', 'skills'), + join(workspacePath, '.pi', 'skills'), +]; +for (const dir of skillDirs) { + mkdirSync(dir, { recursive: true }); +} + +const skillSources = [ + join(repoRoot, 'plugins', 'agentic-engineering', 'skills', 'agent-plugin-review'), + join(repoRoot, 'plugins', 'agentic-engineering', 'skills', 'agent-architecture-design'), + join(repoRoot, 'plugins', 'agentv-dev', 'skills', 'agentv-eval-review'), +]; + +for (const src of skillSources) { + const name = src.split(/[\\/]/).pop(); + for (const dir of skillDirs) { + cpSync(src, join(dir, name), { recursive: true }); + } + console.log(`Copied ${name}`); +} + +for (const dir of skillDirs) { + console.log(`Skills in ${dir}: ${readdirSync(dir).join(', ')}`); +} diff --git a/packages/core/src/evaluation/providers/pi-cli.ts b/packages/core/src/evaluation/providers/pi-cli.ts index d580a82a..c182f85a 100644 --- a/packages/core/src/evaluation/providers/pi-cli.ts +++ b/packages/core/src/evaluation/providers/pi-cli.ts @@ -78,15 +78,18 @@ export class PiCliProvider implements Provider { const startTime = new Date().toISOString(); const startMs = Date.now(); - const workspaceRoot = await this.createWorkspace(); + // Use eval-materialized workspace (request.cwd) when available, consistent with copilot-cli. + // Only create a temp workspace when no cwd is provided. + const hasExternalCwd = !!(request.cwd || this.config.cwd); + const workspaceRoot = hasExternalCwd ? undefined : await this.createWorkspace(); + const cwd = this.resolveCwd(workspaceRoot, request.cwd); const logger = await this.createStreamLogger(request).catch(() => undefined); try { // Save prompt to file for debugging/logging - const promptFile = path.join(workspaceRoot, PROMPT_FILENAME); + const promptFile = path.join(cwd, PROMPT_FILENAME); await writeFile(promptFile, request.question, 'utf8'); const args = this.buildPiArgs(request.question, inputFiles); - const cwd = this.resolveCwd(workspaceRoot, request.cwd); const result = await this.executePi(args, cwd, request.signal, logger); @@ -136,7 +139,7 @@ export class PiCliProvider implements Provider { args, executable: this.config.executable, promptFile, - workspace: workspaceRoot, + workspace: workspaceRoot ?? cwd, inputFiles, logFile: logger?.filePath, }, @@ -148,18 +151,23 @@ export class PiCliProvider implements Provider { }; } finally { await logger?.close(); - await this.cleanupWorkspace(workspaceRoot); + if (workspaceRoot) { + await this.cleanupWorkspace(workspaceRoot); + } } } - private resolveCwd(workspaceRoot: string, cwdOverride?: string): string { + private resolveCwd(workspaceRoot: string | undefined, cwdOverride?: string): string { if (cwdOverride) { return path.resolve(cwdOverride); } - if (!this.config.cwd) { + if (this.config.cwd) { + return path.resolve(this.config.cwd); + } + if (workspaceRoot) { return workspaceRoot; } - return path.resolve(this.config.cwd); + return process.cwd(); } private buildPiArgs(prompt: string, inputFiles: readonly string[] | undefined): string[] {