diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml new file mode 100644 index 0000000..e9a7d0e --- /dev/null +++ b/.github/actionlint.yaml @@ -0,0 +1,3 @@ +self-hosted-runner: + labels: + - agent-skills-amd-4cpu diff --git a/.github/workflows/capture-baseline.yml b/.github/workflows/capture-baseline.yml new file mode 100644 index 0000000..0f25b26 --- /dev/null +++ b/.github/workflows/capture-baseline.yml @@ -0,0 +1,166 @@ +name: Capture Eval Baseline +on: + workflow_dispatch: + inputs: + tag: + description: "Release tag to baseline (e.g. v1.9.0)" + required: true + type: string +permissions: + contents: read + +jobs: + + # ── Claude Code — 3 models ───────────────────────────────────────────────── + capture-claude-code: + name: baseline / claude-code / ${{ matrix.skill }} / ${{ matrix.model }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + skill: [hawkscan, api] + model: [claude-sonnet-4-6, claude-opus-4-7, claude-haiku-4-5-20251001] + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.tag }} + - uses: astral-sh/setup-uv@v5 + - uses: actions/setup-node@v4 + with: + node-version: "20" + - name: Install Claude Code CLI + run: npm install -g @anthropic-ai/claude-code + - name: Verify claude CLI + run: claude --version + - name: Run baseline eval (${{ matrix.model }}) + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + uv run evals --harness claude-code --skill ${{ matrix.skill }} \ + --model ${{ matrix.model }} --max-budget 0.15 || true # no --bare: bare mode stops skills auto-triggering, so the baseline would be all false-negatives + - name: Upload baseline artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: baseline-claude-code-${{ matrix.skill }}-${{ matrix.model }} + path: evals/harnesses/claude-code/results/${{ matrix.skill }}/cell.json + retention-days: 90 + + # ── Codex — 2 models ────────────────────────────────────────────────────── + capture-codex: + name: baseline / codex / ${{ matrix.skill }} / ${{ matrix.model }} + runs-on: ubuntu-latest + strategy: +
fail-fast: false + matrix: + skill: [hawkscan, api] + model: [gpt-5.5, o3] + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.tag }} + - uses: astral-sh/setup-uv@v5 + - uses: actions/setup-node@v4 + with: + node-version: "20" + - name: Install Codex CLI + run: npm install -g @openai/codex + - name: Verify codex CLI + run: codex --version + - name: Install StackHawk skills (hawkscan + api) + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: | + printenv OPENAI_API_KEY | codex login --with-api-key; codex plugin marketplace add . # codex exec reads stored credentials, not OPENAI_API_KEY; key is piped via stdin, never passed as an arg + echo y | codex plugin add hawkscan@stackhawk + echo y | codex plugin add stackhawk-api@stackhawk + - name: Run baseline eval (${{ matrix.model }}) + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: | + uv run evals --harness codex --skill ${{ matrix.skill }} --model ${{ matrix.model }} || true + - name: Upload baseline artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: baseline-codex-${{ matrix.skill }}-${{ matrix.model }} + path: evals/harnesses/codex/results/${{ matrix.skill }}/cell.json + retention-days: 90 + + # ── Antigravity (agy) — default model ───────────────────────────────────── + capture-agy: + name: baseline / agy / ${{ matrix.skill }} / ${{ matrix.model }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + skill: [hawkscan, api] + model: [default] + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.tag }} + - uses: astral-sh/setup-uv@v5 + - name: Install agy CLI + run: curl -fsSL https://antigravity.google/cli/install.sh | bash && echo "$HOME/.local/bin" >> "$GITHUB_PATH" # /install-cli serves the HTML landing page; installer drops `agy` in ~/.local/bin + - name: Verify agy CLI + run: agy --version + - name: Install StackHawk plugins + env: + ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }} # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY + run: | + echo y | agy plugin install plugins/hawkscan + echo y | agy plugin install plugins/api + - name: Run baseline eval + env: + ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }} # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY + run: | + MODEL_ARGS=() + if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi + uv run evals --harness agy --skill ${{
matrix.skill }} "${MODEL_ARGS[@]}" || true + - name: Upload baseline artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: baseline-agy-${{ matrix.skill }}-${{ matrix.model }} + path: evals/harnesses/agy/results/${{ matrix.skill }}/cell.json + retention-days: 90 + + # ── Cursor — default model ───────────────────────────────────────────────── + capture-cursor: + name: baseline / cursor / ${{ matrix.skill }} / ${{ matrix.model }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + skill: [hawkscan, api] + model: [default] + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.tag }} + - uses: astral-sh/setup-uv@v5 + - uses: actions/setup-node@v4 + with: + node-version: "20" + - name: Install Cursor CLI + run: curl https://cursor.com/install -fsS | bash && echo "$HOME/.local/bin" >> "$GITHUB_PATH" + continue-on-error: true # official installer (the @cursor/cli / cursor-agent npm packages 404); best-effort + - name: Verify agent CLI + run: agent --version + continue-on-error: true # absence is tolerated; the baseline run below is best-effort + - name: Run baseline eval + env: + CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }} + run: | + MODEL_ARGS=() + if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi + uv run evals --harness cursor --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" || true + continue-on-error: true # best-effort + - name: Upload baseline artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: baseline-cursor-${{ matrix.skill }}-${{ matrix.model }} + path: evals/harnesses/cursor/results/${{ matrix.skill }}/cell.json + retention-days: 90 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index aa29ba8..1843daf 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -127,6 +127,24 @@ jobs: if: inputs.dry_run == true run: echo "DRY RUN complete — all checks passed for ${{ steps.version.outputs.tag }}" + capture-baseline: + name: Trigger baseline capture + needs: release + if: inputs.dry_run !=
true + runs-on: ubuntu-latest + permissions: + actions: write + steps: + - uses: actions/checkout@v4 + - name: Dispatch capture-baseline + # GITHUB_TOKEN can dispatch workflows in the same repo for most orgs. + # If org policy blocks it, swap to the TF_GITHUB_TOKEN PAT that + # update-marketplace pulls from SSM (aws ssm get-parameter --name TF_GITHUB_TOKEN). + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RELEASE_TAG: ${{ needs.release.outputs.tag }} + run: gh workflow run capture-baseline.yml --ref "$RELEASE_TAG" -f tag="$RELEASE_TAG" # --ref pins the run's head SHA to the tag commit, which is how the eval report locates the baseline run + update-marketplace: name: Update marketplace pin needs: release @@ -139,7 +157,7 @@ jobs: - name: Resolve cache run: | biodome ci restore-cache - rm -rf *.tar.lz4 + rm -rf ./*.tar.lz4 - name: Pull secrets run: biodome ci save-secrets @@ -158,7 +176,7 @@ jobs: echo "::add-mask::${GH_PAT}" git clone https://github.com/stackhawk/agent-skills-marketplace.git /tmp/marketplace git -C /tmp/marketplace remote set-url origin \ - https://x-access-token:${GH_PAT}@github.com/stackhawk/agent-skills-marketplace.git + "https://x-access-token:${GH_PAT}@github.com/stackhawk/agent-skills-marketplace.git" - name: Update marketplace.json run: | diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml index 5cc3162..e56aaab 100644 --- a/.github/workflows/skill-evals.yml +++ b/.github/workflows/skill-evals.yml @@ -1,6 +1,9 @@ name: Skill Evals on: + # Manual, on-demand only — matches origin/main's deliberate design (commit c860e47 + # "ci: remove pull_request trigger — evals run on workflow_dispatch only"). These + # evals drive real agents against tool CLIs and were never an automatic PR gate.
workflow_dispatch: inputs: skill: @@ -10,15 +13,15 @@ on: type: choice options: [hawkscan, api, both] platform: - description: "Platform to run (all = claude-code + codex + agy + cursor)" + description: "Platform to run" required: true default: "all" type: choice options: [all, claude-code, codex, agy, cursor] rubric: - description: "Run qualitative rubric grader (slower, ~$0.10 extra per run)" + description: "Also run the qualitative rubric grader (extra ANTHROPIC_API_KEY cost)" required: false - default: false + default: true type: boolean permissions: @@ -35,78 +38,141 @@ permissions: jobs: + # ── Config validation (no API keys; runs on every manual dispatch) ───────── + validate-config: + name: validate eval config + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - name: Validate prompts.yaml + process-checks.json + run: uv run validate + + # ── Unit tests (no API keys; runs on every manual dispatch) ─────────────── + pytest: + name: pytest (lib) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - name: Run lib tests + run: uv run pytest -q + # ── Claude Code ────────────────────────────────────────────────────────── eval-claude-code: - name: claude-code / ${{ matrix.skill }} + name: claude-code / ${{ matrix.skill }} / ${{ matrix.model }} runs-on: ubuntu-latest + needs: validate-config if: | - github.event_name != 'workflow_dispatch' || inputs.platform == 'all' || inputs.platform == 'claude-code' strategy: fail-fast: false matrix: - skill: [hawkscan, api] + skill: [hawkscan, api, stackhawk-data-seed] + model: [claude-sonnet-4-6, claude-opus-4-7, claude-haiku-4-5-20251001] steps: - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - uses: actions/setup-node@v4 - with: - node-version: "20" - - - name: Install Claude Code CLI - run: npm install -g @anthropic-ai/claude-code + - name: Install
Claude Code CLI (native) + run: | + curl -fsSL https://claude.ai/install.sh | bash + echo "$HOME/.local/bin" >> "$GITHUB_PATH" - name: Verify claude CLI run: claude --version - - name: Run ${{ matrix.skill }} evals + # hawk CLI is a Java app; ensure a JDK 17+ is on PATH for it. + - uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: "17" + + # Install the latest hawk via StackHawk's official action in install-only + # mode (no scan). It downloads the CLI and adds it to PATH so the hawkscan + # skill can follow its documented CLI path (hawk version/config/validate/ + # scan). Without hawk the agent improvises and never emits a hawk* signal. + - name: Install latest hawk CLI + uses: stackhawk/hawkscan-action@v2.5.0 + with: + apiKey: ${{ secrets.HAWK_API_KEY }} + version: latest + installCLIOnly: true + continue-on-error: true # install-only shouldn't need apiKey; don't let a check abort the job + + - name: Verify hawk CLI + run: hawk version + continue-on-error: true # absence is captured per-prompt in the eval traces + + # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup + # checks (hawkop app list / env list) and the entire api skill. No official + # action exists, so install the native Linux binary straight into + # /usr/local/bin (already on PATH). Version + URL per the repo's api skill + # reference (plugins/api/skills/api/references/hawkop-shortcuts.md). 
+ - name: Install hawkop CLI + run: | + set -euo pipefail + HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)" + echo "Installing hawkop v${HAWKOP_VERSION}" + curl -fLo /tmp/hawkop.tar.gz \ + "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz" + tar -xzf /tmp/hawkop.tar.gz -C /tmp + sudo mv /tmp/hawkop /usr/local/bin/hawkop + continue-on-error: true # don't abort the job — evals records any absence + - name: Verify hawkop CLI + run: hawkop --version + continue-on-error: true # absence is captured per-prompt in the eval traces + + # No --bare: --bare is "minimal mode" where skills only resolve via an + # explicit /skill-name and do NOT auto-trigger from their description, so + # natural-language prompts never fire the skill (all false-negatives). + # Full plugin mode is also the realistic user experience (hooks + skill). + - name: Run ${{ matrix.skill }} evals (${{ matrix.model }}) + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }} + API_KEY: ${{ secrets.HAWK_API_KEY }} # hawk reads API_KEY + HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }} # hawkop reads HAWKOP_API_KEY + HAWKOP_FORMAT: json + run: | + uv run evals --harness claude-code --skill ${{ matrix.skill }} \ + --model ${{ matrix.model }} --max-budget 0.15 ${{ inputs.rubric && '--rubric' || '' }} + + - name: Skill lift (compare with/without) + if: github.event_name == 'pull_request' env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} run: | - RUBRIC_FLAG="" - if [ "${{ inputs.rubric }}" = "true" ]; then - RUBRIC_FLAG="--rubric" - fi - python3 evals/harnesses/claude-code/run-evals.py \ - --skill ${{ matrix.skill }} \ - --bare \ - --max-budget 0.15 \ - $RUBRIC_FLAG + uv run compare --harness claude-code --skill ${{ matrix.skill }} \ + --model ${{ matrix.model }} --max-budget 0.15 || true - name: Upload results if: always() uses: actions/upload-artifact@v4 with: - name: 
eval-claude-code-${{ matrix.skill }} + name: eval-claude-code-${{ matrix.skill }}-${{ matrix.model }} path: evals/harnesses/claude-code/results/${{ matrix.skill }}/ retention-days: 30 # ── Codex ───────────────────────────────────────────────────────────────── eval-codex: - name: codex / ${{ matrix.skill }} + name: codex / ${{ matrix.skill }} / ${{ matrix.model }} runs-on: ubuntu-latest + needs: validate-config if: | - github.event_name != 'workflow_dispatch' || inputs.platform == 'all' || inputs.platform == 'codex' strategy: fail-fast: false matrix: - skill: [hawkscan, api] + skill: [hawkscan, api, stackhawk-data-seed] + model: [gpt-5.5, o3] steps: - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - + - uses: astral-sh/setup-uv@v5 - uses: actions/setup-node@v4 with: node-version: "20" @@ -117,6 +183,18 @@ jobs: - name: Verify codex CLI run: codex --version + - name: Install Claude Code CLI (native, rubric grader) + run: | + curl -fsSL https://claude.ai/install.sh | bash + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + # codex exec reads stored credentials, not OPENAI_API_KEY directly — without + # this it 401s ("Missing bearer"). Pipe the key via stdin (never as an arg). + - name: Authenticate codex CLI + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: printenv OPENAI_API_KEY | codex login --with-api-key + - name: Install StackHawk skills (hawkscan + api) env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -124,201 +202,310 @@ jobs: codex plugin marketplace add . echo y | codex plugin add hawkscan@stackhawk echo y | codex plugin add stackhawk-api@stackhawk + echo y | codex plugin add stackhawk-data-seed@stackhawk - - name: Run ${{ matrix.skill }} evals + # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan. 
+ - uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: "17" + - name: Install latest hawk CLI + uses: stackhawk/hawkscan-action@v2.5.0 + with: + apiKey: ${{ secrets.HAWK_API_KEY }} + version: latest + installCLIOnly: true + continue-on-error: true # install-only shouldn't need apiKey; don't let a check abort the job + - name: Verify hawk CLI + run: hawk version + continue-on-error: true # absence is captured per-prompt in the eval traces + + # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup + # checks (hawkop app list / env list) and the entire api skill. No official + # action exists, so install the native Linux binary straight into + # /usr/local/bin (already on PATH). Version + URL per the repo's api skill + # reference (plugins/api/skills/api/references/hawkop-shortcuts.md). + - name: Install hawkop CLI + run: | + set -euo pipefail + HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)" + echo "Installing hawkop v${HAWKOP_VERSION}" + curl -fLo /tmp/hawkop.tar.gz \ + "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz" + tar -xzf /tmp/hawkop.tar.gz -C /tmp + sudo mv /tmp/hawkop /usr/local/bin/hawkop + continue-on-error: true # don't abort the job — evals records any absence + - name: Verify hawkop CLI + run: hawkop --version + continue-on-error: true # absence is captured per-prompt in the eval traces + + - name: Run ${{ matrix.skill }} evals (${{ matrix.model }}) env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} # rubric grader (claude) + HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }} + API_KEY: ${{ secrets.HAWK_API_KEY }} + HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }} + HAWKOP_FORMAT: json run: | - python3 evals/harnesses/codex/run-evals.py \ - --skill ${{ matrix.skill }} + uv run evals --harness codex --skill ${{ matrix.skill }} --model ${{ matrix.model }} ${{ inputs.rubric && 
'--rubric' || '' }} - name: Upload results if: always() uses: actions/upload-artifact@v4 with: - name: eval-codex-${{ matrix.skill }} + name: eval-codex-${{ matrix.skill }}-${{ matrix.model }} path: evals/harnesses/codex/results/${{ matrix.skill }}/ retention-days: 30 # ── Antigravity (agy) — replaces Gemini ─────────────────────────────────── eval-agy: - name: agy / ${{ matrix.skill }} + name: agy / ${{ matrix.skill }} / ${{ matrix.model }} runs-on: ubuntu-latest + needs: validate-config if: | - github.event_name != 'workflow_dispatch' || inputs.platform == 'all' || inputs.platform == 'agy' strategy: fail-fast: false matrix: - skill: [hawkscan, api] + skill: [hawkscan, api, stackhawk-data-seed] + model: [default] steps: - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: "3.11" + - uses: astral-sh/setup-uv@v5 + - name: Install Claude Code CLI (native, rubric grader) + run: | + curl -fsSL https://claude.ai/install.sh | bash + echo "$HOME/.local/bin" >> "$GITHUB_PATH" - name: Install agy CLI - run: curl -fsSL https://antigravity.google/install-cli | bash + run: | + # /cli/install.sh is the real bootstrapper; /install-cli returns the + # site's HTML landing page (piping that into bash is what broke before). 
+ curl -fsSL https://antigravity.google/cli/install.sh | bash + echo "$HOME/.local/bin" >> "$GITHUB_PATH" # installer drops `agy` here + continue-on-error: true # don't abort the job — evals records any launch failure - name: Verify agy CLI run: agy --version + continue-on-error: true # if unavailable, the eval run captures it as a per-prompt error - name: Install StackHawk plugins env: - AGY_API_KEY: ${{ secrets.AGY_API_KEY }} + ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }} # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY run: | echo y | agy plugin install plugins/hawkscan echo y | agy plugin install plugins/api + echo y | agy plugin install plugins/stackhawk-data-seed + continue-on-error: true # depends on agy CLI; best-effort so evals still runs + + # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan. + - uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: "17" + - name: Install latest hawk CLI + uses: stackhawk/hawkscan-action@v2.5.0 + with: + apiKey: ${{ secrets.HAWK_API_KEY }} + version: latest + installCLIOnly: true + continue-on-error: true # install-only shouldn't need apiKey; don't let a check abort the job + - name: Verify hawk CLI + run: hawk version + continue-on-error: true # absence is captured per-prompt in the eval traces + + # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup + # checks (hawkop app list / env list) and the entire api skill. No official + # action exists, so install the native Linux binary straight into + # /usr/local/bin (already on PATH). Version + URL per the repo's api skill + # reference (plugins/api/skills/api/references/hawkop-shortcuts.md). 
+ - name: Install hawkop CLI + run: | + set -euo pipefail + HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)" + echo "Installing hawkop v${HAWKOP_VERSION}" + curl -fLo /tmp/hawkop.tar.gz \ + "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz" + tar -xzf /tmp/hawkop.tar.gz -C /tmp + sudo mv /tmp/hawkop /usr/local/bin/hawkop + continue-on-error: true # don't abort the job — evals records any absence + - name: Verify hawkop CLI + run: hawkop --version + continue-on-error: true # absence is captured per-prompt in the eval traces - name: Run ${{ matrix.skill }} evals env: - AGY_API_KEY: ${{ secrets.AGY_API_KEY }} + ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }} # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} # rubric grader (claude) + HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }} + API_KEY: ${{ secrets.HAWK_API_KEY }} + HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }} + HAWKOP_FORMAT: json run: | - python3 evals/harnesses/agy/run-evals.py \ - --skill ${{ matrix.skill }} \ - --print-timeout 240s + MODEL_ARGS=() + if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi + RUBRIC=""; if [ "${{ inputs.rubric }}" = "true" ]; then RUBRIC="--rubric"; fi + uv run evals --harness agy --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" $RUBRIC + continue-on-error: true # best-effort; digest degrades gracefully (matches cursor) - name: Upload results if: always() uses: actions/upload-artifact@v4 with: - name: eval-agy-${{ matrix.skill }} + name: eval-agy-${{ matrix.skill }}-${{ matrix.model }} path: evals/harnesses/agy/results/${{ matrix.skill }}/ retention-days: 30 # ── Cursor ──────────────────────────────────────────────────────────────── eval-cursor: - name: cursor / ${{ matrix.skill }} + name: cursor / ${{ matrix.skill }} / ${{ matrix.model }} runs-on: ubuntu-latest + needs: validate-config if: | - 
github.event_name != 'workflow_dispatch' || inputs.platform == 'all' || inputs.platform == 'cursor' strategy: fail-fast: false matrix: - skill: [hawkscan, api] + skill: [hawkscan, api, stackhawk-data-seed] + model: [default] steps: - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - uses: actions/setup-node@v4 - with: - node-version: "20" + - uses: astral-sh/setup-uv@v5 - name: Install Cursor CLI - run: npm install -g @cursor/cli || npm install -g cursor-agent - continue-on-error: true # package name TBD; update when stable + run: | + # Official installer; symlinks the `agent` binary into ~/.local/bin. + # (@cursor/cli / cursor-agent npm packages don't exist — they 404'd.) + curl https://cursor.com/install -fsS | bash + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + continue-on-error: true # best-effort; evals records any launch failure - name: Verify agent CLI run: agent --version + continue-on-error: true # absence is captured per-prompt in the eval traces + + - name: Install Claude Code CLI (native, rubric grader) + run: | + curl -fsSL https://claude.ai/install.sh | bash + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan. + - uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: "17" + - name: Install latest hawk CLI + uses: stackhawk/hawkscan-action@v2.5.0 + with: + apiKey: ${{ secrets.HAWK_API_KEY }} + version: latest + installCLIOnly: true + continue-on-error: true # install-only shouldn't need apiKey; don't let a check abort the job + - name: Verify hawk CLI + run: hawk version + continue-on-error: true # absence is captured per-prompt in the eval traces + + # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup + # checks (hawkop app list / env list) and the entire api skill. 
No official + # action exists, so install the native Linux binary straight into + # /usr/local/bin (already on PATH). Version + URL per the repo's api skill + # reference (plugins/api/skills/api/references/hawkop-shortcuts.md). + - name: Install hawkop CLI + run: | + set -euo pipefail + HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)" + echo "Installing hawkop v${HAWKOP_VERSION}" + curl -fLo /tmp/hawkop.tar.gz \ + "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz" + tar -xzf /tmp/hawkop.tar.gz -C /tmp + sudo mv /tmp/hawkop /usr/local/bin/hawkop + continue-on-error: true # don't abort the job — evals records any absence + - name: Verify hawkop CLI + run: hawkop --version + continue-on-error: true # absence is captured per-prompt in the eval traces - name: Run ${{ matrix.skill }} evals env: CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} # rubric grader (claude) + HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }} + API_KEY: ${{ secrets.HAWK_API_KEY }} + HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }} + HAWKOP_FORMAT: json run: | - python3 evals/harnesses/cursor/run-evals.py \ - --skill ${{ matrix.skill }} + MODEL_ARGS=() + if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi + RUBRIC=""; if [ "${{ inputs.rubric }}" = "true" ]; then RUBRIC="--rubric"; fi + uv run evals --harness cursor --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" $RUBRIC + continue-on-error: true # best-effort; digest degrades gracefully - name: Upload results if: always() uses: actions/upload-artifact@v4 with: - name: eval-cursor-${{ matrix.skill }} + name: eval-cursor-${{ matrix.skill }}-${{ matrix.model }} path: evals/harnesses/cursor/results/${{ matrix.skill }}/ retention-days: 30 # ── PR comment ──────────────────────────────────────────────────────────── - comment: - name: Post PR summary - needs: [eval-claude-code, 
eval-codex, eval-agy, eval-cursor] - if: always() && github.event_name == 'pull_request' + report: + name: Eval report (run summary + PR comment) + needs: [validate-config, eval-claude-code, eval-codex, eval-agy, eval-cursor] + if: always() runs-on: ubuntu-latest permissions: pull-requests: write steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 - uses: actions/download-artifact@v4 with: pattern: eval-* merge-multiple: false path: results/ - - - name: Build and post comment + - uses: astral-sh/setup-uv@v5 + - name: Fetch released baseline (best-effort) + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set +e + mkdir -p baseline + TAG=$(gh release view --json tagName -q .tagName 2>/dev/null) + if [ -z "$TAG" ]; then echo "no release yet"; exit 0; fi + SHA=$(git rev-list -n 1 "$TAG" 2>/dev/null) + RUN=$(gh run list --workflow capture-baseline.yml --json databaseId,headSha \ + -q "map(select(.headSha==\"$SHA\")) | .[0].databaseId" 2>/dev/null) + if [ -z "$RUN" ] || [ "$RUN" = "null" ]; then echo "no capture run for $TAG"; exit 0; fi + gh run download "$RUN" -p 'baseline-*' -D baseline 2>/dev/null || echo "download failed" + echo "baseline fetched for $TAG (run $RUN)" + - name: Build digest + run: uv run report --pr --results-dir results --baseline-dir baseline --lift-dir results --out digest.md + - name: Write digest to run summary + if: always() + run: cat digest.md >> "$GITHUB_STEP_SUMMARY" + - name: Post digest comment + if: github.event_name == 'pull_request' uses: actions/github-script@v7 with: script: | const fs = require('fs'); - const path = require('path'); - - const needsResult = ${{ toJSON(needs) }}; - const allSuccess = Object.values(needsResult).every(n => n.result === 'success'); - const overallIcon = allSuccess ? 
'✅' : '❌'; - - let body = `## ${overallIcon} Skill Eval Results\n\n`; - - const platforms = ['claude-code', 'codex', 'agy', 'cursor']; - const skills = ['hawkscan', 'api']; - - for (const platform of platforms) { - body += `### Platform: \`${platform}\`\n\n`; - for (const skill of skills) { - const summaryPath = path.join( - 'results', `eval-${platform}-${skill}`, 'summary.json' - ); - - if (!fs.existsSync(summaryPath)) { - body += `**\`${skill}\`**: ⚠️ No results\n`; - continue; - } - - const s = JSON.parse(fs.readFileSync(summaryPath, 'utf8')); - const ta = s.trigger_accuracy; - const triggerIcon = ta.correct === ta.total ? '✅' : '❌'; - - body += `**\`${skill}\`**: ${triggerIcon} Trigger ${ta.correct}/${ta.total}`; - if (s.process_avg_score !== null) { - const scoreIcon = s.process_avg_score >= 70 && s.total_blocking_failures === 0 ? '✅' : '⚠️'; - body += ` | ${scoreIcon} Process ${s.process_avg_score}/100`; - } - if (s.false_positives?.length) body += ` | ⚠️ FP: ${s.false_positives.join(', ')}`; - if (s.false_negatives?.length) body += ` | ⚠️ FN: ${s.false_negatives.join(', ')}`; - body += '\n'; - } - body += '\n'; - } - - body += `---\n_Commit ${context.sha.slice(0, 7)}. 
`; - body += `[Full results](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId})_\n`; - + const body = fs.readFileSync('digest.md', 'utf8'); const marker = ''; const { data: comments } = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - }); + owner: context.repo.owner, repo: context.repo.repo, + issue_number: context.issue.number }); const existing = comments.find(c => c.body.includes(marker)); - const fullBody = marker + '\n' + body; - if (existing) { - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: existing.id, - body: fullBody, - }); + await github.rest.issues.updateComment({ owner: context.repo.owner, + repo: context.repo.repo, comment_id: existing.id, body }); } else { - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: fullBody, - }); + await github.rest.issues.createComment({ owner: context.repo.owner, + repo: context.repo.repo, issue_number: context.issue.number, body }); } diff --git a/.gitignore b/.gitignore index 61cc84f..f5df676 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,8 @@ docs/superpowers/ .worktrees/ .idea/ *.iml + +# Python +__pycache__/ +*.py[cod] +.venv/ diff --git a/evals/README.md b/evals/README.md index 74ebac7..b9b3458 100644 --- a/evals/README.md +++ b/evals/README.md @@ -7,11 +7,11 @@ Evaluation assets for the `hawkscan`, `api`, and `stackhawk-data-seed` skills. 
T ``` evals/ hawkscan/ - prompts.csv # 20 trigger/no-trigger test cases for the hawkscan skill + prompts.yaml # 20 trigger/no-trigger test cases for the hawkscan skill process-checks.json # Deterministic checks: commands, files, and patterns that must (or must not) appear rubric-items.json # Qualitative rubric check definitions for style and correctness grading api/ - prompts.csv # 16 trigger/no-trigger test cases for the api skill + prompts.yaml # 16 trigger/no-trigger test cases for the api skill process-checks.json # Deterministic checks rubric-items.json # Qualitative rubric check definitions stackhawk-data-seed/ @@ -19,17 +19,19 @@ evals/ process-checks.json # Deterministic checks for discovery, dialog, artifact emission, and contract boundaries rubric-items.json # Qualitative rubric check definitions rubric-schema.json # Shared JSON Schema — constrains rubric grader output format + lib/ # Shared library: models, config, grading, harness, replay, compare, reporting + cli.py # Unified CLI entrypoints (evals, compare, regrade, validate) harnesses/ - README.md # How to build platform-specific harnesses (Codex, Claude, Gemini, etc.) + README.md # How to build platform-specific harnesses (Codex, Claude, etc.) ``` ## Three layers of evaluation -### 1. Trigger evals (`prompts.csv`) +### 1. Trigger evals (`prompts.yaml`) -Each row is a prompt with a `should_trigger` flag. Run the prompt through an agent and record whether the skill was invoked. +Each entry is a prompt with a `should_trigger` flag. Run the prompt through an agent and record whether the skill was invoked. Each prompt may also set a `budget` (cost_usd / bash_commands / output_tokens / wall_seconds) and an `expected` list (each item has exactly one of: signal / anti_pattern / check_id). -Columns: `id`, `should_trigger`, `invocation_type`, `prompt`, `notes` +Fields: `id`, `should_trigger`, `invocation_type`, `prompt`, `notes` Invocation types: - `explicit` — skill named directly (e.g. 
`$hawkscan` or `$api`) @@ -50,19 +52,63 @@ A second, read-only grader pass over the agent's output and generated files. The ## Running evals -Harnesses are platform-specific. See `harnesses/README.md` for the contract and planned implementations. +This is a uv project. All commands go through `uv run`. -**Manual checklist:** -1. Run the prompt in the target agent -2. Check the output and any generated files against `process-checks.json` — look for `signals` (must appear) and `anti_patterns` (must not appear) -3. Run a grader with the `grader_prompt` from `rubric-items.json` against the output; require JSON output conforming to `rubric-schema.json` -4. Record results per check; track scores over time to detect regressions +| Task | Command | +|---|---| +| Validate config (no keys) | `uv run validate` | +| Run a skill | `uv run evals --harness claude-code --skill hawkscan` | +| Single prompt | `uv run evals --harness claude-code --skill hawkscan --id hw-07` | +| Compare with/without skill | `uv run compare --harness claude-code --skill hawkscan` | +| Regrade a saved trace (free) | `uv run regrade <trace-file> --skill hawkscan` | + +Per-prompt config lives in `evals/<skill>/prompts.yaml`. Each prompt may set a +`budget` (cost_usd / bash_commands / output_tokens / wall_seconds) and an +`expected` list (each item has exactly one of: signal / anti_pattern / check_id). +A correct run that breaches a budget grades as PASS-SLOW. A process-check in +`process-checks.json` may carry `applies_to: [<prompt-id>, ...]` to scope it to +specific prompts (absent = applies to all). + +See `harnesses/README.md` for per-platform instructions and CI setup. + +### Reports + +**Per-job summaries.** Each `uv run evals` run renders a JUnit-style table: +one row per test, failures-first ordering, +`✅ PASS / ◆ PASS-SLOW / ❌ FAIL` verdicts. It also writes a `cell.json` +artifact in the results directory; cells do not write to `$GITHUB_STEP_SUMMARY` — +the `report` step aggregates every `cell.json` into a single table there instead.
+ +**PR digest comment.** When a PR lands, the `comment` CI job collects all +`cell.json` artifacts and runs: + +``` +uv run report --pr [--results-dir DIR] [--baseline-dir DIR] [--lift-dir DIR] [--out FILE] +``` + +This produces a consolidated Markdown digest posted as a sticky PR comment. +The digest contains: + +- **Matrix overview** — one row per (platform × skill × model) cell showing + trigger accuracy, ✅/◆/❌ verdict mix, and aggregate score. +- **Per-cell tables** — the same failures-first rows from each job summary. +- **Regression vs released-tag baseline** — the `comment` job fetches the + baseline from the most recent release's `capture-baseline.yml` run + (best-effort; missing baseline degrades gracefully to "no baseline + available"). Comparison is pure deterministic threshold math: per-test + verdict-flips (fixed / regressed) and aggregate score deltas with a ±3 + band → better / worse / no-change. No AI or LLM calls are used. +- **Skill lift section** — with-skill vs without-skill verdict comparison + showing how many prompts move from FAIL→PASS when the skill is active. + +Baselines are captured at release tags by `capture-baseline.yml`, which is +triggered automatically from `release.yml`. ## Adding test cases When a skill bug or regression is discovered: -1. Add a new row to the relevant `prompts.csv` capturing the prompt that exposed the bug +1. Add a new entry to the relevant `prompts.yaml` capturing the prompt that exposed the bug 2. If the bug was a missing process step, add a check to `process-checks.json` 3. 
If the bug was a style or qualitative issue, add a check to the relevant `rubric-items.json` diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/api/prompts.csv b/evals/api/prompts.csv deleted file mode 100644 index 93d89e1..0000000 --- a/evals/api/prompts.csv +++ /dev/null @@ -1,17 +0,0 @@ -id,should_trigger,invocation_type,prompt,notes -api-01,true,explicit,"Use the $api skill to show me my org's security posture","Direct skill reference by name" -api-02,true,implicit,"Show me the security posture of my StackHawk organization","'security posture' keyword from skill description" -api-03,true,implicit,"What are my untriaged findings across all apps?","'untriaged findings' keyword" -api-04,true,implicit,"Give me a security dashboard overview for my org","'security dashboard' keyword" -api-05,true,implicit,"Show me the scan history for my payment-api application","'scan history' keyword" -api-06,true,implicit,"What needs immediate security attention in my org?","'what needs attention' keyword" -api-07,true,contextual,"Which of my apps have HIGH severity findings that haven't been addressed?","Finding severity query; no keyword match — tests whether description alone triggers" -api-08,true,contextual,"What changed in my security findings since last week?","Delta / diff query; tests 'what's new' path in Step 4" -api-09,true,contextual,"Which apps haven't been scanned in over 30 days?","Stale app detection; tests Step 3 posture with stale-app focus" -api-10,true,contextual,"I need a security report for the team's weekly standup","Reporting use case with no API-specific keywords" -api-11,true,contextual,"Pull the full finding details for the checkout-service — the PM wants a severity breakdown","App deep dive; tests Step 4 path" -api-12,false,negative,"Run a DAST scan on my API","Scan request → hawkscan skill; 'scan' should not route to api skill" -api-13,false,negative,"Scan my app on localhost:8080 for 
vulnerabilities","Explicit scan request → hawkscan" -api-14,false,negative,"Create a stackhawk.yml for my service","Config generation → hawkscan" -api-15,false,negative,"Fix the CORS misconfiguration that HawkScan found","Code fix → hawkscan + code change; api skill reads only" -api-16,false,negative,"Run HawkScan against my staging environment","Scan request → hawkscan; 'StackHawk' keyword should not override scan intent" diff --git a/evals/api/prompts.yaml b/evals/api/prompts.yaml new file mode 100644 index 0000000..72b0534 --- /dev/null +++ b/evals/api/prompts.yaml @@ -0,0 +1,80 @@ +- id: api-01 + should_trigger: true + invocation_type: explicit + prompt: Use the $api skill to show me my org's security posture + notes: Direct skill reference by name +- id: api-02 + should_trigger: true + invocation_type: implicit + prompt: Show me the security posture of my StackHawk organization + notes: '''security posture'' keyword from skill description' +- id: api-03 + should_trigger: true + invocation_type: implicit + prompt: What are my untriaged findings across all apps? + notes: '''untriaged findings'' keyword' +- id: api-04 + should_trigger: true + invocation_type: implicit + prompt: Give me a security dashboard overview for my org + notes: '''security dashboard'' keyword' +- id: api-05 + should_trigger: true + invocation_type: implicit + prompt: Show me the scan history for my payment-api application + notes: '''scan history'' keyword' +- id: api-06 + should_trigger: true + invocation_type: implicit + prompt: What needs immediate security attention in my org? + notes: '''what needs attention'' keyword' +- id: api-07 + should_trigger: true + invocation_type: contextual + prompt: Which of my apps have HIGH severity findings that haven't been addressed? 
+ notes: Finding severity query; no keyword match — tests whether description alone triggers +- id: api-08 + should_trigger: true + invocation_type: contextual + prompt: What changed in my security findings since last week? + notes: Delta / diff query; tests 'what's new' path in Step 4 +- id: api-09 + should_trigger: true + invocation_type: contextual + prompt: Which apps haven't been scanned in over 30 days? + notes: Stale app detection; tests Step 3 posture with stale-app focus +- id: api-10 + should_trigger: true + invocation_type: contextual + prompt: I need a security report for the team's weekly standup + notes: Reporting use case with no API-specific keywords +- id: api-11 + should_trigger: true + invocation_type: contextual + prompt: Pull the full finding details for the checkout-service — the PM wants a severity breakdown + notes: App deep dive; tests Step 4 path +- id: api-12 + should_trigger: false + invocation_type: negative + prompt: Run a DAST scan on my API + notes: Scan request → hawkscan skill; 'scan' should not route to api skill +- id: api-13 + should_trigger: false + invocation_type: negative + prompt: Scan my app on localhost:8080 for vulnerabilities + notes: Explicit scan request → hawkscan +- id: api-14 + should_trigger: false + invocation_type: negative + prompt: Create a stackhawk.yml for my service + notes: Config generation → hawkscan +- id: api-15 + should_trigger: false + invocation_type: negative + prompt: Fix the CORS misconfiguration that HawkScan found + notes: Code fix → hawkscan + code change; api skill reads only +- id: api-16 + should_trigger: false + invocation_type: negative + prompt: Run HawkScan against my staging environment + notes: Scan request → hawkscan; 'StackHawk' keyword should not override scan intent diff --git a/evals/cli.py b/evals/cli.py new file mode 100644 index 0000000..764801a --- /dev/null +++ b/evals/cli.py @@ -0,0 +1,174 @@ +"""Unified eval CLI. 
Entry points: evals, compare, regrade, report, validate.""" +from __future__ import annotations +import argparse +import json +import sys +from datetime import datetime, timezone +from pathlib import Path + +from evals.lib.config import load_skill +from evals.lib.grading import grade +from evals.lib.harness import get_adapter +from evals.lib.replay import regrade as _regrade +from evals.lib.reporting import build_summary, render_table, render_compare, console +from evals.lib.compare import compare_skill + +PLATFORMS = ["claude-code", "codex", "cursor", "copilot", "agy"] +RESULTS_ROOT = Path(__file__).resolve().parent / "harnesses" # results are written under <harness>/results/<skill> + + +def _common_args(p: argparse.ArgumentParser) -> None: # flags shared by the `evals` and `compare` parsers + p.add_argument("--skill", required=True, choices=["hawkscan", "api", "stackhawk-data-seed"]) + p.add_argument("--harness", default="claude-code", choices=PLATFORMS) + p.add_argument("--id", dest="prompt_id") + p.add_argument("--model") + p.add_argument("--max-budget", type=float, default=0.20) + p.add_argument("--bare", action="store_true") + p.add_argument("--full-auto", action="store_true") + p.add_argument("--rubric", action="store_true", + help="also run the qualitative model-graded rubric (needs ANTHROPIC_API_KEY)") + + +def main() -> None: + ap = argparse.ArgumentParser(prog="evals") + _common_args(ap) + args = ap.parse_args() + + cfg = load_skill(args.skill) + adapter = get_adapter(args.harness) + plugin_dirs = [str(Path.cwd() / "plugins" / args.skill)] # resolved against the working directory, not this file + prompts = [p for p in cfg.prompts if not args.prompt_id or p.id == args.prompt_id] + if not prompts: + print(f"no prompt '{args.prompt_id}'", file=sys.stderr); sys.exit(1) + + from evals.lib.models import EvalResult, Verdict + results = [] + out_dir = RESULTS_ROOT / args.harness / "results" / args.skill + out_dir.mkdir(parents=True, exist_ok=True) + for p in prompts: + try: + run = adapter.launch(p.prompt, args.skill, p.id, plugin_dirs, + model=args.model, load_skill=True, + max_budget=args.max_budget, bare=args.bare, +
full_auto=args.full_auto) + did = adapter.detect_trigger(run, args.skill) + res = grade(p, run, cfg.checks, platform=args.harness, skill=args.skill, + did_trigger=did) + # Qualitative rubric (opt-in): grade the transcript with a claude + # grader and attach to the result so the reporter can weave it into + # the pass/fail table. Only when the skill triggered correctly — + # grading a non-triggering run against a workflow rubric is moot. + if args.rubric and res.trigger_correct and did: + from evals.lib.rubric import grade_rubric + res.rubric = grade_rubric(run, args.skill, p.id) + # persist a trace for visibility (uploaded with the artifact) + trace = (f"# {p.id} (returncode={run.returncode})\n" + f"## error\n{run.error or ''}\n" + f"## stderr_tail\n{run.stderr_tail}\n" + f"## output_text\n{run.output_text}\n" + f"## bash_commands\n" + "\n".join(run.bash_commands) + "\n") + (out_dir / f"{p.id}.trace.txt").write_text(trace) + except Exception as e: # noqa: BLE001 — never let one prompt abort the cell + res = EvalResult(platform=args.harness, skill=args.skill, run_id=p.id, + should_trigger=p.should_trigger, did_trigger=False, + trigger_correct=(not p.should_trigger), + verdict=Verdict.FAIL if p.should_trigger else Verdict.PASS, + score=0 if p.should_trigger else 100, + note=f"harness exception: {type(e).__name__}: {e}") + (out_dir / f"{p.id}.trace.txt").write_text( + f"# {p.id}\n## harness exception\n{type(e).__name__}: {e}\n") + results.append(res) + (out_dir / f"{p.id}.result.json").write_text(res.model_dump_json(indent=2)) + + render_table(results) + summary = build_summary(args.skill, args.harness, results) + summary["timestamp"] = datetime.now(timezone.utc).isoformat() + (out_dir / "summary.json").write_text(json.dumps(summary, indent=2)) + + from evals.lib.models import CellReport + import subprocess as _sp + commit = _sp.run(["git", "rev-parse", "--short", "HEAD"], capture_output=True, + text=True).stdout.strip() or "unknown" + cell = 
CellReport(platform=args.harness, skill=args.skill, + model=args.model or "default", commit=commit, results=results) + (out_dir / "cell.json").write_text(cell.model_dump_json(indent=2)) + # Note: individual cells no longer write to GITHUB_STEP_SUMMARY — the `report` + # job aggregates every cell.json into one pivot table (render_digest), so the + # run summary holds a single table instead of one per matrix cell. + + if summary["false_positives"] or summary["false_negatives"] or \ + summary["total_blocking_failures"] > 0: + sys.exit(1) + + +def compare() -> None: + ap = argparse.ArgumentParser(prog="compare") + _common_args(ap) + args = ap.parse_args() + rows = compare_skill(args.skill, args.harness, model=args.model, + max_budget=args.max_budget, bare=args.bare, + full_auto=args.full_auto, only_id=args.prompt_id) + import json + from pathlib import Path + out_dir = Path(__file__).resolve().parent / "harnesses" / args.harness / "results" / args.skill + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "lift.json").write_text(json.dumps( + [{**r, "with_verdict": r["with_verdict"].value, + "without_verdict": r["without_verdict"].value} for r in rows], indent=2)) + render_compare(rows) + + +def regrade() -> None: + ap = argparse.ArgumentParser(prog="regrade") + ap.add_argument("trace", type=Path) + ap.add_argument("--skill", required=True, choices=["hawkscan", "api", "stackhawk-data-seed"]) + ap.add_argument("--harness", default="claude-code", choices=PLATFORMS) + args = ap.parse_args() + res = _regrade(args.trace, skill=args.skill, platform=args.harness) + render_table([res]) + + +def report() -> None: + import argparse + from pathlib import Path + from evals.lib.models import CellReport + from evals.lib.reporting import render_digest + ap = argparse.ArgumentParser(prog="report") + ap.add_argument("--pr", action="store_true") + ap.add_argument("--results-dir", type=Path, default=Path("results")) + ap.add_argument("--baseline-dir", type=Path, default=None) + 
ap.add_argument("--lift-dir", type=Path, default=None) + ap.add_argument("--out", type=Path, default=Path("digest.md")) + args = ap.parse_args() + cells = [] + for cj in sorted(args.results_dir.rglob("cell.json")): + try: + cells.append(CellReport.model_validate_json(cj.read_text())) + except Exception: + continue + from evals.lib.baseline import load_baseline_dir + baselines = load_baseline_dir(args.baseline_dir) or None + lift = None + if args.lift_dir and args.lift_dir.exists(): + lift = {} + for lj in args.lift_dir.rglob("lift.json"): + sib = lj.parent / "cell.json" + if not sib.exists(): + continue + cell = CellReport.model_validate_json(sib.read_text()) + lift[(cell.platform, cell.skill, cell.model)] = json.loads(lj.read_text()) + lift = lift or None + md = render_digest(cells, baselines=baselines, lift=lift) + args.out.write_text(md) + print(f"wrote {args.out} ({len(cells)} cells)") + + +def validate() -> None: + ap = argparse.ArgumentParser(prog="validate") + ap.add_argument("--skill", choices=["hawkscan", "api", "stackhawk-data-seed"]) + args = ap.parse_args() + skills = [args.skill] if args.skill else ["hawkscan", "api", "stackhawk-data-seed"] + for skill in skills: + cfg = load_skill(skill) # raises on any validation error + console.print(f"[green]✓[/] {skill}: {len(cfg.prompts)} prompts, " + f"{len(cfg.checks)} checks valid") diff --git a/evals/harnesses/README.md b/evals/harnesses/README.md index 16d2370..04d8b2a 100644 --- a/evals/harnesses/README.md +++ b/evals/harnesses/README.md @@ -16,6 +16,8 @@ Each harness connects the platform-agnostic test cases in `evals/` to a specific ### Prerequisites +Install [uv](https://docs.astral.sh/uv/) if you don't have it — `uv run` handles dependency installation automatically, so no separate `uv sync` step is needed before running evals. 
+ Install the CLI for whichever platform you want to test: ```bash @@ -30,18 +32,18 @@ curl -fsSL https://antigravity.google/install-cli | bash # Antigravity (agy) ```bash # Requires: ANTHROPIC_API_KEY -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan -python3 evals/harnesses/claude-code/run-evals.py --skill api +uv run evals --harness claude-code --skill hawkscan +uv run evals --harness claude-code --skill api # Override model (default: claude's configured default) -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --model claude-opus-4-7 -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --model claude-haiku-4-5-20251001 +uv run evals --harness claude-code --skill hawkscan --model claude-opus-4-7 +uv run evals --harness claude-code --skill hawkscan --model claude-haiku-4-5-20251001 # Single prompt -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --id hw-07 +uv run evals --harness claude-code --skill hawkscan --id hw-07 -# Dry run (no API calls) -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --dry-run +# Config check (no API calls; `uv run evals` has no --dry-run flag) +uv run validate --skill hawkscan ``` ### Codex @@ -55,20 +57,20 @@ codex plugin add stackhawk-api@stackhawk ```bash # Requires: OPENAI_API_KEY -python3 evals/harnesses/codex/run-evals.py --skill hawkscan -python3 evals/harnesses/codex/run-evals.py --skill api +uv run evals --harness codex --skill hawkscan +uv run evals --harness codex --skill api # Override model -python3 evals/harnesses/codex/run-evals.py --skill hawkscan --model gpt-5.5 -python3 evals/harnesses/codex/run-evals.py --skill hawkscan --model o3 +uv run evals --harness codex --skill hawkscan --model gpt-5.5 +uv run evals --harness codex --skill hawkscan --model o3 ``` ### Cursor ```bash # Requires: Cursor Pro account -python3 evals/harnesses/cursor/run-evals.py --skill hawkscan -python3 evals/harnesses/cursor/run-evals.py --skill api +uv run evals --harness cursor --skill hawkscan +uv run
evals --harness cursor --skill api ``` ### Copilot @@ -76,9 +78,9 @@ python3 evals/harnesses/cursor/run-evals.py --skill api ```bash # Requires: GitHub Copilot account (gh copilot or copilot CLI) # No plugin setup needed — loads directly via --plugin-dir -python3 evals/harnesses/copilot/run-evals.py --skill hawkscan -python3 evals/harnesses/copilot/run-evals.py --skill api -python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --model gpt-5.3-codex +uv run evals --harness copilot --skill hawkscan +uv run evals --harness copilot --skill api +uv run evals --harness copilot --skill hawkscan --model gpt-5.3-codex ``` > **Best trigger detection**: Copilot emits an explicit `skill` tool call @@ -95,16 +97,23 @@ agy plugin install /path/to/agent-skills/plugins/api ```bash # Run with your main agy session idle (background tasks bleed in otherwise) -python3 evals/harnesses/agy/run-evals.py --skill hawkscan -python3 evals/harnesses/agy/run-evals.py --skill api +uv run evals --harness agy --skill hawkscan +uv run evals --harness agy --skill api -# Longer timeout for slow prompts -python3 evals/harnesses/agy/run-evals.py --skill hawkscan --print-timeout 300s +# Re-run a single slow prompt (the agy print timeout is fixed at 240s in the adapter) +uv run evals --harness agy --skill hawkscan --id hw-07 ``` +> **Shims vs adapters**: The per-platform `run-evals.py` scripts are back-compat +> shims that forward to `uv run evals`. Full stream-parsing adapter logic lives in +> `evals/harnesses/<platform>/adapter.py`; **claude-code, codex, cursor, and agy** +> all have real `adapter.py` implementations. Copilot and Gemini use the legacy +> shim path (Gemini is frozen). `uv run evals` defines no `--print-timeout` flag; +> the agy adapter always passes a fixed 240s print timeout. + ## How it works -For each row in `evals/<skill>/prompts.csv`, each harness: +For each entry in `evals/<skill>/prompts.yaml`, each harness: 1. Runs `agent -p "<prompt>"` in a fresh isolated directory 2.
Captures bash commands executed and text output @@ -122,7 +131,17 @@ For each row in `evals//prompts.csv`, each harness: ## CI -The `.github/workflows/skill-evals.yml` workflow runs Claude Code + Codex + Gemini + Cursor on every PR that touches `plugins/` or `evals/`. +The `.github/workflows/skill-evals.yml` workflow is tiered: + +- **Every PR + push**: runs `uv run validate` (no API keys required), then runs + **all four platforms** (claude-code, codex, agy, cursor). On PRs, claude-code + uses the Haiku model to stay within budget; the other platforms run their + default model. +- **Merge to main + manual dispatch**: runs the full multi-model matrix across + all platforms. +- **PR comment job**: collects `cell.json` artifacts from all platform jobs, + fetches the released-tag baseline (best-effort), and posts a consolidated + digest comment via `uv run report --pr`. Required GitHub secrets: - `ANTHROPIC_API_KEY` — Claude Code diff --git a/evals/harnesses/_manual_harness.py b/evals/harnesses/_manual_harness.py index 7b400a9..f996e44 100644 --- a/evals/harnesses/_manual_harness.py +++ b/evals/harnesses/_manual_harness.py @@ -3,13 +3,13 @@ Import this from platform-specific run-evals.py files. 
""" -import csv import json -import os import sys from datetime import datetime, timezone from pathlib import Path +from evals.lib.config import load_skill + HARNESS_ROOT = Path(__file__).parent.resolve() EVALS_DIR = HARNESS_ROOT.parent @@ -36,23 +36,22 @@ def run_manual_evals( prompt_id: str | None, rubric: bool, ) -> None: - prompts_path = EVALS_DIR / skill / "prompts.csv" - checks_path = EVALS_DIR / skill / "process-checks.json" results_dir = HARNESS_ROOT / platform / "results" / skill - with open(prompts_path) as f: - all_prompts = list(csv.DictReader(f)) - checks = json.loads(checks_path.read_text())["checks"] + cfg = load_skill(skill) + all_prompts = cfg.prompts + checks = cfg.checks blocking_checks = [c for c in checks if c.get("severity") == "blocking"] rubric_items = None if rubric: + # rubric-items.json is not yet part of evals.lib — loaded directly for now rubric_path = EVALS_DIR / skill / "rubric-items.json" if rubric_path.exists(): rubric_items = json.loads(rubric_path.read_text())["checks"] if prompt_id: - prompts = [p for p in all_prompts if p["id"] == prompt_id] + prompts = [p for p in all_prompts if p.id == prompt_id] if not prompts: print(f"ERROR: No prompt with id '{prompt_id}'", file=sys.stderr) sys.exit(1) @@ -70,11 +69,11 @@ def run_manual_evals( all_results = [] for row in prompts: - run_id = row["id"] - prompt = row["prompt"] - should_trigger = row["should_trigger"].lower() == "true" - itype = row.get("invocation_type", "") - notes = row.get("notes", "") + run_id = row.id + prompt = row.prompt + should_trigger = row.should_trigger + itype = row.invocation_type + notes = row.notes print(f"\n{'─' * 68}") print(f"[{run_id}] {itype:<12} should_trigger={'Y' if should_trigger else 'N'}") diff --git a/evals/harnesses/agy/adapter.py b/evals/harnesses/agy/adapter.py new file mode 100644 index 0000000..f00e2a2 --- /dev/null +++ b/evals/harnesses/agy/adapter.py @@ -0,0 +1,172 @@ +"""agy Harness adapter. Plain-text output (no structured stream). 
+ +Pre-shim (5472ed2~1:evals/harnesses/agy/run-evals.py) notes: +- agy outputs plain text — no --output-format flag available. +- Trigger detection scans output_text only; no bash_commands ever populated. +- Skills installed globally via `agy plugin install` (done in CI); load_skill + toggling is a no-op here. +- AGY_API_KEY passed via os.environ (CI sets it); no special env handling needed. +- Launch: agy -p --print-timeout [--model M] +- The pre-shim used a unified ALL_SIGNALS dict (no CLI/INVOCATION split) with + SKILL: prefix signals. Those are carried in INVOCATION_SIGNALS below alongside + the backtick-evaluation-format signals shared by codex/cursor adapters. +""" +from __future__ import annotations +import shutil +import subprocess +import tempfile + +from evals.lib.models import ParsedRun +from evals.lib.triggers import explicit_decision, decide_trigger +from evals.lib.observe import observe_suffix + +# CLI_SIGNALS: agy emits plain text — there are no shell commands to scan. +CLI_SIGNALS: dict[str, list[str]] = { + "hawkscan": [], + "api": [], + "stackhawk-data-seed": [], +} + +# INVOCATION_SIGNALS: checked against output_text. +# Combines the pre-shim ALL_SIGNALS (SKILL: prefix variants) with the +# evaluation-format backtick signals used by the shared skill prompts. 
+INVOCATION_SIGNALS: dict[str, list[str]] = { + "hawkscan": [ + # Pre-shim ALL_SIGNALS (verbatim from 5472ed2~1:evals/harnesses/agy/run-evals.py) + "skill: hawkscan", + "skill:hawkscan", + # Evaluation-format variants emitted by the shared skill evaluation suffix + "hawkscan:hawkscan`: yes", + "hawkscan:hawkscan` — yes", + "hawkscan:hawkscan**: yes", + "hawkscan:hawkscan** — yes", + "hawkscan:hawkscan: yes", + "hawkscan:hawkscan — yes", + # Action-intent phrases + "autonomous security scan", + "dast scan after code", + "dast scan triggered", + "dast scan required", + "security scan required", + "security scan after", + "run the security scan", + "running the hawkscan", + "running the security scan", + ], + "api": [ + # Pre-shim ALL_SIGNALS (verbatim) + "skill: api", + "skill:api", + "skill: stackhawk-api", + # Evaluation-format variants + "stackhawk-api:api`: yes", + "stackhawk-api:api` — yes", + "stackhawk-api:api: yes", + "stackhawk-api:api — yes", + ], + "stackhawk-data-seed": [ + "skill: stackhawk-data-seed", + "skill:stackhawk-data-seed", + "stackhawk-data-seed:stackhawk-data-seed`: yes", + "stackhawk-data-seed:stackhawk-data-seed: yes", + "stackhawk-data-seed:stackhawk-data-seed — yes", + "stackhawk-data-seed: yes", "stackhawk-data-seed — yes", + "seed data for hawkscan", "seed this repo", "minimum seed entities", + "data seed complete", "data-seed/manifest", + ], +} + +# Pre-shim default --print-timeout was 180s; raised to 240s here for headroom. +PRINT_TIMEOUT = "240s" + +# A declaration suffix is appended to every prompt before invoking agy (the pre-shim +# run-evals.py carried its own OBSERVE_SUFFIX). In --print mode agy hangs on tool +# approvals, so the suffix asks the agent to declare its skill choice up front — that +# declaration is what explicit_decision + INVOCATION_SIGNALS detect. Without it, +# live agy runs produce no detectable trigger text (all false-negatives).
agy now +# uses the shared per-skill observe suffix (evals/lib/observe.py), aligning its +# declaration format and workflow-enumeration ask with the other harnesses. + + +def parse_stream(raw: str) -> ParsedRun: + """agy outputs plain text — wrap entirely in output_text; no commands to parse.""" + return ParsedRun(output_text=raw.strip()) + + +class AgyAdapter: + platform = "agy" + + def cli_signals(self, skill: str) -> list[str]: + return CLI_SIGNALS.get(skill, []) + + def invocation_signals(self, skill: str) -> list[str]: + return INVOCATION_SIGNALS.get(skill, []) + + def parse_stream(self, raw: str) -> ParsedRun: + return parse_stream(raw) + + def detect_trigger(self, run: ParsedRun, skill: str) -> bool: + # agy is text-only; CLI signals may appear in prose too, so check both + # lists against the combined text. An explicit decline still overrides a + # loose phrase match (e.g. the agent quoting a "don't scan" instruction). + hay = (" ".join(run.bash_commands) + " " + run.output_text).lower() + cli_hit = any(s.lower() in hay for s in self.cli_signals(skill)) + loose = any(s.lower() in hay for s in self.invocation_signals(skill)) + return decide_trigger(executed_cli=cli_hit, + declared=explicit_decision(run.output_text, skill), + loose_hit=loose) + + def launch( + self, + prompt: str, + skill: str, + run_id: str, + plugin_dirs: list[str], + *, + model: str | None, + load_skill: bool, + max_budget: float, + bare: bool, + full_auto: bool, + ) -> ParsedRun: + # Skills are installed globally via `agy plugin install` in CI; + # load_skill toggling is a no-op here. + tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_") + try: + # --print mode hangs on tool approvals; the suffix makes agy declare + # its skill choice up front so detect_trigger has text to match. agy is + # text-only (no real execution), so observe mode is its only mode. 
+ effective_prompt = prompt + observe_suffix(skill) + cmd = ["agy", "-p", effective_prompt, "--print-timeout", PRINT_TIMEOUT] + if model: + cmd += ["--model", model] + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=420, + cwd=tmpdir, + ) + except subprocess.TimeoutExpired: + return ParsedRun(error="timeout") + run = parse_stream(proc.stdout) + run.returncode = proc.returncode + run.stderr_tail = (proc.stderr or "")[-2000:] + # agy has no non-interactive auth (relies on OAuth; see upstream + # google-antigravity/antigravity-cli#78). In a browser-less CI runner + # it prints an auth URL and times out. Label that distinctly so the + # digest doesn't read it as an eval/plumbing failure on our side. + blob = (run.output_text + " " + run.stderr_tail).lower() + if "authentication required" in blob or "authentication timed out" in blob: + run.error = "agy: no headless auth (upstream antigravity-cli#78) — not runnable in CI" + elif proc.returncode != 0 and not run.error: + run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}" + elif not run.output_text and not run.bash_commands and not run.error: + run.error = f"empty output (exit {proc.returncode})" + return run + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +ADAPTER = AgyAdapter() diff --git a/evals/harnesses/agy/run-evals.py b/evals/harnesses/agy/run-evals.py index c485b1d..52d7fd7 100644 --- a/evals/harnesses/agy/run-evals.py +++ b/evals/harnesses/agy/run-evals.py @@ -1,375 +1,11 @@ #!/usr/bin/env python3 -""" -Antigravity (agy) eval harness for StackHawk agent skills. - -Uses `agy -p --print-timeout` (headless mode). Skills are installed via: - agy plugin install /path/to/agent-skills/plugins/hawkscan - agy plugin install /path/to/agent-skills/plugins/api - -agy outputs plain text (no --output-format stream-json), so trigger detection -scans the full text output for CLI signals and skill-invocation phrases. 
- -Usage: - python3 evals/harnesses/agy/run-evals.py --skill hawkscan - python3 evals/harnesses/agy/run-evals.py --skill api - python3 evals/harnesses/agy/run-evals.py --skill hawkscan --id hw-07 - python3 evals/harnesses/agy/run-evals.py --skill hawkscan --dry-run - -Requirements: - - agy CLI installed and authenticated - - StackHawk plugins installed: - agy plugin install /path/to/agent-skills/plugins/hawkscan - agy plugin install /path/to/agent-skills/plugins/api - - Run from the agent-skills repo root - -Known limitations: - - agy connects to a shared server process. Background tasks from your - main agy session can bleed into eval runs — run evals when your main - agy session is idle. - - Some contextual prompts take >180s; use --print-timeout to increase. - - Process check scores will be low (agy in print mode doesn't execute - full workflows). -""" - -import argparse -import csv -import json -import os -import re -import subprocess +"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/. +Run `uv run evals --harness agy --skill ` instead. +This shim forwards old invocations to the new CLI.""" import sys -import tempfile -import shutil -from datetime import datetime, timezone -from pathlib import Path - -# --------------------------------------------------------------------------- -# Paths -# --------------------------------------------------------------------------- -HARNESS_DIR = Path(__file__).parent.resolve() -EVALS_DIR = HARNESS_DIR.parent.parent -REPO_ROOT = EVALS_DIR.parent -RESULTS_DIR = HARNESS_DIR / "results" - -# --------------------------------------------------------------------------- -# Trigger signals -# agy outputs plain text, so ALL signals are searched against output_text. -# CLI_SIGNALS: hawk/hawkop commands that appear in agent's description of work. -# INVOCATION_SIGNALS: phrases the agent uses when explicitly invoking a skill. 
-# --------------------------------------------------------------------------- -ALL_SIGNALS = { - # Explicit skill declarations injected by the OBSERVE_SUFFIX. - # The suffix asks the agent to state 'SKILL: hawkscan', 'SKILL: api', or 'SKILL: none'. - # This is far more reliable than inferring intent from CLI command mentions. - "hawkscan": [ - "skill: hawkscan", - "skill:hawkscan", - ], - "api": [ - "skill: api", - "skill:api", - "skill: stackhawk-api", - ], -} - -# Negative signals — if these appear, the agent is explicitly NOT using the skill -NEGATIVE_SIGNALS = { - "hawkscan": [ - # Agent explicitly declines the scan - "i cannot run", - "i can't run", - "cannot perform a scan", - "not able to scan", - "no application to scan", - ], - "api": [], -} - - -# --------------------------------------------------------------------------- -# Text parsing — agy outputs plain text, not JSONL -# --------------------------------------------------------------------------- - -def parse_output(text: str) -> dict: - return { - "bash_commands": [], # no JSON tool calls in agy text mode - "files_written": [], - "output_text": text.strip(), - "usage": {}, - "error": None, - } - - -# --------------------------------------------------------------------------- -# Trigger detection — text-only approach -# --------------------------------------------------------------------------- - -def detect_trigger(parsed: dict, skill: str) -> bool: - haystack = parsed["output_text"].lower() - if not haystack: - return False - return any(s.lower() in haystack for s in ALL_SIGNALS.get(skill, [])) - - -# --------------------------------------------------------------------------- -# Process checks -# --------------------------------------------------------------------------- - -def run_process_checks(parsed: dict, checks: list) -> list[dict]: - haystack = " ".join([ - *parsed["bash_commands"], - parsed["output_text"], - ]).lower() - all_files = " ".join(parsed["files_written"]).lower() - - results = [] 
- for check in checks: - ctype = check.get("type", "command_executed") - signals = [s.lower() for s in check.get("signals", [])] - antis = [a.lower() for a in check.get("anti_patterns", [])] - - signal_hit = next((s for s in signals if s in haystack), None) - anti_hit = next((a for a in antis if a in haystack), None) - - if ctype in ("command_negative", "file_content_negative", "output_negative"): - passed = anti_hit is None - elif ctype == "file_absent": - target = check.get("target_file", "").lower() - passed = target not in all_files - elif ctype == "conditional_command": - m = re.search(r"'([^']+)'", check.get("condition", "")) - condition_keyword = m.group(1).lower() if m else None - if condition_keyword and condition_keyword not in haystack: - passed = True - else: - passed = signal_hit is not None - elif ctype == "command_preference": - preferred = [p.lower() for p in check.get("preferred", [])] - passed = any(p in haystack for p in preferred) and anti_hit is None - else: - passed = signal_hit is not None - if antis: - passed = passed and anti_hit is None - - results.append({ - "id": check["id"], - "pass": passed, - "severity": check.get("severity", "warning"), - "signal_found": signal_hit, - "anti_found": anti_hit, - }) - return results - - -def score_checks(results: list[dict]) -> dict: - blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking") - warning_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "warning") - return { - "total": len(results), - "passed": sum(1 for r in results if r["pass"]), - "blocking_failed": blocking_failed, - "warning_failed": warning_failed, - "score": max(0, 100 - blocking_failed * 15 - warning_failed * 5), - } - - -# --------------------------------------------------------------------------- -# Run agy -# --------------------------------------------------------------------------- - -OBSERVE_SUFFIX = ( - "\n\n(Eval mode: before responding, state which skill you would 
invoke: " - "'SKILL: hawkscan', 'SKILL: api', or 'SKILL: none'. Then proceed with your response.)" -) - - -def run_agy( - prompt: str, - skill: str, - run_id: str, - model: str | None = None, - print_timeout: str = "120s", - observe: bool = True, -) -> tuple[dict, int]: - # In observe mode, append a suffix so agy describes its plan without - # blocking on tool call approvals (which hang forever in --print mode). - effective_prompt = prompt + OBSERVE_SUFFIX if observe else prompt - tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")) - try: - cmd = ["agy", "-p", effective_prompt, "--print-timeout", print_timeout] - if model: - cmd += ["--model", model] - - proc = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=int(print_timeout.rstrip("s")) + 30, - cwd=str(tmpdir), - env={**os.environ}, - ) - - trace_dir = RESULTS_DIR / skill - trace_dir.mkdir(parents=True, exist_ok=True) - (trace_dir / f"{run_id}.txt").write_text(proc.stdout) - - parsed = parse_output(proc.stdout) - if proc.returncode != 0 and not parsed["output_text"]: - stderr = proc.stderr.strip() - if stderr: - parsed["error"] = stderr[:300] - - return parsed, proc.returncode - - except subprocess.TimeoutExpired: - return {"bash_commands": [], "files_written": [], "output_text": "", - "usage": {}, "error": "timeout"}, 1 - except FileNotFoundError: - print("ERROR: 'agy' CLI not found.", file=sys.stderr) - sys.exit(1) - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - -def main() -> None: - parser = argparse.ArgumentParser( - description="Antigravity (agy) eval harness for StackHawk agent skills", - ) - parser.add_argument("--skill", required=True, choices=["hawkscan", "api"]) - parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID") - parser.add_argument("--dry-run", action="store_true") - 
parser.add_argument("--model", metavar="MODEL_ID", - help="Model override (passed to agy --model)") - parser.add_argument("--print-timeout", default="180s", - help="Per-prompt timeout for agy (default: 180s)") - args = parser.parse_args() - - skill = args.skill - prompts_path = EVALS_DIR / skill / "prompts.csv" - checks_path = EVALS_DIR / skill / "process-checks.json" - - with open(prompts_path) as f: - all_prompts = list(csv.DictReader(f)) - checks = json.loads(checks_path.read_text())["checks"] - - if args.prompt_id: - prompts = [p for p in all_prompts if p["id"] == args.prompt_id] - if not prompts: - print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr) - sys.exit(1) - else: - prompts = all_prompts - - model_label = f" | Model: {args.model}" if args.model else "" - print(f"\nSkill: {skill} | Platform: agy | Mode: observe{model_label} | Prompts: {len(prompts)}") - if args.dry_run: - print("[dry-run — no agy calls]") - print("─" * 68) - - all_results = [] - - for row in prompts: - run_id = row["id"] - prompt = row["prompt"] - should_trigger = row["should_trigger"].lower() == "true" - itype = row.get("invocation_type", "") - - print(f"\n[{run_id}] {itype:<12} trigger={'Y' if should_trigger else 'N'}") - print(f" {prompt[:92]}{'…' if len(prompt) > 92 else ''}") - - if args.dry_run: - print(" → skipped") - continue - - parsed, _exit = run_agy( - prompt, skill, run_id, - model=args.model, - print_timeout=args.print_timeout, - observe=True, - ) - - if parsed.get("error"): - print(f" ERROR: {parsed['error']}") - - did_trigger = detect_trigger(parsed, skill) - trigger_ok = did_trigger == should_trigger - - process_results: list[dict] = [] - scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0} - if should_trigger and did_trigger: - process_results = run_process_checks(parsed, checks) - scoring = score_checks(process_results) - - result = { - "platform": "agy", - "skill": skill, - "run_id": run_id, - "prompt": prompt, 
- "should_trigger": should_trigger, - "did_trigger": did_trigger, - "trigger_correct": trigger_ok, - "bash_commands": parsed["bash_commands"], - "files_written": parsed["files_written"], - "process_checks": process_results, - "scoring": scoring, - "timestamp": datetime.now(timezone.utc).isoformat(), - } - all_results.append(result) - - out_dir = RESULTS_DIR / skill - out_dir.mkdir(parents=True, exist_ok=True) - (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2)) - - t_icon = "✓" if trigger_ok else "✗" - score_str = f"score={scoring['score']}/100" if process_results else "—" - print(f" {t_icon} did_trigger={did_trigger} {score_str}") - for pr in process_results: - if not pr["pass"] and pr["severity"] == "blocking": - print(f" BLOCKING FAIL: {pr['id']}") - - if args.dry_run or not all_results: - return - - trigger_correct = sum(1 for r in all_results if r["trigger_correct"]) - total = len(all_results) - false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]] - false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]] - proc_runs = [r for r in all_results if r["process_checks"]] - avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs) - if proc_runs else None) - total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0 - - print("\n" + "═" * 68) - print(f"SUMMARY skill={skill} platform=agy") - print(f" Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)") - if false_pos: - print(f" False positives : {', '.join(r['run_id'] for r in false_pos)}") - if false_neg: - print(f" False negatives : {', '.join(r['run_id'] for r in false_neg)}") - if avg_score is not None: - print(f" Process avg score: {avg_score}/100 (blocking failures: {total_blocking})") - print(f" Results in : {RESULTS_DIR / skill}/") - - summary = { - "skill": skill, "platform": "agy", - "timestamp": datetime.now(timezone.utc).isoformat(), - 
"trigger_accuracy": {"correct": trigger_correct, "total": total}, - "false_positives": [r["run_id"] for r in false_pos], - "false_negatives": [r["run_id"] for r in false_neg], - "process_avg_score": avg_score, - "total_blocking_failures": total_blocking, - "runs": [{"run_id": r["run_id"], "trigger_correct": r["trigger_correct"], - "score": r["scoring"]["score"]} for r in all_results], - } - (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2)) - - if false_pos or false_neg or total_blocking > 0: - sys.exit(1) - +from evals.cli import main if __name__ == "__main__": + if "--harness" not in sys.argv: + sys.argv += ["--harness", "agy"] main() diff --git a/evals/harnesses/claude-code/README.md b/evals/harnesses/claude-code/README.md index e84b0c3..b0246ae 100644 --- a/evals/harnesses/claude-code/README.md +++ b/evals/harnesses/claude-code/README.md @@ -5,71 +5,65 @@ Runs the StackHawk skill eval suite against Claude Code's non-interactive CLI (` ## Prerequisites - **Claude Code CLI** installed and authenticated: `claude --version` -- **Python 3.11+**: `python3 --version` +- **Python 3.11+** with `uv`: `uv run evals --help` - Run from the **agent-skills repo root** (plugin dirs are auto-detected) -## How it works +## Invocation -For each row in `evals//prompts.csv`: +```bash +# Run all prompts for a skill (preferred) +uv run evals --harness claude-code --skill hawkscan +uv run evals --harness claude-code --skill api -1. Runs `claude -p "" --output-format stream-json --plugin-dir plugins/` - in a fresh temp directory (isolated, no state leakage between runs) -2. Parses the JSONL event stream to extract bash commands, files written, and output text -3. Detects whether the skill triggered (skill-specific command patterns in the trace) -4. If the skill should have triggered and did: runs deterministic checks from - `evals//process-checks.json` against the captured trace -5. 
Saves `results//.jsonl` (raw trace) and `results//.result.json` (scored) +# Run a specific model +uv run evals --harness claude-code --skill hawkscan --model claude-haiku-4-5-20251001 -Optionally, `--rubric` runs a second `claude -p` call as a qualitative grader, using -`evals//rubric-items.json` and enforcing `evals/rubric-schema.json` via `--json-schema`. +# Cap spend per run (default: $0.20) +uv run evals --harness claude-code --skill hawkscan --max-budget 0.10 -## Usage +# Full-auto mode: agent executes commands (--dangerously-skip-permissions) +uv run evals --harness claude-code --skill hawkscan --full-auto -```bash -# Run all prompts for a skill -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan -python3 evals/harnesses/claude-code/run-evals.py --skill api +# Suppress progress UI (used in CI) +uv run evals --harness claude-code --skill hawkscan --bare +``` -# Run a single prompt by ID -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --id hw-07 +`run-evals.py` in this directory is a back-compat shim that forwards to `uv run evals --harness claude-code`. Use the `uv run evals` form going forward. -# Dry run — print prompts without calling claude -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --dry-run +## Config source -# Full-auto mode: agent can actually execute commands (--dangerously-skip-permissions) -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --full-auto +Prompts and trigger labels are loaded from `evals//prompts.yaml` (not prompts.csv — the CSV was removed during the YAML migration). Process checks come from `evals//process-checks.json`. -# Also run the qualitative rubric grader (extra cost + ~30s per run) -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --rubric +## How it works -# Cap spend per run (default: $0.20) -python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --max-budget 0.10 -``` +For each prompt in `evals//prompts.yaml`: + +1. 
`ClaudeCodeAdapter.launch()` runs `claude -p "" --output-format stream-json --plugin-dir plugins/` in a fresh temp directory (isolated, no state leakage between runs). The raw stdout is parsed in-memory; no raw `.jsonl` file is persisted. +2. `parse_stream()` extracts bash commands, files written/edited, output text, and cost from the JSONL event stream. +3. `detect_trigger()` checks whether the skill triggered using CLI command signals (e.g. `hawk scan`) and invocation-phrase signals in the output text. +4. If the skill should have triggered and did, process checks from `process-checks.json` are run against the captured trace. +5. A verdict (`pass`, `pass-slow`, or `fail`) is assigned and an `EvalResult` is written to `results//.result.json`. ## Two modes ### Observe mode (default) -The agent runs normally but permissions are not bypassed. It will plan and narrate what -it would do — including bash commands it intends to execute — without necessarily -running them. Trigger detection and most process checks work because the agent names -the commands in its output even when execution is blocked. +Permissions are not bypassed. The agent plans and narrates what it would do — including bash commands it intends to run — without necessarily executing them. Trigger detection and most process checks still work because the agent names the commands in its output. -**Use for:** trigger accuracy checks, output quality checks, rubric grading. +**Use for:** trigger accuracy checks, output quality checks, CI. ### Full-auto mode (`--full-auto`) -Passes `--dangerously-skip-permissions` so the agent can actually execute bash commands, -write files, and run `hawk` CLI calls. Results are more accurate for process checks that -require real execution (e.g. `hawk validate config` was actually run and passed). +Passes `--dangerously-skip-permissions` so the agent can actually execute bash commands, write files, and run `hawk` CLI calls. 
Results are more accurate for process checks that require real execution. -**Use for:** end-to-end process verification when `hawk` CLI is installed and a target app -is available. Run in a trusted, isolated environment — not on a production machine. +**Use for:** end-to-end process verification when `hawk` CLI is installed and a target app is available. Run in a trusted, isolated environment. ## Understanding results ### Per-run result file (`results//.result.json`) +Conforms to the `EvalResult` Pydantic model (`evals/lib/models.py`): + ```json { "platform": "claude-code", @@ -78,67 +72,51 @@ is available. Run in a trusted, isolated environment — not on a production mac "should_trigger": true, "did_trigger": true, "trigger_correct": true, - "bash_commands": ["hawk version", "hawkop app list", "hawk validate config stackhawk.yml", "hawk scan --json-output"], - "files_written": ["stackhawk.yml"], + "verdict": "pass", + "budget_breaches": [], "process_checks": [ - { "id": "preflight_version_check", "pass": true, "severity": "blocking", "signal_found": "hawk version" }, - { "id": "step2_no_local_yml_created", "pass": true, "severity": "blocking", "signal_found": null } + { "id": "preflight_version_check", "passed": true, "severity": "blocking", "signal_found": "hawk version", "anti_found": null }, + { "id": "step2_no_local_yml_created", "passed": true, "severity": "blocking", "signal_found": null, "anti_found": null } ], - "scoring": { - "total": 22, - "passed": 20, - "blocking_failed": 1, - "warning_failed": 1, - "score": 80 - }, - "rubric_result": null, + "score": 100, "cost_usd": 0.048 } ``` ### Summary file (`results//summary.json`) -Written after a full run. Tracks trigger accuracy, process score, false positives/negatives, -and per-run scores — useful for comparing skill versions over time. +Written after a full run. Tracks trigger accuracy, process score, false positives/negatives, and per-run scores. 
### Scoring -| Check type | Deduction per failure | +| Check type | Deduction per failure | |---|---| -| `blocking` | −15 points | -| `warning` | −5 points | +| `blocking` | −15 points | +| `warning` | −5 points | -`overall_pass` in rubric results requires score ≥ 70 and zero blocking failures. +Verdict is `pass` if trigger is correct and score ≥ 70 with zero blocking failures; `pass-slow` if correct but over budget; `fail` otherwise. ### Process checks only run when the skill should have triggered and did -If `should_trigger=false` and the skill correctly did not fire, no process checks run — -there is no workflow to grade. The run scores as a trigger-accuracy pass only. +If `should_trigger=false` and the skill correctly did not fire, no process checks run — there is no workflow to grade. -## Raw traces +## adapter.py -Each run saves the raw `claude --output-format stream-json` JSONL to -`results//.jsonl`. Open it to debug false negatives or unexpected behavior: +`ClaudeCodeAdapter` (`adapter.py`) implements the `HarnessAdapter` protocol for this platform: -```bash -# See all bash commands the agent attempted -jq -r 'select(.type=="assistant") | .message.content[] | select(.type=="tool_use" and .name=="Bash") | .input.command' \ - results/hawkscan/hw-07.jsonl -``` +- `parse_stream(raw)` — parses `claude --output-format stream-json` JSONL into a `ParsedRun` +- `detect_trigger(run, skill)` — checks CLI command signals and invocation-phrase signals +- `launch(prompt, skill, run_id, ...)` — spawns `claude -p` in a temp directory, captures stdout in-memory, and returns a `ParsedRun` ## CI usage -The harness exits non-zero if trigger accuracy falls below 100% or any blocking check -fails. 
Wire it into CI after bumping a skill version to catch regressions: - ```yaml - name: Run skill evals - run: | - python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan - python3 evals/harnesses/claude-code/run-evals.py --skill api env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + uv run evals --harness claude-code --skill hawkscan --bare --max-budget 0.15 + uv run evals --harness claude-code --skill api --bare --max-budget 0.15 ``` -Note: CI runs are in observe mode by default (no `--full-auto`), which avoids needing -a live `hawk` CLI or running application. Add `--full-auto` only in a dedicated sandbox. +CI runs use observe mode by default (no `--full-auto`), which avoids needing a live `hawk` CLI or running application. diff --git a/evals/harnesses/claude-code/adapter.py b/evals/harnesses/claude-code/adapter.py new file mode 100644 index 0000000..3c70b79 --- /dev/null +++ b/evals/harnesses/claude-code/adapter.py @@ -0,0 +1,150 @@ +"""claude-code Harness adapter. Parsing + signal lists ported from run-evals.py.""" +from __future__ import annotations +import json +import shutil +import subprocess +import tempfile + +from evals.lib.models import ParsedRun +from evals.lib.triggers import explicit_decision, decide_trigger +from evals.lib.observe import observe_suffix + +CLI_SIGNALS = { + # Scan-distinctive commands only. `hawk version`/`hawk config`/`hawk init` are + # generic preflight an agent runs while merely *assessing* the environment (even + # for a non-scan request), so they over-trigger; rely on scan commands or the + # explicit decision line instead. 
+ "hawkscan": ["hawk scan", "hawk validate", "hawk rescan", + "hawk create app", "hawk perch"], + "api": ["hawkop scan", "hawkop app", "hawkop org", "hawkop env", "hawkop status", + "hawkop init", "/api/v1/scan", "/api/v2/org", "hawk_api GET"], + # data-seed emits checked-in artifacts rather than running a distinctive CLI; + # its discovery + emission paths are the executable signals. + "stackhawk-data-seed": ["data-seed/", "data-seed/manifest", ".data-seed-credentials", + "manifest.yaml"], +} + +INVOCATION_SIGNALS = { + "hawkscan": [ + "hawkscan:hawkscan`: yes", "hawkscan:hawkscan` — yes", "hawkscan:hawkscan**: yes", + "hawkscan:hawkscan** — yes", "hawkscan:hawkscan: yes", "hawkscan:hawkscan — yes", + "hawkscan:hawkscan - yes", "hawkscan:hawkscan - **yes", "hawkscan**: yes", + "hawkscan** — yes", "hawkscan** - yes", "hawkscan: yes", "hawkscan — yes", + "hawkscan - yes", "autonomous security scan", "dast scan after code", + "dast scan triggered", "dast scan required", "security scan required", + "security scan after", "run the security scan", "running the hawkscan", + ], + "api": [ + "stackhawk-api:api`: yes", "stackhawk-api:api` — yes", "stackhawk-api:api**: yes", + "stackhawk-api:api** — yes", "stackhawk-api:api: yes", "stackhawk-api:api — yes", + "stackhawk-api:api - yes", "stackhawk-api**: yes", "stackhawk-api** — yes", + "stackhawk-api** - yes", "stackhawk-api: yes", "stackhawk-api — yes", + "stackhawk-api - yes", + ], + "stackhawk-data-seed": [ + "stackhawk-data-seed:stackhawk-data-seed`: yes", + "stackhawk-data-seed:stackhawk-data-seed` — yes", + "stackhawk-data-seed:stackhawk-data-seed**: yes", + "stackhawk-data-seed:stackhawk-data-seed** — yes", + "stackhawk-data-seed:stackhawk-data-seed: yes", + "stackhawk-data-seed:stackhawk-data-seed — yes", + "stackhawk-data-seed:stackhawk-data-seed - yes", + "stackhawk-data-seed**: yes", "stackhawk-data-seed** — yes", + "stackhawk-data-seed** - yes", "stackhawk-data-seed: yes", + "stackhawk-data-seed — yes", 
"stackhawk-data-seed - yes", + "seed data for hawkscan", "seed this repo", "minimum seed entities", + "seed entities required", "data seed complete", "data-seed/manifest", + ], +} + +# Observe-mode suffix is shared across all harnesses (per-skill). See +# evals/lib/observe.py for the rationale and wording. + + +def parse_stream(raw: str) -> ParsedRun: + bash, written, edited, text, cost, err = [], [], [], "", 0.0, None + for line in raw.splitlines(): + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + etype = event.get("type", "") + if etype == "assistant": + for block in event.get("message", {}).get("content", []): + bt = block.get("type", "") + if bt == "text": + text += block.get("text", "") + "\n" + elif bt == "tool_use": + name, inp = block.get("name", ""), block.get("input", {}) + if name == "Bash" and inp.get("command"): + bash.append(inp["command"]) + elif name == "Write" and inp.get("file_path"): + written.append(inp["file_path"]) + elif name == "Edit" and inp.get("file_path"): + edited.append(inp["file_path"]) + elif etype == "result": + cost = event.get("total_cost_usd") or event.get("cost_usd") or 0.0 + text += event.get("result", "") + if event.get("subtype") == "error_during_execution": + err = event.get("result", "unknown error") + return ParsedRun(bash_commands=bash, files_written=written, files_edited=edited, + output_text=text.strip(), cost_usd=cost, error=err) + + +class ClaudeCodeAdapter: + platform = "claude-code" + + def cli_signals(self, skill): return CLI_SIGNALS.get(skill, []) + def invocation_signals(self, skill): return INVOCATION_SIGNALS.get(skill, []) + def parse_stream(self, raw): return parse_stream(raw) + + def detect_trigger(self, run: ParsedRun, skill: str) -> bool: + cli = " ".join(run.bash_commands).lower() + executed = any(s.lower() in cli for s in self.cli_signals(skill)) + text = run.output_text.lower() + loose = any(s.lower() in text for s in 
self.invocation_signals(skill)) + return decide_trigger(executed_cli=executed, + declared=explicit_decision(run.output_text, skill), + loose_hit=loose) + + def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill, + max_budget, bare, full_auto) -> ParsedRun: + tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_") + try: + # Observe mode (default): ask the agent to declare + outline its + # workflow. Full-auto/extended runs against a real target execute for + # real, so they use the bare prompt. + effective_prompt = prompt if full_auto else prompt + observe_suffix(skill) + cmd = ["claude", "-p", effective_prompt, "--output-format", "stream-json", + "--verbose", "--no-session-persistence", + "--max-budget-usd", str(max_budget)] + if model: + cmd += ["--model", model] + if load_skill: + for pd in plugin_dirs: + cmd += ["--plugin-dir", pd] + if full_auto: + cmd.append("--dangerously-skip-permissions") + if bare: + cmd.append("--bare") + try: + proc = subprocess.run(cmd, capture_output=True, text=True, + timeout=300, cwd=tmpdir) + except subprocess.TimeoutExpired: + return ParsedRun(error="timeout") + run = parse_stream(proc.stdout) + run.returncode = proc.returncode + run.stderr_tail = (proc.stderr or "")[-2000:] + if proc.returncode != 0 and not run.error: + run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}" + elif not run.output_text and not run.bash_commands and not run.error: + run.error = f"empty output (exit {proc.returncode})" + return run + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +ADAPTER = ClaudeCodeAdapter() diff --git a/evals/harnesses/claude-code/run-evals.py b/evals/harnesses/claude-code/run-evals.py index 6d8679f..9489d2b 100644 --- a/evals/harnesses/claude-code/run-evals.py +++ b/evals/harnesses/claude-code/run-evals.py @@ -1,650 +1,11 @@ #!/usr/bin/env python3 -""" -Claude Code eval harness for StackHawk agent skills. 
- -Usage: - python3 run-evals.py --skill hawkscan # all prompts - python3 run-evals.py --skill api # all prompts - python3 run-evals.py --skill hawkscan --id hw-07 # single prompt - python3 run-evals.py --skill hawkscan --dry-run # print prompts, no claude calls - python3 run-evals.py --skill hawkscan --full-auto # allow agent to execute commands - python3 run-evals.py --skill hawkscan --rubric # also run qualitative rubric grader - python3 run-evals.py --skill hawkscan --bare # CI mode: ANTHROPIC_API_KEY only, no keychain - -Requirements: - - claude CLI installed and authenticated (https://claude.ai/code) - - Run from the agent-skills repo root (plugin dirs are auto-detected) - -Output: - evals/harnesses/claude-code/results//.jsonl raw trace - evals/harnesses/claude-code/results//.result.json scored result - evals/harnesses/claude-code/results//summary.json run summary -""" - -import argparse -import csv -import json -import os -import shutil -import subprocess +"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/. +Run `uv run evals --harness claude-code --skill ` instead. +This shim forwards old invocations to the new CLI.""" import sys -import tempfile -from datetime import datetime, timezone -from pathlib import Path - -# --------------------------------------------------------------------------- -# Paths -# --------------------------------------------------------------------------- -HARNESS_DIR = Path(__file__).parent.resolve() -EVALS_DIR = HARNESS_DIR.parent.parent -REPO_ROOT = EVALS_DIR.parent -RESULTS_DIR = HARNESS_DIR / "results" - -# --------------------------------------------------------------------------- -# Trigger signals -# Any of these appearing in bash commands or output text means the skill fired. 
-# --------------------------------------------------------------------------- -# CLI signals — checked against bash_commands only (prevents documentation content -# from creating false positives when the agent writes README/guides about HawkScan). -CLI_SIGNALS = { - "hawkscan": [ - "hawk scan", - "hawk validate", - "hawk rescan", - # "hawk version" intentionally excluded: running 'hawk version' alone is common - # for installation-check tasks and would cause false positives. The preflight - # workflow always runs 'hawk config --help' in the same command, so 'hawk config' - # below is sufficient to distinguish scan-intent from install-check tasks. - "hawk config", - "hawk create app", - "hawk init", - "hawk perch", - ], - "api": [ - "hawkop scan", - "hawkop app", - "hawkop org", - "hawkop env", - "hawkop status", - "hawkop init", - "/api/v1/scan", - "/api/v2/org", - "hawk_api GET", - ], -} - -# Invocation signals — checked against output_text only. Catches contextual prompts -# where the agent correctly identifies the skill should trigger and says so explicitly, -# but can't reach the CLI workflow (empty working dir, no running app, etc.). -# -# These are intentionally specific to action-intent phrases, NOT the generic -# "hawkscan:hawkscan: yes" pattern (which also fires on educational/informational -# responses where the agent answers "what does HawkScan detect?" type questions). -INVOCATION_SIGNALS = { - "hawkscan": [ - # Generic YES-evaluation signals — catch any run where the agent explicitly - # evaluates hawkscan as YES regardless of phrasing. Models vary in their markdown - # formatting: backtick (`` `hawkscan:hawkscan` ``), bold (**hawkscan:hawkscan**), - # or plain text. Each produces a different character sequence around `: YES`. - # Safe because SKILL.md now instructs NO for educational questions (hw-20), - # doc-only changes (hw-16/17/18), installation tasks (hw-19), and explicit skips. 
- "hawkscan:hawkscan`: yes", # "`hawkscan:hawkscan`: YES" — backtick + colon (Sonnet/Haiku) - "hawkscan:hawkscan` — yes", # "`hawkscan:hawkscan` — YES" — backtick + em-dash - "hawkscan:hawkscan**: yes", # "**hawkscan:hawkscan**: YES" — bold + colon - "hawkscan:hawkscan** — yes", # "**hawkscan:hawkscan** — YES" — bold + em-dash - "hawkscan:hawkscan: yes", # "hawkscan:hawkscan: YES" — plain colon - "hawkscan:hawkscan — yes", # "hawkscan:hawkscan — YES" — em-dash - "hawkscan:hawkscan - yes", # "hawkscan:hawkscan - YES" — plain hyphen (Opus 4.7) - "hawkscan:hawkscan - **yes", # "hawkscan:hawkscan - **YES**" — bold YES (Opus 4.7) - # Plugin name only — Opus 4.7 sometimes omits :hawkscan suffix - "hawkscan**: yes", # "**hawkscan**: YES" — bold, no skill suffix - "hawkscan** — yes", # bold + em-dash, no skill suffix - "hawkscan** - yes", # "**hawkscan:hawkscan** - YES" — bold name + hyphen (Opus) - "hawkscan: yes", # plain colon, no skill suffix - "hawkscan — yes", # em-dash, no skill suffix - "hawkscan - yes", # plain hyphen, no skill suffix - # Specific action-intent phrases as belt-and-suspenders for unusual formats - "autonomous security scan", - "dast scan after code", - "dast scan triggered", - "dast scan required", - "security scan required", - "security scan after", - "run the security scan", - "running the hawkscan", - ], - "api": [ - # Full skill name (plugin:skill) — Sonnet/Haiku format - "stackhawk-api:api`: yes", # backtick + colon - "stackhawk-api:api` — yes", # backtick + em-dash - "stackhawk-api:api**: yes", # bold + colon - "stackhawk-api:api** — yes", # bold + em-dash - "stackhawk-api:api: yes", # plain colon - "stackhawk-api:api — yes", # em-dash - "stackhawk-api:api - yes", # plain hyphen (Opus 4.7) - # Plugin name only (Opus 4.7 sometimes omits :api suffix) - "stackhawk-api**: yes", # bold + colon, no skill suffix - "stackhawk-api** — yes", # bold + em-dash, no skill suffix - "stackhawk-api** - yes", # bold + plain hyphen, no skill suffix (Opus) - 
"stackhawk-api: yes", # plain colon, no skill suffix - "stackhawk-api — yes", # em-dash, no skill suffix - "stackhawk-api - yes", # plain hyphen, no skill suffix - ], -} - -# --------------------------------------------------------------------------- -# Stream-json parsing -# --------------------------------------------------------------------------- - -def parse_stream(jsonl: str) -> dict: - """Extract structured data from a claude --output-format stream-json run.""" - bash_commands: list[str] = [] - files_written: list[str] = [] - files_edited: list[str] = [] - output_text = "" - cost_usd = 0.0 - error = None - - for line in jsonl.splitlines(): - line = line.strip() - if not line: - continue - try: - event = json.loads(line) - except json.JSONDecodeError: - continue - - etype = event.get("type", "") - - if etype == "assistant": - for block in event.get("message", {}).get("content", []): - btype = block.get("type", "") - if btype == "text": - output_text += block.get("text", "") + "\n" - elif btype == "tool_use": - name = block.get("name", "") - inp = block.get("input", {}) - if name == "Bash": - cmd = inp.get("command", "") - if cmd: - bash_commands.append(cmd) - elif name == "Write": - path = inp.get("file_path", "") - if path: - files_written.append(path) - elif name == "Edit": - path = inp.get("file_path", "") - if path: - files_edited.append(path) - - elif etype == "result": - cost_usd = event.get("cost_usd") or 0.0 - output_text += event.get("result", "") - if event.get("subtype") == "error_during_execution": - error = event.get("result", "unknown error") - - return { - "bash_commands": bash_commands, - "files_written": files_written, - "files_edited": files_edited, - "output_text": output_text.strip(), - "cost_usd": cost_usd, - "error": error, - } - - -# --------------------------------------------------------------------------- -# Trigger detection -# --------------------------------------------------------------------------- - -def detect_trigger(parsed: 
dict, skill: str) -> bool: - # CLI signals are checked only against actual bash commands executed — prevents - # documentation content (README guides, educational answers) from triggering. - cli_haystack = " ".join(parsed["bash_commands"]).lower() - if any(s.lower() in cli_haystack for s in CLI_SIGNALS.get(skill, [])): - return True - - # Invocation signals are checked only against output text — catches cases where - # the agent evaluated the skill as YES but couldn't run CLI commands (e.g. empty - # working dir, permission blocks on hawkop, no running app). - text_haystack = parsed["output_text"].lower() - return any(s.lower() in text_haystack for s in INVOCATION_SIGNALS.get(skill, [])) - - -# --------------------------------------------------------------------------- -# Process checks -# --------------------------------------------------------------------------- - -def run_process_checks(parsed: dict, checks: list) -> list[dict]: - haystack = " ".join([ - *parsed["bash_commands"], - parsed["output_text"], - ]).lower() - all_files = " ".join(parsed["files_written"] + parsed["files_edited"]).lower() - - results = [] - for check in checks: - ctype = check.get("type", "command_executed") - signals = [s.lower() for s in check.get("signals", [])] - antis = [a.lower() for a in check.get("anti_patterns", [])] - - signal_hit = next((s for s in signals if s in haystack), None) - anti_hit = next((a for a in antis if a in haystack), None) - - if ctype in ("command_negative", "file_content_negative", "output_negative"): - passed = anti_hit is None - elif ctype == "file_absent": - target = check.get("target_file", "").lower() - passed = target not in all_files - elif ctype == "conditional_command": - # Only enforce when the condition's keyword appears in the trace. - # Extract the keyword inside single quotes from the condition string, - # e.g. 
"stackhawk.yml contains 'authentication:'" → "authentication:" - import re as _re - condition_str = check.get("condition", "") - m = _re.search(r"'([^']+)'", condition_str) - condition_keyword = m.group(1).lower() if m else None - if condition_keyword and condition_keyword not in haystack: - passed = True # condition not met — check is not applicable - else: - passed = signal_hit is not None - elif ctype == "command_preference": - preferred = [p.lower() for p in check.get("preferred", [])] - passed = any(p in haystack for p in preferred) and anti_hit is None - else: - passed = signal_hit is not None - if antis: - passed = passed and anti_hit is None - - results.append({ - "id": check["id"], - "pass": passed, - "severity": check.get("severity", "warning"), - "signal_found": signal_hit, - "anti_found": anti_hit, - }) - - return results - - -def score_checks(results: list[dict]) -> dict: - blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking") - warning_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "warning") - score = max(0, 100 - blocking_failed * 15 - warning_failed * 5) - return { - "total": len(results), - "passed": sum(1 for r in results if r["pass"]), - "blocking_failed": blocking_failed, - "warning_failed": warning_failed, - "score": score, - } - - -# --------------------------------------------------------------------------- -# Run claude -p -# --------------------------------------------------------------------------- - -def run_claude( - prompt: str, - skill: str, - run_id: str, - plugin_dirs: list[str], - full_auto: bool = False, - bare: bool = False, - max_budget: float = 0.20, - model: str | None = None, -) -> tuple[dict, int]: - # Each eval runs in a fresh temp dir so there is no state leakage. 
- tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_") - try: - cmd = [ - "claude", "-p", prompt, - "--output-format", "stream-json", - "--verbose", - "--no-session-persistence", - "--max-budget-usd", str(max_budget), - ] - if model: - cmd += ["--model", model] - for pd in plugin_dirs: - cmd += ["--plugin-dir", pd] - if full_auto: - cmd.append("--dangerously-skip-permissions") - if bare: - cmd.append("--bare") - - proc = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=300, - cwd=tmpdir, - ) - - trace_dir = RESULTS_DIR / skill - trace_dir.mkdir(parents=True, exist_ok=True) - (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout) - - return parse_stream(proc.stdout), proc.returncode - - except subprocess.TimeoutExpired: - return { - "bash_commands": [], "files_written": [], "files_edited": [], - "output_text": "", "cost_usd": 0.0, "error": "timeout", - }, 1 - except FileNotFoundError: - print( - "ERROR: 'claude' CLI not found. " - "Install Claude Code (https://claude.ai/code) and ensure it is in PATH.", - file=sys.stderr, - ) - sys.exit(1) - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - -# --------------------------------------------------------------------------- -# Rubric grader (qualitative, model-assisted, optional) -# --------------------------------------------------------------------------- - -def run_rubric_grader( - parsed: dict, - skill: str, - run_id: str, - plugin_dirs: list[str], - bare: bool = False, -) -> dict | None: - rubric_path = EVALS_DIR / skill / "rubric-items.json" - schema_path = EVALS_DIR / "rubric-schema.json" - if not rubric_path.exists() or not schema_path.exists(): - print(" [rubric] rubric-items.json or rubric-schema.json not found — skipping", - file=sys.stderr) - return None - - rubric_data = json.loads(rubric_path.read_text()) - schema = json.loads(schema_path.read_text()) - - grader_prompt = f"""{rubric_data['grader_prompt']} - -## Bash Commands Executed: -{json.dumps(parsed['bash_commands'], 
indent=2)} - -## Files Written/Edited: -{json.dumps(parsed['files_written'] + parsed['files_edited'], indent=2)} - -## Agent Output (first 4000 chars): -{parsed['output_text'][:4000]} - -## Rubric Checks to Grade: -{json.dumps(rubric_data['checks'], indent=2)} - -Populate the JSON result with: - skill = "{skill}" - run_id = "{run_id}" - overall_pass = true if all checks pass and score >= 70 - score = 0-100 - checks = one entry per check id listed above""" - - cmd = [ - "claude", "-p", grader_prompt, - "--output-format", "json", - "--no-session-persistence", - "--json-schema", json.dumps(schema), - "--max-budget-usd", "0.10", - ] - for pd in plugin_dirs: - cmd += ["--plugin-dir", pd] - if bare: - cmd.append("--bare") - - try: - proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120) - envelope = json.loads(proc.stdout) - # --output-format json wraps the response: {"result": "", ...} - raw_result = envelope.get("result", "{}") - if isinstance(raw_result, dict): - return raw_result - return json.loads(raw_result) - except Exception as exc: - print(f" [rubric] grader failed: {exc}", file=sys.stderr) - return None - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - -def main() -> None: - parser = argparse.ArgumentParser( - description="Claude Code eval harness for StackHawk agent skills", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument("--skill", required=True, choices=["hawkscan", "api"]) - parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID", - help="Run a single prompt by id (e.g. 
hw-07)") - parser.add_argument("--dry-run", action="store_true", - help="Print prompts without calling claude") - parser.add_argument("--rubric", action="store_true", - help="Run qualitative rubric grader after process checks (extra cost + time)") - parser.add_argument("--full-auto", action="store_true", - help="Pass --dangerously-skip-permissions so the agent can execute commands") - parser.add_argument("--bare", action="store_true", - help="Pass --bare to claude: ANTHROPIC_API_KEY only, no keychain/hooks/CLAUDE.md (recommended for CI)") - parser.add_argument("--max-budget", type=float, default=0.20, metavar="USD", - help="Max spend per eval run in USD (default: 0.20)") - parser.add_argument("--plugin-dir", action="append", dest="plugin_dirs", - help="Plugin dir to load; auto-detected from repo root if omitted") - parser.add_argument("--model", metavar="MODEL_ID", - help="Override the Claude model (e.g. claude-haiku-4-5-20251001, claude-sonnet-4-6)") - args = parser.parse_args() - - skill = args.skill - plugin_dirs = args.plugin_dirs or [str(REPO_ROOT / "plugins" / skill)] - - prompts_path = EVALS_DIR / skill / "prompts.csv" - checks_path = EVALS_DIR / skill / "process-checks.json" - - with open(prompts_path) as f: - all_prompts = list(csv.DictReader(f)) - checks = json.loads(checks_path.read_text())["checks"] - - if args.prompt_id: - prompts = [p for p in all_prompts if p["id"] == args.prompt_id] - if not prompts: - print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr) - sys.exit(1) - else: - prompts = all_prompts - - mode = "full-auto" if args.full_auto else "observe" - if args.bare: - mode += "+bare" - model_label = f" | Model: {args.model}" if args.model else "" - print(f"\nSkill: {skill} | Platform: claude-code | Mode: {mode}{model_label} | Prompts: {len(prompts)}") - if args.dry_run: - print("[dry-run — no claude calls]") - print("─" * 68) - - all_results = [] - total_cost = 0.0 - - for row in prompts: - run_id = row["id"] - prompt = 
row["prompt"] - should_trigger = row["should_trigger"].lower() == "true" - itype = row.get("invocation_type", "") - - print(f"\n[{run_id}] {itype:<12} trigger={'Y' if should_trigger else 'N'}") - print(f" {prompt[:92]}{'…' if len(prompt) > 92 else ''}") - - if args.dry_run: - print(" → skipped") - continue - - parsed, _exit = run_claude( - prompt, skill, run_id, plugin_dirs, - full_auto=args.full_auto, - bare=args.bare, - max_budget=args.max_budget, - model=args.model, - ) - total_cost += parsed.get("cost_usd", 0.0) - - if parsed.get("error"): - print(f" ERROR: {parsed['error']}") - - did_trigger = detect_trigger(parsed, skill) - trigger_ok = did_trigger == should_trigger - - process_results: list[dict] = [] - scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0} - if should_trigger and did_trigger: - process_results = run_process_checks(parsed, checks) - scoring = score_checks(process_results) - - rubric_result = None - if args.rubric and should_trigger and did_trigger: - print(" [rubric] grading…", end=" ", flush=True) - rubric_result = run_rubric_grader(parsed, skill, run_id, plugin_dirs, bare=args.bare) - print(f"score={rubric_result.get('score', '?')}" if rubric_result else "failed") - - result = { - "platform": "claude-code", - "skill": skill, - "run_id": run_id, - "prompt": prompt, - "should_trigger": should_trigger, - "did_trigger": did_trigger, - "trigger_correct": trigger_ok, - "bash_commands": parsed["bash_commands"], - "files_written": parsed["files_written"], - "process_checks": process_results, - "scoring": scoring, - "rubric_result": rubric_result, - "cost_usd": parsed.get("cost_usd", 0.0), - "timestamp": datetime.now(timezone.utc).isoformat(), - } - all_results.append(result) - - out_dir = RESULTS_DIR / skill - out_dir.mkdir(parents=True, exist_ok=True) - (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2)) - - t_icon = "✓" if trigger_ok else "✗" - score_str = 
f"score={scoring['score']}/100" if process_results else "—" - print(f" {t_icon} did_trigger={did_trigger} {score_str} ${parsed.get('cost_usd', 0):.3f}") - - for pr in process_results: - if not pr["pass"] and pr["severity"] == "blocking": - print(f" BLOCKING FAIL: {pr['id']}") - - if args.dry_run or not all_results: - return - - # ── Final summary ────────────────────────────────────────────────────── - trigger_correct = sum(1 for r in all_results if r["trigger_correct"]) - total = len(all_results) - false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]] - false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]] - process_runs = [r for r in all_results if r["process_checks"]] - avg_score = (sum(r["scoring"]["score"] for r in process_runs) // len(process_runs) - if process_runs else None) - total_blocking = (sum(r["scoring"]["blocking_failed"] for r in process_runs) - if process_runs else 0) - - print("\n" + "═" * 68) - print(f"SUMMARY skill={skill} platform=claude-code") - print(f" Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)") - if false_pos: - print(f" False positives : {', '.join(r['run_id'] for r in false_pos)}") - if false_neg: - print(f" False negatives : {', '.join(r['run_id'] for r in false_neg)}") - if avg_score is not None: - print(f" Process avg score: {avg_score}/100 (blocking failures: {total_blocking})") - print(f" Total cost : ${total_cost:.3f}") - print(f" Results in : {RESULTS_DIR / skill}/") - - summary = { - "skill": skill, - "platform": "claude-code", - "timestamp": datetime.now(timezone.utc).isoformat(), - "trigger_accuracy": {"correct": trigger_correct, "total": total}, - "false_positives": [r["run_id"] for r in false_pos], - "false_negatives": [r["run_id"] for r in false_neg], - "process_avg_score": avg_score, - "total_blocking_failures": total_blocking, - "total_cost_usd": round(total_cost, 4), - "runs": [ - { - "run_id": r["run_id"], - 
"trigger_correct": r["trigger_correct"], - "score": r["scoring"]["score"], - "cost_usd": r["cost_usd"], - } - for r in all_results - ], - } - (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2)) - - # ── GitHub Actions step summary ──────────────────────────────────────── - step_summary_path = os.environ.get("GITHUB_STEP_SUMMARY") - if step_summary_path: - _write_step_summary( - step_summary_path, skill, all_results, - false_pos, false_neg, avg_score, total_blocking, total_cost, - ) - - # ── Exit non-zero for CI on any regression ───────────────────────────── - if false_pos or false_neg or total_blocking > 0: - sys.exit(1) - - -def _write_step_summary( - path: str, - skill: str, - results: list[dict], - false_pos: list[dict], - false_neg: list[dict], - avg_score: int | None, - total_blocking: int, - total_cost: float, -) -> None: - correct = sum(1 for r in results if r["trigger_correct"]) - total = len(results) - trigger_icon = "✅" if correct == total else "❌" - score_icon = "✅" if (avg_score or 0) >= 70 and total_blocking == 0 else "❌" - - lines = [ - f"## Skill Eval: `{skill}` (claude-code)\n", - "| Metric | Value |", - "|---|---|", - f"| Trigger accuracy | {trigger_icon} {correct}/{total} |", - ] - if false_pos: - lines.append(f"| False positives | ⚠️ {', '.join(r['run_id'] for r in false_pos)} |") - if false_neg: - lines.append(f"| False negatives | ⚠️ {', '.join(r['run_id'] for r in false_neg)} |") - if avg_score is not None: - lines.append(f"| Process avg score | {score_icon} {avg_score}/100 |") - lines.append(f"| Blocking failures | {'❌' if total_blocking else '✅'} {total_blocking} |") - lines.append(f"| Total cost | ${total_cost:.3f} |") - lines.append("") - - # Per-run table - lines += [ - "
Per-run results\n", - "| ID | Trigger | Score | Cost |", - "|---|---|---|---|", - ] - for r in results: - t = "✅" if r["trigger_correct"] else "❌" - score = r["scoring"]["score"] if r["process_checks"] else "—" - lines.append(f"| {r['run_id']} | {t} | {score} | ${r['cost_usd']:.3f} |") - lines.append("\n
\n") - - with open(path, "a") as f: - f.write("\n".join(lines) + "\n") - +from evals.cli import main if __name__ == "__main__": + if "--harness" not in sys.argv: + sys.argv += ["--harness", "claude-code"] main() diff --git a/evals/harnesses/codex/adapter.py b/evals/harnesses/codex/adapter.py new file mode 100644 index 0000000..7196d48 --- /dev/null +++ b/evals/harnesses/codex/adapter.py @@ -0,0 +1,185 @@ +"""codex Harness adapter. Parsing + signals ported from pre-shim run-evals.py.""" +from __future__ import annotations +import json +import os +import shutil +import subprocess +import tempfile + +from evals.lib.models import ParsedRun +from evals.lib.triggers import explicit_decision, decide_trigger +from evals.lib.observe import observe_suffix + +# CLI signals — checked against bash_commands only (prevents documentation content +# from creating false positives when the agent writes README/guides about HawkScan). +CLI_SIGNALS = { + # Scan-distinctive commands only — generic preflight (hawk version/config/init) + # over-triggers when the agent merely assesses the environment for a non-scan + # request. Triggering falls back to the explicit decision line otherwise. + "hawkscan": [ + "hawk scan", + "hawk validate", + "hawk rescan", + "hawk create app", + "hawk perch", + ], + # Signals specific to the api reporting workflow — avoids false positives + # from hawkop status/app/env commands that the hawkscan skill also runs. + "api": [ + "hawkop scan get", # api Step 4: app deep dive + "hawkop org get", # api Step 1: establish orgId + "hawkop org set", # api Step 1: switch org + "/api/v2/org", # api Step 3: org posture endpoint (hawkop doesn't wrap it) + "/api/v1/scan", # api Step 4: raw scan drill-down + "hawk_api GET", # api raw API helper function + ], + # data-seed emits checked-in artifacts rather than a distinctive CLI. 
+ "stackhawk-data-seed": ["data-seed/", "data-seed/manifest", ".data-seed-credentials", + "manifest.yaml"], +} + +# Invocation signals — checked against output_text only. In full-auto mode these are +# belt-and-suspenders: the agent usually runs CLI commands directly. They catch +# contextual prompts where the skill fires but the agent finds an empty working dir +# and stops before reaching the CLI (same as observe mode in Claude Code harness). +INVOCATION_SIGNALS = { + "hawkscan": [ + # All markdown formatting variants the model uses around `: YES` or ` — YES` + "hawkscan:hawkscan`: yes", # backtick + colon + "hawkscan:hawkscan` — yes", # backtick + dash + "hawkscan:hawkscan**: yes", # bold + colon + "hawkscan:hawkscan** — yes", # bold + dash + "hawkscan:hawkscan: yes", # plain colon + "hawkscan:hawkscan — yes", # plain dash + # Specific action-intent phrases + "autonomous security scan", + "dast scan after code", + "dast scan triggered", + "dast scan required", + "security scan required", + "security scan after", + "run the security scan", + "running the hawkscan", + ], + "api": [ + "stackhawk-api:api`: yes", + "stackhawk-api:api` — yes", + "stackhawk-api:api: yes", + "stackhawk-api:api — yes", + ], + "stackhawk-data-seed": [ + "stackhawk-data-seed:stackhawk-data-seed`: yes", + "stackhawk-data-seed:stackhawk-data-seed` — yes", + "stackhawk-data-seed:stackhawk-data-seed**: yes", + "stackhawk-data-seed:stackhawk-data-seed** — yes", + "stackhawk-data-seed:stackhawk-data-seed: yes", + "stackhawk-data-seed:stackhawk-data-seed — yes", + "stackhawk-data-seed: yes", "stackhawk-data-seed — yes", + "seed data for hawkscan", "seed this repo", "minimum seed entities", + "seed entities required", "data seed complete", "data-seed/manifest", + ], +} + + +def parse_stream(raw: str) -> ParsedRun: + cmds, out, otok, err, seen = [], "", 0, None, set() + for line in raw.splitlines(): + line = line.strip() + if not line: + continue + try: + ev = json.loads(line) + except 
json.JSONDecodeError: + continue + t = ev.get("type", "") + if t == "item.started": + it = ev.get("item", {}) + if it.get("type") == "command_execution": + c = it.get("command", "") + if c and c not in seen: + cmds.append(c) + seen.add(c) + elif t == "item.completed": + it = ev.get("item", {}) + if it.get("type") in ("message", "agent_message"): + txt = it.get("text", "") + if txt: + out += txt + "\n" + content = it.get("content", "") + if isinstance(content, str): + out += content + "\n" + elif isinstance(content, list): + for b in content: + if isinstance(b, dict) and b.get("type") == "text": + out += b.get("text", "") + "\n" + elif t == "turn.completed": + otok += ev.get("usage", {}).get("output_tokens", 0) + elif t == "error": + err = ev.get("message", "unknown error") + return ParsedRun(bash_commands=cmds, output_text=out.strip(), + output_tokens=otok or None, error=err) + + +class CodexAdapter: + platform = "codex" + + def cli_signals(self, skill): return CLI_SIGNALS.get(skill, []) + def invocation_signals(self, skill): return INVOCATION_SIGNALS.get(skill, []) + def parse_stream(self, raw): return parse_stream(raw) + + def detect_trigger(self, run: ParsedRun, skill: str) -> bool: + cli = " ".join(run.bash_commands).lower() + executed = any(s.lower() in cli for s in self.cli_signals(skill)) + text = run.output_text.lower() + loose = any(s.lower() in text for s in self.invocation_signals(skill)) + return decide_trigger(executed_cli=executed, + declared=explicit_decision(run.output_text, skill), + loose_hit=loose) + + def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill, + max_budget, bare, full_auto) -> ParsedRun: + tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_") + try: + # In CI the bubblewrap sandbox can't initialize (Ubuntu 24.04 blocks + # unprivileged user namespaces), so codex exits at sandbox startup + # before running any command — the agent can't reach hawk. 
Bypass the + # sandbox there; it's safe on an ephemeral runner in a throwaway tmpdir, + # and the agent needs write+exec to run the skill workflow anyway. + # Locally, keep the real sandbox (workspace-write for full-auto, + # else read-only). Passing --sandbox twice makes codex exit 2. + if os.environ.get("CI"): + cmd = [ + "codex", "exec", "--json", + "--dangerously-bypass-approvals-and-sandbox", + "--skip-git-repo-check", + ] + else: + sandbox = "workspace-write" if full_auto else "read-only" + cmd = [ + "codex", "exec", "--json", + "--sandbox", sandbox, + "--skip-git-repo-check", + ] + if model: + cmd += ["-m", model] + # Observe mode: append the per-skill walkthrough suffix. Full-auto / + # extended runs against a real target use the bare prompt. + cmd.append(prompt if full_auto else prompt + observe_suffix(skill)) + try: + proc = subprocess.run(cmd, capture_output=True, text=True, + timeout=300, cwd=tmpdir) + except subprocess.TimeoutExpired: + return ParsedRun(error="timeout") + run = parse_stream(proc.stdout) + run.returncode = proc.returncode + run.stderr_tail = (proc.stderr or "")[-2000:] + if proc.returncode != 0 and not run.error: + run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}" + elif not run.output_text and not run.bash_commands and not run.error: + run.error = f"empty output (exit {proc.returncode})" + return run + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +ADAPTER = CodexAdapter() diff --git a/evals/harnesses/codex/run-evals.py b/evals/harnesses/codex/run-evals.py index 3c0828f..24df734 100644 --- a/evals/harnesses/codex/run-evals.py +++ b/evals/harnesses/codex/run-evals.py @@ -1,592 +1,11 @@ #!/usr/bin/env python3 -""" -Codex eval harness for StackHawk agent skills. 
- -Usage: - python3 run-evals.py --skill hawkscan # all prompts - python3 run-evals.py --skill api # all prompts - python3 run-evals.py --skill hawkscan --id hw-07 # single prompt - python3 run-evals.py --skill hawkscan --dry-run # print prompts, no codex calls - python3 run-evals.py --skill hawkscan --rubric # also run qualitative rubric grader - -Requirements: - - codex CLI installed and authenticated (https://openai.com/codex) - - Run from the agent-skills repo root - -Output: - evals/harnesses/codex/results//.jsonl raw JSONL trace - evals/harnesses/codex/results//.result.json scored result - evals/harnesses/codex/results//summary.json run summary -""" - -import argparse -import csv -import json -import os -import shutil -import subprocess +"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/. +Run `uv run evals --harness codex --skill ` instead. +This shim forwards old invocations to the new CLI.""" import sys -import tempfile -from datetime import datetime, timezone -from pathlib import Path - -# --------------------------------------------------------------------------- -# Paths -# --------------------------------------------------------------------------- -HARNESS_DIR = Path(__file__).parent.resolve() -EVALS_DIR = HARNESS_DIR.parent.parent -REPO_ROOT = EVALS_DIR.parent -RESULTS_DIR = HARNESS_DIR / "results" - -# --------------------------------------------------------------------------- -# Trigger signals -# --------------------------------------------------------------------------- -# CLI signals — checked against bash_commands only (prevents documentation content -# from creating false positives when the agent writes README/guides about HawkScan). -CLI_SIGNALS = { - "hawkscan": [ - "hawk scan", - "hawk validate", - "hawk rescan", - # "hawk version" excluded: running 'hawk version' alone is common for - # installation-check tasks and would cause false positives. 
The preflight - # workflow always also runs 'hawk config --help', so 'hawk config' below suffices. - "hawk config", - "hawk create app", - "hawk init", - "hawk perch", - ], - # Signals specific to the api reporting workflow — avoids false positives - # from hawkop status/app/env commands that the hawkscan skill also runs. - "api": [ - "hawkop scan get", # api Step 4: app deep dive - "hawkop org get", # api Step 1: establish orgId - "hawkop org set", # api Step 1: switch org - "/api/v2/org", # api Step 3: org posture endpoint (hawkop doesn't wrap it) - "/api/v1/scan", # api Step 4: raw scan drill-down - "hawk_api GET", # api raw API helper function - ], -} - -# Invocation signals — checked against output_text only. In full-auto mode these are -# belt-and-suspenders: the agent usually runs CLI commands directly. They catch -# contextual prompts where the skill fires but the agent finds an empty working dir -# and stops before reaching the CLI (same as observe mode in Claude Code harness). 
-INVOCATION_SIGNALS = { - "hawkscan": [ - # All markdown formatting variants the model uses around `: YES` or ` — YES` - "hawkscan:hawkscan`: yes", # backtick + colon - "hawkscan:hawkscan` — yes", # backtick + dash - "hawkscan:hawkscan**: yes", # bold + colon - "hawkscan:hawkscan** — yes", # bold + dash - "hawkscan:hawkscan: yes", # plain colon - "hawkscan:hawkscan — yes", # plain dash - # Specific action-intent phrases - "autonomous security scan", - "dast scan after code", - "dast scan triggered", - "dast scan required", - "security scan required", - "security scan after", - "run the security scan", - "running the hawkscan", - ], - "api": [ - "stackhawk-api:api`: yes", - "stackhawk-api:api` — yes", - "stackhawk-api:api: yes", - "stackhawk-api:api — yes", - ], -} - -# --------------------------------------------------------------------------- -# JSONL parsing -# Codex --json event stream: item.started / item.completed / turn.completed -# --------------------------------------------------------------------------- - -def parse_stream(jsonl: str) -> dict: - commands: list[str] = [] - output_text = "" - input_tokens = 0 - output_tokens = 0 - error = None - - seen_commands: set[str] = set() - - for line in jsonl.splitlines(): - line = line.strip() - if not line: - continue - try: - event = json.loads(line) - except json.JSONDecodeError: - continue - - etype = event.get("type", "") - - if etype == "item.started": - item = event.get("item", {}) - if item.get("type") == "command_execution": - cmd = item.get("command", "") - # Deduplicate: item.started fires before item.completed for the same cmd - if cmd and cmd not in seen_commands: - commands.append(cmd) - seen_commands.add(cmd) - - elif etype == "item.completed": - item = event.get("item", {}) - # Capture any assistant message text — Codex uses "agent_message" type - if item.get("type") in ("message", "agent_message"): - text = item.get("text", "") - if text: - output_text += text + "\n" - content = item.get("content", 
"") - if isinstance(content, str): - output_text += content + "\n" - elif isinstance(content, list): - for block in content: - if isinstance(block, dict) and block.get("type") == "text": - output_text += block.get("text", "") + "\n" - - elif etype == "turn.completed": - usage = event.get("usage", {}) - input_tokens += usage.get("input_tokens", 0) - output_tokens += usage.get("output_tokens", 0) - - elif etype == "error": - error = event.get("message", "unknown error") - - return { - "bash_commands": commands, - "files_written": [], # populated by scanning tmpdir after run - "files_edited": [], - "output_text": output_text.strip(), - "input_tokens": input_tokens, - "output_tokens": output_tokens, - "error": error, - } - - -def _setup_skill_in_dir(skill: str, target_dir: Path) -> None: - """No-op: skills are installed globally via 'codex plugin add @stackhawk'. - Run: codex plugin marketplace add /path/to/agent-skills - codex plugin add hawkscan@stackhawk - codex plugin add stackhawk-api@stackhawk - """ - pass - - -# --------------------------------------------------------------------------- -# Trigger detection -# --------------------------------------------------------------------------- - -def detect_trigger(parsed: dict, skill: str) -> bool: - # CLI signals checked against actual bash commands only — prevents README/educational - # output text from creating false positives. - cli_haystack = " ".join(parsed["bash_commands"]).lower() - if any(s.lower() in cli_haystack for s in CLI_SIGNALS.get(skill, [])): - return True - - # Invocation signals checked against output text only — belt-and-suspenders for - # contextual prompts where the skill fires but no CLI commands run (empty dir, etc.) 
- text_haystack = parsed["output_text"].lower() - return any(s.lower() in text_haystack for s in INVOCATION_SIGNALS.get(skill, [])) - - -# --------------------------------------------------------------------------- -# Process checks -# --------------------------------------------------------------------------- - -def run_process_checks(parsed: dict, checks: list) -> list[dict]: - haystack = " ".join([ - *parsed["bash_commands"], - parsed["output_text"], - ]).lower() - all_files = " ".join(parsed["files_written"] + parsed["files_edited"]).lower() - - results = [] - for check in checks: - ctype = check.get("type", "command_executed") - signals = [s.lower() for s in check.get("signals", [])] - antis = [a.lower() for a in check.get("anti_patterns", [])] - - signal_hit = next((s for s in signals if s in haystack), None) - anti_hit = next((a for a in antis if a in haystack), None) - - if ctype in ("command_negative", "file_content_negative", "output_negative"): - passed = anti_hit is None - elif ctype == "file_absent": - target = check.get("target_file", "").lower() - passed = target not in all_files - elif ctype == "conditional_command": - # Only enforce when the condition's keyword appears in the trace. 
- import re as _re - condition_str = check.get("condition", "") - m = _re.search(r"'([^']+)'", condition_str) - condition_keyword = m.group(1).lower() if m else None - if condition_keyword and condition_keyword not in haystack: - passed = True # condition not met — check not applicable - else: - passed = signal_hit is not None - elif ctype == "command_preference": - preferred = [p.lower() for p in check.get("preferred", [])] - passed = any(p in haystack for p in preferred) and anti_hit is None - else: - passed = signal_hit is not None - if antis: - passed = passed and anti_hit is None - - results.append({ - "id": check["id"], - "pass": passed, - "severity": check.get("severity", "warning"), - "signal_found": signal_hit, - "anti_found": anti_hit, - }) - - return results - - -def score_checks(results: list[dict]) -> dict: - blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking") - warning_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "warning") - score = max(0, 100 - blocking_failed * 15 - warning_failed * 5) - return { - "total": len(results), - "passed": sum(1 for r in results if r["pass"]), - "blocking_failed": blocking_failed, - "warning_failed": warning_failed, - "score": score, - } - - -# --------------------------------------------------------------------------- -# Run codex exec -# --------------------------------------------------------------------------- - -def run_codex( - prompt: str, - skill: str, - run_id: str, - full_auto: bool = True, - max_budget: float = 0.20, - model: str | None = None, -) -> tuple[dict, int]: - tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")) - try: - _setup_skill_in_dir(skill, tmpdir) - - cmd = [ - "codex", "exec", "--json", - "--sandbox", "workspace-write", - "--skip-git-repo-check", - ] - if model: - cmd += ["-m", model] - if not full_auto: - cmd += ["--sandbox", "read-only"] - cmd.append(prompt) - - proc = subprocess.run( - cmd, - capture_output=True, - 
text=True, - timeout=300, - cwd=str(tmpdir), - ) - - trace_dir = RESULTS_DIR / skill - trace_dir.mkdir(parents=True, exist_ok=True) - (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout) - - parsed = parse_stream(proc.stdout) - - # Scan tmpdir for files created during the run (more reliable than JSONL parsing) - created = [ - str(p.relative_to(tmpdir)) - for p in tmpdir.rglob("*") - if p.is_file() and not str(p).startswith(str(tmpdir / ".codex")) - ] - parsed["files_written"] = created - - return parsed, proc.returncode - - except subprocess.TimeoutExpired: - return { - "bash_commands": [], "files_written": [], "files_edited": [], - "output_text": "", "input_tokens": 0, "output_tokens": 0, "error": "timeout", - }, 1 - except FileNotFoundError: - print( - "ERROR: 'codex' CLI not found. " - "Install the Codex CLI and ensure it is in PATH.", - file=sys.stderr, - ) - sys.exit(1) - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - -# --------------------------------------------------------------------------- -# Rubric grader -# Uses: codex exec "" --output-schema -o -# --------------------------------------------------------------------------- - -def run_rubric_grader(parsed: dict, skill: str, run_id: str) -> dict | None: - rubric_path = EVALS_DIR / skill / "rubric-items.json" - schema_path = EVALS_DIR / "rubric-schema.json" - if not rubric_path.exists() or not schema_path.exists(): - return None - - rubric_data = json.loads(rubric_path.read_text()) - - grader_prompt = f"""{rubric_data['grader_prompt']} - -## Commands Executed: -{json.dumps(parsed['bash_commands'], indent=2)} - -## Files Created: -{json.dumps(parsed['files_written'], indent=2)} - -## Agent Output (first 4000 chars): -{parsed['output_text'][:4000]} - -## Rubric Checks to Grade: -{json.dumps(rubric_data['checks'], indent=2)} - -Populate: skill="{skill}", run_id="{run_id}", overall_pass, score 0-100, checks array.""" - - tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkrubric_{run_id}_")) - try: - 
output_file = tmpdir / "rubric_result.json" - cmd = [ - "codex", "exec", - grader_prompt, - "--output-schema", str(schema_path), - "-o", str(output_file), - ] - subprocess.run(cmd, capture_output=True, text=True, timeout=120, cwd=str(tmpdir)) - - if output_file.exists(): - return json.loads(output_file.read_text()) - return None - except Exception as exc: - print(f" [rubric] grader failed: {exc}", file=sys.stderr) - return None - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - -def main() -> None: - parser = argparse.ArgumentParser( - description="Codex eval harness for StackHawk agent skills", - ) - parser.add_argument("--skill", required=True, choices=["hawkscan", "api"]) - parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID", - help="Run a single prompt by id (e.g. hw-07)") - parser.add_argument("--dry-run", action="store_true", - help="Print prompts without calling codex") - parser.add_argument("--rubric", action="store_true", - help="Run qualitative rubric grader after process checks (extra cost)") - parser.add_argument("--no-full-auto", action="store_true", - help="Run without --full-auto (restricts filesystem access)") - parser.add_argument("--max-budget", type=float, default=0.20, metavar="USD", - help="Max spend per eval run in USD (default: 0.20)") - parser.add_argument("--model", metavar="MODEL_ID", - help="Override the Codex model (e.g. 
o3, o4-mini, gpt-4o)") - args = parser.parse_args() - - skill = args.skill - full_auto = not args.no_full_auto - - prompts_path = EVALS_DIR / skill / "prompts.csv" - checks_path = EVALS_DIR / skill / "process-checks.json" - - with open(prompts_path) as f: - all_prompts = list(csv.DictReader(f)) - checks = json.loads(checks_path.read_text())["checks"] - - if args.prompt_id: - prompts = [p for p in all_prompts if p["id"] == args.prompt_id] - if not prompts: - print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr) - sys.exit(1) - else: - prompts = all_prompts - - mode = "full-auto" if full_auto else "sandbox" - model_label = f" | Model: {args.model}" if args.model else "" - print(f"\nSkill: {skill} | Platform: codex | Mode: {mode}{model_label} | Prompts: {len(prompts)}") - if args.dry_run: - print("[dry-run — no codex calls]") - print("─" * 68) - - all_results = [] - total_cost = 0.0 - - for row in prompts: - run_id = row["id"] - prompt = row["prompt"] - should_trigger = row["should_trigger"].lower() == "true" - itype = row.get("invocation_type", "") - - print(f"\n[{run_id}] {itype:<12} trigger={'Y' if should_trigger else 'N'}") - print(f" {prompt[:92]}{'…' if len(prompt) > 92 else ''}") - - if args.dry_run: - print(" → skipped") - continue - - parsed, _exit = run_codex( - prompt, skill, run_id, - full_auto=full_auto, - max_budget=args.max_budget, - model=args.model, - ) - - # Codex doesn't report USD cost directly; estimate from token usage - tokens = parsed.get("input_tokens", 0) + parsed.get("output_tokens", 0) - est_cost = tokens * 0.000015 # rough estimate - total_cost += est_cost - - if parsed.get("error"): - print(f" ERROR: {parsed['error']}") - - did_trigger = detect_trigger(parsed, skill) - trigger_ok = did_trigger == should_trigger - - process_results: list[dict] = [] - scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0} - if should_trigger and did_trigger: - process_results = run_process_checks(parsed, 
checks) - scoring = score_checks(process_results) - - rubric_result = None - if args.rubric and should_trigger and did_trigger: - print(" [rubric] grading…", end=" ", flush=True) - rubric_result = run_rubric_grader(parsed, skill, run_id) - print(f"score={rubric_result.get('score', '?')}" if rubric_result else "failed") - - result = { - "platform": "codex", - "skill": skill, - "run_id": run_id, - "prompt": prompt, - "should_trigger": should_trigger, - "did_trigger": did_trigger, - "trigger_correct": trigger_ok, - "bash_commands": parsed["bash_commands"], - "files_written": parsed["files_written"], - "process_checks": process_results, - "scoring": scoring, - "rubric_result": rubric_result, - "tokens": {"input": parsed.get("input_tokens", 0), "output": parsed.get("output_tokens", 0)}, - "timestamp": datetime.now(timezone.utc).isoformat(), - } - all_results.append(result) - - out_dir = RESULTS_DIR / skill - out_dir.mkdir(parents=True, exist_ok=True) - (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2)) - - t_icon = "✓" if trigger_ok else "✗" - score_str = f"score={scoring['score']}/100" if process_results else "—" - print(f" {t_icon} did_trigger={did_trigger} {score_str}") - - for pr in process_results: - if not pr["pass"] and pr["severity"] == "blocking": - print(f" BLOCKING FAIL: {pr['id']}") - - if args.dry_run or not all_results: - return - - # ── Summary ──────────────────────────────────────────────────────────── - trigger_correct = sum(1 for r in all_results if r["trigger_correct"]) - total = len(all_results) - false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]] - false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]] - proc_runs = [r for r in all_results if r["process_checks"]] - avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs) - if proc_runs else None) - total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0 
- - print("\n" + "═" * 68) - print(f"SUMMARY skill={skill} platform=codex") - print(f" Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)") - if false_pos: - print(f" False positives : {', '.join(r['run_id'] for r in false_pos)}") - if false_neg: - print(f" False negatives : {', '.join(r['run_id'] for r in false_neg)}") - if avg_score is not None: - print(f" Process avg score: {avg_score}/100 (blocking failures: {total_blocking})") - print(f" Results in : {RESULTS_DIR / skill}/") - - summary = { - "skill": skill, - "platform": "codex", - "timestamp": datetime.now(timezone.utc).isoformat(), - "trigger_accuracy": {"correct": trigger_correct, "total": total}, - "false_positives": [r["run_id"] for r in false_pos], - "false_negatives": [r["run_id"] for r in false_neg], - "process_avg_score": avg_score, - "total_blocking_failures": total_blocking, - "runs": [ - {"run_id": r["run_id"], "trigger_correct": r["trigger_correct"], "score": r["scoring"]["score"]} - for r in all_results - ], - } - (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2)) - - # ── GitHub Actions step summary ───────────────────────────────────────── - step_summary_path = os.environ.get("GITHUB_STEP_SUMMARY") - if step_summary_path: - _write_step_summary(step_summary_path, skill, all_results, false_pos, false_neg, avg_score, total_blocking) - - if false_pos or false_neg or total_blocking > 0: - sys.exit(1) - - -def _write_step_summary( - path: str, skill: str, results: list[dict], - false_pos: list[dict], false_neg: list[dict], - avg_score: int | None, total_blocking: int, -) -> None: - correct = sum(1 for r in results if r["trigger_correct"]) - total = len(results) - trigger_icon = "✅" if correct == total else "❌" - score_icon = "✅" if (avg_score or 0) >= 70 and total_blocking == 0 else "❌" - - lines = [ - f"## Skill Eval: `{skill}` (codex)\n", - "| Metric | Value |", "|---|---|", - f"| Trigger accuracy | {trigger_icon} {correct}/{total} |", - 
] - if false_pos: - lines.append(f"| False positives | ⚠️ {', '.join(r['run_id'] for r in false_pos)} |") - if false_neg: - lines.append(f"| False negatives | ⚠️ {', '.join(r['run_id'] for r in false_neg)} |") - if avg_score is not None: - lines.append(f"| Process avg score | {score_icon} {avg_score}/100 |") - lines.append(f"| Blocking failures | {'❌' if total_blocking else '✅'} {total_blocking} |") - lines.append("") - - lines += [ - "
Per-run results\n", - "| ID | Trigger | Score |", "|---|---|---|", - ] - for r in results: - t = "✅" if r["trigger_correct"] else "❌" - score = r["scoring"]["score"] if r["process_checks"] else "—" - lines.append(f"| {r['run_id']} | {t} | {score} |") - lines.append("\n
\n") - - with open(path, "a") as f: - f.write("\n".join(lines) + "\n") - +from evals.cli import main if __name__ == "__main__": + if "--harness" not in sys.argv: + sys.argv += ["--harness", "codex"] main() diff --git a/evals/harnesses/copilot/run-evals.py b/evals/harnesses/copilot/run-evals.py index 9779110..d04c71e 100644 --- a/evals/harnesses/copilot/run-evals.py +++ b/evals/harnesses/copilot/run-evals.py @@ -1,391 +1,11 @@ #!/usr/bin/env python3 -""" -GitHub Copilot CLI eval harness for StackHawk agent skills. - -Uses `copilot -p --output-format json --allow-all-tools --plugin-dir`. -Skills are loaded from plugins// via --plugin-dir. - -The trigger detection is uniquely reliable: Copilot emits an explicit - tool.execution_start {"toolName":"skill","arguments":{"skill":"hawkscan"}} -event when the skill fires. No heuristic text-matching needed. - -Usage: - python3 evals/harnesses/copilot/run-evals.py --skill hawkscan - python3 evals/harnesses/copilot/run-evals.py --skill api - python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --id hw-07 - python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --dry-run - python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --model gpt-5.3-codex - -Requirements: - - GitHub Copilot CLI installed and authenticated (copilot login) - - Run from the agent-skills repo root - -Note: Copilot actually executes commands (--allow-all-tools), so process -check scores reflect real hawk workflow completion — not just observations. -""" - -import argparse -import csv -import json -import os -import re -import shutil -import subprocess +"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/. +Run `uv run evals --harness copilot --skill ` instead. 
+This shim forwards old invocations to the new CLI.""" import sys -import tempfile -from datetime import datetime, timezone -from pathlib import Path - -# --------------------------------------------------------------------------- -# Paths -# --------------------------------------------------------------------------- -HARNESS_DIR = Path(__file__).parent.resolve() -EVALS_DIR = HARNESS_DIR.parent.parent -REPO_ROOT = EVALS_DIR.parent -RESULTS_DIR = HARNESS_DIR / "results" - -# --------------------------------------------------------------------------- -# Trigger detection -# Copilot emits an unambiguous tool.execution_start event when a skill fires: -# {"type":"tool.execution_start","data":{"toolName":"skill","arguments":{"skill":"hawkscan"}}} -# This eliminates all heuristic signal-matching needed for other platforms. -# --------------------------------------------------------------------------- - -def detect_trigger(parsed: dict, skill: str) -> bool: - # Primary: explicit skill tool call (unambiguous) - for call in parsed.get("skill_calls", []): - if call.lower() == skill.lower() or call.lower() == f"stackhawk-{skill}".lower(): - return True - - # Fallback: CLI signals in bash commands (belt-and-suspenders) - cli_signals = { - "hawkscan": ["hawk scan", "hawk validate", "hawk rescan", "hawk config", - "hawk create app", "hawk init", "hawk perch"], - "api": ["hawkop scan get", "hawkop org get", "/api/v2/org", "/api/v1/scan"], - } - cmd_haystack = " ".join(parsed.get("bash_commands", [])).lower() - return any(s.lower() in cmd_haystack for s in cli_signals.get(skill, [])) - - -# --------------------------------------------------------------------------- -# Stream-json parsing — Copilot JSONL event format: -# tool.execution_start {"toolName":"bash","arguments":{"command":"..."}} -# tool.execution_start {"toolName":"skill","arguments":{"skill":"hawkscan"}} -# tool.execution_partial_result {"partialOutput":"..."} -# assistant.message {"content":"..."} -# result {} -# 
--------------------------------------------------------------------------- - -def parse_stream(jsonl: str) -> dict: - bash_commands: list[str] = [] - files_written: list[str] = [] - skill_calls: list[str] = [] - output_text = "" - usage: dict = {} - error = None - - for line in jsonl.splitlines(): - line = line.strip() - if not line: - continue - try: - event = json.loads(line) - except json.JSONDecodeError: - continue - - etype = event.get("type", "") - data = event.get("data", {}) - - if etype == "tool.execution_start": - tool_name = data.get("toolName", "") - args = data.get("arguments", {}) - - if tool_name == "bash": - cmd = args.get("command", "") - if cmd: - bash_commands.append(cmd) - - elif tool_name == "skill": - skill_name = args.get("skill", "") - if skill_name: - skill_calls.append(skill_name) - - elif tool_name in ("write_file", "create_file", "str_replace_editor"): - path = args.get("path") or args.get("file_path") or "" - if path: - files_written.append(path) - - elif etype == "assistant.message": - content = data.get("content", "") - if content: - output_text += content + "\n" - - elif etype == "result": - usage = data.get("usage", {}) - if data.get("error"): - error = str(data["error"]) - - return { - "bash_commands": bash_commands, - "files_written": files_written, - "skill_calls": skill_calls, - "output_text": output_text.strip(), - "usage": usage, - "error": error, - } - - -# --------------------------------------------------------------------------- -# Process checks -# --------------------------------------------------------------------------- - -def run_process_checks(parsed: dict, checks: list) -> list[dict]: - haystack = " ".join([ - *parsed["bash_commands"], - parsed["output_text"], - ]).lower() - all_files = " ".join(parsed["files_written"]).lower() - - results = [] - for check in checks: - ctype = check.get("type", "command_executed") - signals = [s.lower() for s in check.get("signals", [])] - antis = [a.lower() for a in 
check.get("anti_patterns", [])] - - signal_hit = next((s for s in signals if s in haystack), None) - anti_hit = next((a for a in antis if a in haystack), None) - - if ctype in ("command_negative", "file_content_negative", "output_negative"): - passed = anti_hit is None - elif ctype == "file_absent": - target = check.get("target_file", "").lower() - passed = target not in all_files - elif ctype == "conditional_command": - m = re.search(r"'([^']+)'", check.get("condition", "")) - condition_keyword = m.group(1).lower() if m else None - if condition_keyword and condition_keyword not in haystack: - passed = True - else: - passed = signal_hit is not None - elif ctype == "command_preference": - preferred = [p.lower() for p in check.get("preferred", [])] - passed = any(p in haystack for p in preferred) and anti_hit is None - else: - passed = signal_hit is not None - if antis: - passed = passed and anti_hit is None - - results.append({ - "id": check["id"], - "pass": passed, - "severity": check.get("severity", "warning"), - "signal_found": signal_hit, - "anti_found": anti_hit, - }) - return results - - -def score_checks(results: list[dict]) -> dict: - blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking") - warning_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "warning") - return { - "total": len(results), - "passed": sum(1 for r in results if r["pass"]), - "blocking_failed": blocking_failed, - "warning_failed": warning_failed, - "score": max(0, 100 - blocking_failed * 15 - warning_failed * 5), - } - - -# --------------------------------------------------------------------------- -# Run copilot -# --------------------------------------------------------------------------- - -def run_copilot( - prompt: str, - skill: str, - run_id: str, - model: str | None = None, -) -> tuple[dict, int]: - tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")) - try: - plugin_dir = str(REPO_ROOT / "plugins" / skill) - - cmd = [ 
- "copilot", "-p", prompt, - "--output-format", "json", - "--allow-all-tools", - "--plugin-dir", plugin_dir, - "--no-ask-user", - ] - if model: - cmd += ["--model", model] - - proc = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=600, - cwd=str(tmpdir), - env={**os.environ}, - ) - - trace_dir = RESULTS_DIR / skill - trace_dir.mkdir(parents=True, exist_ok=True) - (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout) - - parsed = parse_stream(proc.stdout) - if proc.returncode != 0 and not parsed["output_text"] and not parsed["skill_calls"]: - stderr = proc.stderr.strip() - if stderr: - parsed["error"] = stderr[:300] - - return parsed, proc.returncode - - except subprocess.TimeoutExpired: - return {"bash_commands": [], "files_written": [], "skill_calls": [], - "output_text": "", "usage": {}, "error": "timeout"}, 1 - except FileNotFoundError: - print("ERROR: 'copilot' CLI not found. Install GitHub Copilot CLI.", file=sys.stderr) - sys.exit(1) - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - -def main() -> None: - parser = argparse.ArgumentParser( - description="GitHub Copilot CLI eval harness for StackHawk agent skills", - ) - parser.add_argument("--skill", required=True, choices=["hawkscan", "api"]) - parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID") - parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--model", metavar="MODEL_ID", - help="Model override (e.g. 
gpt-5.3-codex)") - args = parser.parse_args() - - skill = args.skill - prompts_path = EVALS_DIR / skill / "prompts.csv" - checks_path = EVALS_DIR / skill / "process-checks.json" - - with open(prompts_path) as f: - all_prompts = list(csv.DictReader(f)) - checks = json.loads(checks_path.read_text())["checks"] - - if args.prompt_id: - prompts = [p for p in all_prompts if p["id"] == args.prompt_id] - if not prompts: - print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr) - sys.exit(1) - else: - prompts = all_prompts - - model_label = f" | Model: {args.model}" if args.model else "" - print(f"\nSkill: {skill} | Platform: copilot | Mode: full-auto{model_label} | Prompts: {len(prompts)}") - if args.dry_run: - print("[dry-run — no copilot calls]") - print("─" * 68) - - all_results = [] - - for row in prompts: - run_id = row["id"] - prompt = row["prompt"] - should_trigger = row["should_trigger"].lower() == "true" - itype = row.get("invocation_type", "") - - print(f"\n[{run_id}] {itype:<12} trigger={'Y' if should_trigger else 'N'}") - print(f" {prompt[:92]}{'…' if len(prompt) > 92 else ''}") - - if args.dry_run: - print(" → skipped") - continue - - parsed, _exit = run_copilot(prompt, skill, run_id, model=args.model) - - if parsed.get("error"): - print(f" ERROR: {parsed['error']}") - - did_trigger = detect_trigger(parsed, skill) - trigger_ok = did_trigger == should_trigger - - process_results: list[dict] = [] - scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0} - if should_trigger and did_trigger: - process_results = run_process_checks(parsed, checks) - scoring = score_checks(process_results) - - result = { - "platform": "copilot", - "skill": skill, - "run_id": run_id, - "prompt": prompt, - "should_trigger": should_trigger, - "did_trigger": did_trigger, - "trigger_correct": trigger_ok, - "bash_commands": parsed["bash_commands"], - "files_written": parsed["files_written"], - "skill_calls": parsed["skill_calls"], - 
"process_checks": process_results, - "scoring": scoring, - "usage": parsed.get("usage", {}), - "timestamp": datetime.now(timezone.utc).isoformat(), - } - all_results.append(result) - - out_dir = RESULTS_DIR / skill - out_dir.mkdir(parents=True, exist_ok=True) - (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2)) - - t_icon = "✓" if trigger_ok else "✗" - score_str = f"score={scoring['score']}/100" if process_results else "—" - print(f" {t_icon} did_trigger={did_trigger} skill_calls={parsed['skill_calls']} {score_str}") - for pr in process_results: - if not pr["pass"] and pr["severity"] == "blocking": - print(f" BLOCKING FAIL: {pr['id']}") - - if args.dry_run or not all_results: - return - - trigger_correct = sum(1 for r in all_results if r["trigger_correct"]) - total = len(all_results) - false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]] - false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]] - proc_runs = [r for r in all_results if r["process_checks"]] - avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs) - if proc_runs else None) - total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0 - - print("\n" + "═" * 68) - print(f"SUMMARY skill={skill} platform=copilot") - print(f" Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)") - if false_pos: - print(f" False positives : {', '.join(r['run_id'] for r in false_pos)}") - if false_neg: - print(f" False negatives : {', '.join(r['run_id'] for r in false_neg)}") - if avg_score is not None: - print(f" Process avg score: {avg_score}/100 (blocking failures: {total_blocking})") - print(f" Results in : {RESULTS_DIR / skill}/") - - summary = { - "skill": skill, "platform": "copilot", - "timestamp": datetime.now(timezone.utc).isoformat(), - "trigger_accuracy": {"correct": trigger_correct, "total": total}, - "false_positives": [r["run_id"] for r in 
false_pos], - "false_negatives": [r["run_id"] for r in false_neg], - "process_avg_score": avg_score, - "total_blocking_failures": total_blocking, - "runs": [{"run_id": r["run_id"], "trigger_correct": r["trigger_correct"], - "score": r["scoring"]["score"]} for r in all_results], - } - (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2)) - - if false_pos or false_neg or total_blocking > 0: - sys.exit(1) - +from evals.cli import main if __name__ == "__main__": + if "--harness" not in sys.argv: + sys.argv += ["--harness", "copilot"] main() diff --git a/evals/harnesses/cursor/adapter.py b/evals/harnesses/cursor/adapter.py new file mode 100644 index 0000000..3d5bdcc --- /dev/null +++ b/evals/harnesses/cursor/adapter.py @@ -0,0 +1,238 @@ +"""cursor Harness adapter. Parsing + signals ported from pre-shim run-evals.py.""" +from __future__ import annotations +import json +import os +import shutil +import subprocess +import tempfile +from pathlib import Path + +from evals.lib.models import ParsedRun +from evals.lib.triggers import explicit_decision, decide_trigger +from evals.lib.observe import observe_suffix + +# adapter.py -> cursor -> harnesses -> evals -> repo root +REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +# cursor/.cursor/rules/ holds the alwaysApply .mdc skill rules (pre-shim path). +CURSOR_RULES_DIR = REPO_ROOT / "cursor" / ".cursor" / "rules" + + +def _setup_skill(target_dir: str) -> None: + """Copy cursor/.cursor/rules/*.mdc into the run's workspace so alwaysApply + rules load. Mirrors the pre-shim run-evals.py _setup_workspace().""" + dst = Path(target_dir) / ".cursor" / "rules" + dst.mkdir(parents=True, exist_ok=True) + for mdc in CURSOR_RULES_DIR.glob("*.mdc"): + shutil.copy2(mdc, dst / mdc.name) + +# CLI signals — checked against bash_commands only. +# Cursor goes directly into execution, so CLI signals are the primary trigger +# indicator. 
Invocation signals cover narrative phrases the agent uses when +# kicking off a skill workflow without immediately running commands. +CLI_SIGNALS = { + # Scan-distinctive commands only — generic preflight (hawk version/config/init) + # over-triggers when the agent merely assesses the environment for a non-scan + # request. Triggering falls back to the explicit decision line otherwise. + "hawkscan": [ + "hawk scan", + "hawk validate", + "hawk rescan", + "hawk create app", + "hawk perch", + ], + # Cursor api: agent runs hawkop status as its first step, then deeper + # hawkop commands. Broader hawkop signals included since Cursor doesn't + # have false-positive risk of Codex full-auto mode. + "api": [ + "hawkop status", + "hawkop scan get", + "hawkop org get", + "hawkop org set", + "hawkop app list", + "/api/v2/org", + "/api/v1/scan", + "hawk_api GET", + ], + "stackhawk-data-seed": ["data-seed/", "data-seed/manifest", ".data-seed-credentials", + "manifest.yaml"], +} + +# Invocation signals — checked against output_text only. +# Cursor doesn't use the Claude Code "EVALUATE: YES/NO" evaluation step, so +# these focus on narrative phrases the agent uses when kicking off a skill workflow. 
+INVOCATION_SIGNALS = { + "hawkscan": [ + "hawkscan:hawkscan`: yes", "hawkscan:hawkscan` — yes", + "hawkscan:hawkscan**: yes", "hawkscan:hawkscan** — yes", + "hawkscan:hawkscan: yes", "hawkscan:hawkscan — yes", + "hawkscan:hawkscan - yes", "hawkscan:hawkscan - **yes", + "hawkscan** - yes", "hawkscan** — yes", + "hawkscan**: yes", "hawkscan: yes", + "hawkscan — yes", "hawkscan - yes", + "autonomous security scan", + "dast scan after code", "dast scan triggered", "dast scan required", + "security scan required", "security scan after", + "run the security scan", "running the hawkscan", + ], + "api": [ + # Claude Code evaluation-format signals (if model uses that format) + "stackhawk-api:api`: yes", "stackhawk-api:api` — yes", + "stackhawk-api:api**: yes", "stackhawk-api:api** — yes", + "stackhawk-api:api: yes", "stackhawk-api:api — yes", + "stackhawk-api:api - yes", + "stackhawk-api**: yes", "stackhawk-api** — yes", + "stackhawk-api: yes", "stackhawk-api — yes", + "stackhawk-api - yes", + # Cursor narrative-style signals + "stackhawk api skill", + "stackhawk api", + "api skill to", + "security posture", + "untriaged findings", + "scan history", + "findings across", + ], + "stackhawk-data-seed": [ + "stackhawk-data-seed:stackhawk-data-seed`: yes", + "stackhawk-data-seed:stackhawk-data-seed** — yes", + "stackhawk-data-seed:stackhawk-data-seed: yes", + "stackhawk-data-seed:stackhawk-data-seed — yes", + "stackhawk-data-seed: yes", "stackhawk-data-seed — yes", + "stackhawk-data-seed - yes", + # narrative-style + "seed data for hawkscan", "seed this repo", "minimum seed entities", + "seed entities required", "data seed complete", "data-seed/manifest", + "set up seed data", + ], +} + + +def parse_stream(raw: str) -> ParsedRun: + """Parse cursor stream-json output. 
+ + Cursor event shapes (from pre-shim run-evals.py): + - type="assistant": message.content[] with blocks of type="text" + - type="tool_call" subtype="started": + tool_call.shellToolCall.args.command -> bash_commands + tool_call.writeToolCall.args.path -> files_written + - type="result": usage.outputTokens, is_error, result + """ + bash_commands: list[str] = [] + files_written: list[str] = [] + output_text = "" + output_tokens: int | None = None + error = None + + for line in raw.splitlines(): + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + etype = event.get("type", "") + + if etype == "assistant": + for block in event.get("message", {}).get("content", []): + if block.get("type") == "text": + output_text += block.get("text", "") + "\n" + + elif etype == "tool_call" and event.get("subtype") == "started": + tc = event.get("tool_call", {}) + # Shell command + shell = tc.get("shellToolCall", {}) + if shell: + cmd = shell.get("args", {}).get("command", "") + if cmd: + bash_commands.append(cmd) + # File write + write = tc.get("writeToolCall", {}) + if write: + path = write.get("args", {}).get("path", "") + if path: + files_written.append(path) + + elif etype == "result": + usage = event.get("usage", {}) + otok = usage.get("outputTokens") + if otok is not None: + output_tokens = (output_tokens or 0) + int(otok) + if event.get("is_error"): + error = event.get("result", "unknown error") + + return ParsedRun( + bash_commands=bash_commands, + files_written=files_written, + output_text=output_text.strip(), + output_tokens=output_tokens or None, + error=error, + ) + + +class CursorAdapter: + platform = "cursor" + + def cli_signals(self, skill): return CLI_SIGNALS.get(skill, []) + def invocation_signals(self, skill): return INVOCATION_SIGNALS.get(skill, []) + def parse_stream(self, raw): return parse_stream(raw) + + def detect_trigger(self, run: ParsedRun, skill: str) -> bool: + cli = " 
".join(run.bash_commands).lower() + executed = any(s.lower() in cli for s in self.cli_signals(skill)) + text = run.output_text.lower() + loose = any(s.lower() in text for s in self.invocation_signals(skill)) + return decide_trigger(executed_cli=executed, + declared=explicit_decision(run.output_text, skill), + loose_hit=loose) + + def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill, + max_budget, bare, full_auto) -> ParsedRun: + tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_") + try: + # With/without-skill switch: only install the cursor rules when the + # skill should be loaded (pre-shim always installed them). + if load_skill: + _setup_skill(tmpdir) + # Observe mode: append the per-skill walkthrough suffix. Full-auto / + # extended runs against a real target use the bare prompt. + effective_prompt = prompt if full_auto else prompt + observe_suffix(skill) + cmd = [ + "agent", "-p", effective_prompt, + "--output-format", "stream-json", + "--print", + "--trust", + ] + if model: + cmd += ["--model", model] + if full_auto: + cmd.append("--force") + # Pass CURSOR_API_KEY via the environment, never on the command line + # (a CLI arg leaks the secret into process listings and logs). The + # agent CLI reads CURSOR_API_KEY from the environment directly. 
+ env = dict(os.environ) + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, + cwd=tmpdir, + env=env, + ) + except subprocess.TimeoutExpired: + return ParsedRun(error="timeout") + run = parse_stream(proc.stdout) + run.returncode = proc.returncode + run.stderr_tail = (proc.stderr or "")[-2000:] + if proc.returncode != 0 and not run.error: + run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}" + elif not run.output_text and not run.bash_commands and not run.error: + run.error = f"empty output (exit {proc.returncode})" + return run + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +ADAPTER = CursorAdapter() diff --git a/evals/harnesses/cursor/run-evals.py b/evals/harnesses/cursor/run-evals.py index 364a3f7..d83ce7a 100644 --- a/evals/harnesses/cursor/run-evals.py +++ b/evals/harnesses/cursor/run-evals.py @@ -1,451 +1,11 @@ #!/usr/bin/env python3 -""" -Cursor Agent eval harness for StackHawk agent skills. - -Uses `agent --print --output-format stream-json` (Cursor's headless CLI). -Skills are loaded from cursor/.cursor/rules/*.mdc (alwaysApply rules). - -Usage: - python3 evals/harnesses/cursor/run-evals.py --skill hawkscan - python3 evals/harnesses/cursor/run-evals.py --skill api - python3 evals/harnesses/cursor/run-evals.py --skill hawkscan --id hw-07 - python3 evals/harnesses/cursor/run-evals.py --skill hawkscan --dry-run - python3 evals/harnesses/cursor/run-evals.py --skill hawkscan --full-auto # actually execute commands - -Requirements: - - Cursor CLI installed and authenticated (`agent status`) - - Run from the agent-skills repo root - - cursor/.cursor/rules/ contains generated .mdc files (run generate-cursor-rules.sh) -""" - -import argparse -import csv -import json -import os -import shutil -import subprocess +"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/. +Run `uv run evals --harness cursor --skill ` instead. 
+This shim forwards old invocations to the new CLI.""" import sys -import tempfile -from datetime import datetime, timezone -from pathlib import Path - -# --------------------------------------------------------------------------- -# Paths -# --------------------------------------------------------------------------- -HARNESS_DIR = Path(__file__).parent.resolve() -EVALS_DIR = HARNESS_DIR.parent.parent -REPO_ROOT = EVALS_DIR.parent -RESULTS_DIR = HARNESS_DIR / "results" -# cursor/.cursor/rules/ contains the alwaysApply .mdc skill rules -CURSOR_RULES_DIR = REPO_ROOT / "cursor" / ".cursor" / "rules" - -# --------------------------------------------------------------------------- -# Trigger signals — Cursor-specific tuning. -# Cursor goes directly into execution without the Claude Code "EVALUATE: YES/NO" -# evaluation step, so invocation signals focus on narrative phrases the agent -# uses when kicking off a skill workflow. -# CLI_SIGNALS are checked against shell commands the agent attempted to run. -# --------------------------------------------------------------------------- -CLI_SIGNALS = { - "hawkscan": [ - "hawk scan", - "hawk validate", - "hawk rescan", - "hawk config", - "hawk create app", - "hawk init", - "hawk perch", - ], - # Cursor api: the agent runs hawkop status as its first step, then - # deeper hawkop commands. Include broader hawkop signals since Cursor - # doesn't have the false-positive risk of Codex full-auto mode. 
- "api": [ - "hawkop status", - "hawkop scan get", - "hawkop org get", - "hawkop org set", - "hawkop app list", - "/api/v2/org", - "/api/v1/scan", - "hawk_api GET", - ], -} - -INVOCATION_SIGNALS = { - "hawkscan": [ - "hawkscan:hawkscan`: yes", "hawkscan:hawkscan` — yes", - "hawkscan:hawkscan**: yes", "hawkscan:hawkscan** — yes", - "hawkscan:hawkscan: yes", "hawkscan:hawkscan — yes", - "hawkscan:hawkscan - yes", "hawkscan:hawkscan - **yes", - "hawkscan** - yes", "hawkscan** — yes", - "hawkscan**: yes", "hawkscan: yes", - "hawkscan — yes", "hawkscan - yes", - "autonomous security scan", - "dast scan after code", "dast scan triggered", "dast scan required", - "security scan required", "security scan after", - "run the security scan", "running the hawkscan", - ], - "api": [ - # Claude Code evaluation-format signals (if model uses that format) - "stackhawk-api:api`: yes", "stackhawk-api:api` — yes", - "stackhawk-api:api**: yes","stackhawk-api:api** — yes", - "stackhawk-api:api: yes", "stackhawk-api:api — yes", - "stackhawk-api:api - yes", - "stackhawk-api**: yes", "stackhawk-api** — yes", - "stackhawk-api: yes", "stackhawk-api — yes", - "stackhawk-api - yes", - # Cursor narrative-style signals — agent says these instead of evaluating - "stackhawk api skill", # "I'll use the StackHawk API skill" - "stackhawk api", # "using the StackHawk API" - "api skill to", # "api skill to pull your org..." 
- "security posture", # "pull your org's security posture" - "untriaged findings", # "untriaged findings across all apps" - "scan history", # "scan history for" - "findings across", # "findings across all apps" - ], -} - -# --------------------------------------------------------------------------- -# Stream-json parsing -# Cursor events: system / user / thinking / assistant / tool_call / result -# --------------------------------------------------------------------------- - -def parse_stream(jsonl: str) -> dict: - bash_commands: list[str] = [] - output_text = "" - files_written: list[str] = [] - usage: dict = {} - error = None - - for line in jsonl.splitlines(): - line = line.strip() - if not line: - continue - try: - event = json.loads(line) - except json.JSONDecodeError: - continue - - etype = event.get("type", "") - - if etype == "assistant": - for block in event.get("message", {}).get("content", []): - if block.get("type") == "text": - output_text += block.get("text", "") + "\n" - - elif etype == "tool_call" and event.get("subtype") == "started": - tc = event.get("tool_call", {}) - # Shell command - shell = tc.get("shellToolCall", {}) - if shell: - cmd = shell.get("args", {}).get("command", "") - if cmd: - bash_commands.append(cmd) - # File write - write = tc.get("writeToolCall", {}) - if write: - path = write.get("args", {}).get("path", "") - if path: - files_written.append(path) - - elif etype == "result": - usage = event.get("usage", {}) - if event.get("is_error"): - error = event.get("result", "unknown error") - - return { - "bash_commands": bash_commands, - "files_written": files_written, - "output_text": output_text.strip(), - "usage": usage, - "error": error, - } - - -# --------------------------------------------------------------------------- -# Trigger detection — same split-signal approach as Claude Code harness -# --------------------------------------------------------------------------- - -def detect_trigger(parsed: dict, skill: str) -> bool: - 
cli_haystack = " ".join(parsed["bash_commands"]).lower() - if any(s.lower() in cli_haystack for s in CLI_SIGNALS.get(skill, [])): - return True - text_haystack = parsed["output_text"].lower() - return any(s.lower() in text_haystack for s in INVOCATION_SIGNALS.get(skill, [])) - - -# --------------------------------------------------------------------------- -# Process checks — shared with Claude Code harness -# --------------------------------------------------------------------------- - -def run_process_checks(parsed: dict, checks: list) -> list[dict]: - haystack = " ".join([ - *parsed["bash_commands"], - parsed["output_text"], - ]).lower() - all_files = " ".join(parsed["files_written"]).lower() - - results = [] - for check in checks: - ctype = check.get("type", "command_executed") - signals = [s.lower() for s in check.get("signals", [])] - antis = [a.lower() for a in check.get("anti_patterns", [])] - - signal_hit = next((s for s in signals if s in haystack), None) - anti_hit = next((a for a in antis if a in haystack), None) - - if ctype in ("command_negative", "file_content_negative", "output_negative"): - passed = anti_hit is None - elif ctype == "file_absent": - target = check.get("target_file", "").lower() - passed = target not in all_files - elif ctype == "conditional_command": - import re as _re - m = _re.search(r"'([^']+)'", check.get("condition", "")) - condition_keyword = m.group(1).lower() if m else None - if condition_keyword and condition_keyword not in haystack: - passed = True - else: - passed = signal_hit is not None - elif ctype == "command_preference": - preferred = [p.lower() for p in check.get("preferred", [])] - passed = any(p in haystack for p in preferred) and anti_hit is None - else: - passed = signal_hit is not None - if antis: - passed = passed and anti_hit is None - - results.append({ - "id": check["id"], - "pass": passed, - "severity": check.get("severity", "warning"), - "signal_found": signal_hit, - "anti_found": anti_hit, - }) - return 
results - - -def score_checks(results: list[dict]) -> dict: - blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking") - warning_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "warning") - return { - "total": len(results), - "passed": sum(1 for r in results if r["pass"]), - "blocking_failed": blocking_failed, - "warning_failed": warning_failed, - "score": max(0, 100 - blocking_failed * 15 - warning_failed * 5), - } - - -# --------------------------------------------------------------------------- -# Run agent -# --------------------------------------------------------------------------- - -def _setup_workspace(skill: str, target_dir: Path) -> None: - """Copy cursor/.cursor/rules/ into a fresh workspace so alwaysApply rules load.""" - dst = target_dir / ".cursor" / "rules" - dst.mkdir(parents=True, exist_ok=True) - for mdc in CURSOR_RULES_DIR.glob("*.mdc"): - shutil.copy2(mdc, dst / mdc.name) - - -def run_cursor( - prompt: str, - skill: str, - run_id: str, - full_auto: bool = False, - model: str | None = None, -) -> tuple[dict, int]: - tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")) - try: - _setup_workspace(skill, tmpdir) - - api_key = os.environ.get("CURSOR_API_KEY", "") - cmd = [ - "agent", "-p", prompt, - "--output-format", "stream-json", - "--print", - "--trust", - ] - if api_key: - cmd += ["--api-key", api_key] - if model: - cmd += ["--model", model] - if full_auto: - cmd.append("--force") - - proc = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=300, - cwd=str(tmpdir), - ) - - trace_dir = RESULTS_DIR / skill - trace_dir.mkdir(parents=True, exist_ok=True) - (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout) - - return parse_stream(proc.stdout), proc.returncode - - except subprocess.TimeoutExpired: - return {"bash_commands": [], "files_written": [], "output_text": "", - "usage": {}, "error": "timeout"}, 1 - except FileNotFoundError: - print("ERROR: 'agent' CLI not 
found. Install Cursor and ensure it is in PATH.", - file=sys.stderr) - sys.exit(1) - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - -def main() -> None: - parser = argparse.ArgumentParser( - description="Cursor Agent eval harness for StackHawk agent skills", - ) - parser.add_argument("--skill", required=True, choices=["hawkscan", "api"]) - parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID") - parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--full-auto", action="store_true", - help="Pass --force so the agent can execute commands") - parser.add_argument("--model", metavar="MODEL_ID", - help="Model override (e.g. gpt-5.5, sonnet-4)") - args = parser.parse_args() - - skill = args.skill - prompts_path = EVALS_DIR / skill / "prompts.csv" - checks_path = EVALS_DIR / skill / "process-checks.json" - - with open(prompts_path) as f: - all_prompts = list(csv.DictReader(f)) - checks = json.loads(checks_path.read_text())["checks"] - - if args.prompt_id: - prompts = [p for p in all_prompts if p["id"] == args.prompt_id] - if not prompts: - print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr) - sys.exit(1) - else: - prompts = all_prompts - - if not CURSOR_RULES_DIR.exists(): - print(f"ERROR: {CURSOR_RULES_DIR} not found. 
Run scripts/generate-cursor-rules.sh first.", - file=sys.stderr) - sys.exit(1) - - mode = "full-auto" if args.full_auto else "observe" - model_label = f" | Model: {args.model}" if args.model else "" - print(f"\nSkill: {skill} | Platform: cursor | Mode: {mode}{model_label} | Prompts: {len(prompts)}") - if args.dry_run: - print("[dry-run — no agent calls]") - print("─" * 68) - - all_results = [] - total_tokens = {"input": 0, "output": 0} - - for row in prompts: - run_id = row["id"] - prompt = row["prompt"] - should_trigger = row["should_trigger"].lower() == "true" - itype = row.get("invocation_type", "") - - print(f"\n[{run_id}] {itype:<12} trigger={'Y' if should_trigger else 'N'}") - print(f" {prompt[:92]}{'…' if len(prompt) > 92 else ''}") - - if args.dry_run: - print(" → skipped") - continue - - parsed, _exit = run_cursor(prompt, skill, run_id, full_auto=args.full_auto, model=args.model) - u = parsed.get("usage", {}) - total_tokens["input"] += u.get("inputTokens", 0) - total_tokens["output"] += u.get("outputTokens", 0) - - if parsed.get("error"): - print(f" ERROR: {parsed['error']}") - - did_trigger = detect_trigger(parsed, skill) - trigger_ok = did_trigger == should_trigger - - process_results: list[dict] = [] - scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0} - if should_trigger and did_trigger: - process_results = run_process_checks(parsed, checks) - scoring = score_checks(process_results) - - result = { - "platform": "cursor", - "skill": skill, - "run_id": run_id, - "prompt": prompt, - "should_trigger": should_trigger, - "did_trigger": did_trigger, - "trigger_correct": trigger_ok, - "bash_commands": parsed["bash_commands"], - "files_written": parsed["files_written"], - "process_checks": process_results, - "scoring": scoring, - "usage": u, - "timestamp": datetime.now(timezone.utc).isoformat(), - } - all_results.append(result) - - out_dir = RESULTS_DIR / skill - out_dir.mkdir(parents=True, exist_ok=True) - (out_dir / 
f"{run_id}.result.json").write_text(json.dumps(result, indent=2)) - - t_icon = "✓" if trigger_ok else "✗" - score_str = f"score={scoring['score']}/100" if process_results else "—" - print(f" {t_icon} did_trigger={did_trigger} {score_str}") - for pr in process_results: - if not pr["pass"] and pr["severity"] == "blocking": - print(f" BLOCKING FAIL: {pr['id']}") - - if args.dry_run or not all_results: - return - - trigger_correct = sum(1 for r in all_results if r["trigger_correct"]) - total = len(all_results) - false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]] - false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]] - proc_runs = [r for r in all_results if r["process_checks"]] - avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs) - if proc_runs else None) - total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0 - - print("\n" + "═" * 68) - print(f"SUMMARY skill={skill} platform=cursor") - print(f" Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)") - if false_pos: - print(f" False positives : {', '.join(r['run_id'] for r in false_pos)}") - if false_neg: - print(f" False negatives : {', '.join(r['run_id'] for r in false_neg)}") - if avg_score is not None: - print(f" Process avg score: {avg_score}/100 (blocking failures: {total_blocking})") - print(f" Total tokens : {total_tokens['input']} in / {total_tokens['output']} out") - print(f" Results in : {RESULTS_DIR / skill}/") - - summary = { - "skill": skill, "platform": "cursor", - "timestamp": datetime.now(timezone.utc).isoformat(), - "trigger_accuracy": {"correct": trigger_correct, "total": total}, - "false_positives": [r["run_id"] for r in false_pos], - "false_negatives": [r["run_id"] for r in false_neg], - "process_avg_score": avg_score, - "total_blocking_failures": total_blocking, - "total_tokens": total_tokens, - "runs": [{"run_id": r["run_id"], 
"trigger_correct": r["trigger_correct"], - "score": r["scoring"]["score"]} for r in all_results], - } - (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2)) - - if false_pos or false_neg or total_blocking > 0: - sys.exit(1) - +from evals.cli import main if __name__ == "__main__": + if "--harness" not in sys.argv: + sys.argv += ["--harness", "cursor"] main() diff --git a/evals/harnesses/gemini/run-evals.py b/evals/harnesses/gemini/run-evals.py index d00c8c5..00fce99 100644 --- a/evals/harnesses/gemini/run-evals.py +++ b/evals/harnesses/gemini/run-evals.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 """ +FROZEN/LEGACY: superseded by the agy harness and the unified 'uv run evals' CLI. Not wired into CI. References the removed prompts.csv and will not run as-is. Kept for historical reference only. + Gemini CLI eval harness for StackHawk agent skills. Uses `gemini -p --output-format stream-json` (Gemini's headless CLI). diff --git a/evals/hawkscan/prompts.csv b/evals/hawkscan/prompts.csv deleted file mode 100644 index 20e1ebd..0000000 --- a/evals/hawkscan/prompts.csv +++ /dev/null @@ -1,21 +0,0 @@ -id,should_trigger,invocation_type,prompt,notes -hw-01,true,explicit,"Use $hawkscan to scan my API running on localhost:8080","Direct skill reference by name; baseline — if this fails everything else is moot" -hw-02,true,explicit,"Run the hawkscan skill against my running application","hawkscan keyword in natural language" -hw-03,true,implicit,"Run a DAST scan on my running application","DAST keyword from skill description" -hw-04,true,implicit,"Scan my app for security vulnerabilities","'security scan' + 'app' language" -hw-05,true,implicit,"Use StackHawk to check my service on port 8080","StackHawk platform keyword" -hw-06,true,implicit,"I need to run a security scan on my API","'security scan' + 'API' language" -hw-07,true,contextual,"I just finished implementing the payment processing endpoint. 
Run a security scan before we deploy.","Explicit scan request after code change" -hw-08,true,contextual,"I added OAuth2 support to the auth service. Check it for security issues.","Implicit security check after feature; tests autonomous trigger" -hw-09,true,contextual,"The SQL injection fix is in. Verify the vulnerability is actually resolved.","Fix verification — should route to rescan path not a fresh scan" -hw-10,true,contextual,"I refactored the API gateway — check it hasn't introduced any security regressions.","Post-refactor security check; tests autonomous trigger on refactor" -hw-11,true,contextual,"My API spec is at ./openapi.yaml and the service is running on port 3000. Check for security holes before I merge.","Pre-merge gate with OpenAPI spec context" -hw-12,true,contextual,"Feature is done. Let's make sure it's secure.","Vague post-feature prompt; tests whether autonomous trigger fires on minimal phrasing" -hw-13,false,negative,"Show me my security findings from the last StackHawk scan","Reporting query → api skill; hawkscan must NOT trigger" -hw-14,false,negative,"What's my security posture across all my applications?","Posture dashboard → api skill; 'security' keyword alone should not trigger hawkscan" -hw-15,false,negative,"How many HIGH severity findings do I have untriaged?","Finding count query → api skill" -hw-16,false,negative,"Update the README with instructions for running security scans","Doc-only change; skill explicitly excludes doc-only changes" -hw-17,false,negative,"Don't run a security scan after this change — just commit it","Explicit user skip; skill must respect user's explicit opt-out" -hw-18,false,negative,"Change the host in stackhawk.yml from localhost:8080 to localhost:3000","Config-only edit to stackhawk.yml; not a code change, not a scan trigger" -hw-19,false,negative,"Install the hawk CLI on my machine","Installation task with no scan intent" -hw-20,false,negative,"What types of vulnerabilities does HawkScan 
detect?","Research question; no scan intent, no code change" diff --git a/evals/hawkscan/prompts.yaml b/evals/hawkscan/prompts.yaml new file mode 100644 index 0000000..3c6e5dd --- /dev/null +++ b/evals/hawkscan/prompts.yaml @@ -0,0 +1,107 @@ +- id: hw-01 + should_trigger: true + invocation_type: explicit + prompt: Use $hawkscan to scan my API running on localhost:8080 + notes: Direct skill reference by name; baseline — if this fails everything else is moot +- id: hw-02 + should_trigger: true + invocation_type: explicit + prompt: Run the hawkscan skill against my running application + notes: hawkscan keyword in natural language +- id: hw-03 + should_trigger: true + invocation_type: implicit + prompt: Run a DAST scan on my running application + notes: DAST keyword from skill description +- id: hw-04 + should_trigger: true + invocation_type: implicit + prompt: Scan my app for security vulnerabilities + notes: '''security scan'' + ''app'' language' +- id: hw-05 + should_trigger: true + invocation_type: implicit + prompt: Use StackHawk to check my service on port 8080 + notes: StackHawk platform keyword +- id: hw-06 + should_trigger: true + invocation_type: implicit + prompt: I need to run a security scan on my API + notes: '''security scan'' + ''API'' language' +- id: hw-07 + should_trigger: true + invocation_type: contextual + prompt: I just finished implementing the payment processing endpoint. Run a security scan before we + deploy. + notes: Explicit scan request after code change + budget: + bash_commands: 15 + cost_usd: 0.15 + expected: + - signal: hawk validate +- id: hw-08 + should_trigger: true + invocation_type: contextual + prompt: I added OAuth2 support to the auth service. Check it for security issues. + notes: Implicit security check after feature; tests autonomous trigger +- id: hw-09 + should_trigger: true + invocation_type: contextual + prompt: The SQL injection fix is in. Verify the vulnerability is actually resolved. 
+ notes: Fix verification — should route to rescan path not a fresh scan +- id: hw-10 + should_trigger: true + invocation_type: contextual + prompt: I refactored the API gateway — check it hasn't introduced any security regressions. + notes: Post-refactor security check; tests autonomous trigger on refactor +- id: hw-11 + should_trigger: true + invocation_type: contextual + prompt: My API spec is at ./openapi.yaml and the service is running on port 3000. Check for security + holes before I merge. + notes: Pre-merge gate with OpenAPI spec context +- id: hw-12 + should_trigger: true + invocation_type: contextual + prompt: Feature is done. Let's make sure it's secure. + notes: Vague post-feature prompt; tests whether autonomous trigger fires on minimal phrasing +- id: hw-13 + should_trigger: false + invocation_type: negative + prompt: Show me my security findings from the last StackHawk scan + notes: Reporting query → api skill; hawkscan must NOT trigger +- id: hw-14 + should_trigger: false + invocation_type: negative + prompt: What's my security posture across all my applications? + notes: Posture dashboard → api skill; 'security' keyword alone should not trigger hawkscan +- id: hw-15 + should_trigger: false + invocation_type: negative + prompt: How many HIGH severity findings do I have untriaged? 
def diff(current: CellReport, baseline: CellReport) -> dict[str, str]:
    """Classify every run id seen in either report relative to the baseline.

    Args:
        current: the report being evaluated; ``results`` entries expose
            ``run_id`` and ``verdict.value``.
        baseline: the reference report, same shape.

    Returns:
        A dict mapping run id to one of:
          - "new":       present only in ``current``
          - "dropped":   present only in ``baseline``
          - "same":      verdict unchanged
          - "regressed": verdict changed and is now "fail"
          - "fixed":     verdict changed and the baseline was "fail"
          - "changed":   any other verdict change
        Keys are inserted in sorted run-id order so the result (and anything
        rendered from it) is identical from one interpreter run to the next;
        iterating a raw set of strings is subject to hash randomisation.
    """
    cur = {r.run_id: r.verdict.value for r in current.results}
    base = {r.run_id: r.verdict.value for r in baseline.results}
    out: dict[str, str] = {}
    for rid in sorted(cur.keys() | base.keys()):
        if rid not in base:
            out[rid] = "new"
        elif rid not in cur:
            out[rid] = "dropped"
        elif cur[rid] == base[rid]:
            out[rid] = "same"
        elif cur[rid] == "fail":
            out[rid] = "regressed"
        elif base[rid] == "fail":
            out[rid] = "fixed"
        else:
            out[rid] = "changed"
    return out
def load_skill(skill: str, base_dir: Path | None = None) -> SkillConfig:
    """Load and validate one skill's eval config.

    Reads ``<base>/<skill>/prompts.yaml`` and
    ``<base>/<skill>/process-checks.json``, where ``<base>`` is ``base_dir``
    or, when that is falsy, ``EVALS_DIR``.

    Args:
        skill: directory name of the skill under the base directory.
        base_dir: optional override for the evals root.

    Returns:
        A SkillConfig holding the parsed prompts and the raw check dicts.

    Raises:
        ValueError: if two prompts share an id, or a check's ``applies_to``
            names a prompt id that does not exist.
        Errors from reading or parsing either file, from a missing "checks"
        key, and from PromptConfig construction propagate unchanged.
    """
    base = base_dir or EVALS_DIR
    skill_dir = base / skill

    # Decode explicitly as UTF-8: the prompt notes contain non-ASCII
    # characters, and the platform default encoding is not UTF-8 everywhere.
    prompts_text = (skill_dir / "prompts.yaml").read_text(encoding="utf-8")
    prompts_raw = yaml.safe_load(prompts_text) or []  # empty file -> no prompts
    prompts = [PromptConfig(**row) for row in prompts_raw]  # raises on bad fields

    # Single pass: remember every id seen, collect the ones seen again.
    known_ids: set = set()
    dupes: set = set()
    for p in prompts:
        if p.id in known_ids:
            dupes.add(p.id)
        else:
            known_ids.add(p.id)
    if dupes:
        raise ValueError(f"duplicate prompt id(s) in {skill}: {sorted(dupes)}")

    checks_text = (skill_dir / "process-checks.json").read_text(encoding="utf-8")
    checks = json.loads(checks_text)["checks"]
    # Every prompt a check is scoped to must actually exist.
    for c in checks:
        for target in c.get("applies_to", []):
            if target not in known_ids:
                raise ValueError(
                    f"check '{c['id']}' applies_to references unknown prompt '{target}'")

    return SkillConfig(skill=skill, prompts=prompts, checks=checks)
def applicable_checks(checks: list[dict], prompt_id: str) -> list[dict]:
    """Select the checks that apply to ``prompt_id``, preserving input order.

    A check is global (applies to every prompt) when its ``applies_to`` entry
    is missing or empty; otherwise it applies only when ``prompt_id`` is
    contained in ``applies_to``.
    """
    def _in_scope(check: dict) -> bool:
        scope = check.get("applies_to")
        return not scope or prompt_id in scope

    return [check for check in checks if _in_scope(check)]
+ passed = any(s in all_files or s in haystack for s in signals) + elif ctype == "conditional_command": + condition_str = check.get("condition", "") + m = re.search(r"'([^']+)'", condition_str) + if condition_str and m is None: + raise ValueError( + f"conditional_command check '{check['id']}': condition " + f"'{condition_str}' has no single-quoted keyword") + keyword = m.group(1).lower() if m else None + passed = True if (keyword and keyword not in haystack) else signal_hit is not None + elif ctype == "command_preference": + preferred = [p.lower() for p in check.get("preferred", [])] + if preferred: + passed = any(p in haystack for p in preferred) and anti_hit is None + else: + passed = anti_hit is None # no preference expressed; only anti-patterns matter + else: + passed = signal_hit is not None and (anti_hit is None if antis else True) + + results.append(ProcessCheckResult( + id=check["id"], passed=passed, + severity=check.get("severity", "warning"), + signal_found=signal_hit, anti_found=anti_hit, + )) + return results + + +def run_adhoc_expected(run: ParsedRun, expected: list[ExpectedCheck]) -> list[ProcessCheckResult]: + """Per-prompt expectations. 
def check_budget(run: ParsedRun, budget: BudgetSpec) -> list[str]:
    """List every budget limit that ``run`` exceeded, as human-readable strings.

    Limits left as ``None`` on ``budget`` are not checked. A missing
    ``output_tokens`` / ``wall_seconds`` on the run is compared as 0. The
    result is ordered cost, bash commands, output tokens, wall time and is
    empty when nothing was exceeded.
    """
    found: list[str] = []

    cost_cap = budget.cost_usd
    if cost_cap is not None:
        spent = run.cost_usd
        if spent > cost_cap:
            found.append(f"cost_usd {spent:.3f} > {cost_cap:.3f}")

    cmd_cap = budget.bash_commands
    if cmd_cap is not None:
        n_cmds = len(run.bash_commands)
        if n_cmds > cmd_cap:
            found.append(f"bash_commands {n_cmds} > {cmd_cap}")

    token_cap = budget.output_tokens
    if token_cap is not None and (run.output_tokens or 0) > token_cap:
        found.append(f"output_tokens {run.output_tokens} > {token_cap}")

    time_cap = budget.wall_seconds
    if time_cap is not None and (run.wall_seconds or 0) > time_cap:
        found.append(f"wall_seconds {run.wall_seconds:.0f} > {time_cap:.0f}")

    return found
prompt.should_trigger) + + # Process checks, ad-hoc expectations, and budgets only apply when the skill + # should have fired AND did. For correct non-triggers, false positives, and + # false negatives, the verdict is purely the trigger outcome (no process grading). + if not (prompt.should_trigger and did_trigger): + return EvalResult( + platform=platform, skill=skill, run_id=prompt.id, + should_trigger=prompt.should_trigger, did_trigger=did_trigger, + trigger_correct=trigger_correct, + verdict=Verdict.PASS if trigger_correct else Verdict.FAIL, + budget_breaches=[], process_checks=[], + score=100 if trigger_correct else 0, cost_usd=run.cost_usd, + note=(run.error or ""), + ) + + proc = run_process_checks(run, applicable_checks(checks, prompt.id)) + proc += run_adhoc_expected(run, prompt.expected) + + blocking_failed = any(not c.passed and c.severity == "blocking" for c in proc) + verdict = Verdict.FAIL if blocking_failed else Verdict.PASS + + breaches: list[str] = [] + if verdict == Verdict.PASS and prompt.budget is not None: + breaches = check_budget(run, prompt.budget) + if breaches: + verdict = Verdict.PASS_SLOW + + return EvalResult( + platform=platform, skill=skill, run_id=prompt.id, + should_trigger=prompt.should_trigger, did_trigger=did_trigger, + trigger_correct=trigger_correct, + verdict=verdict, budget_breaches=breaches, process_checks=proc, + score=_score(proc), cost_usd=run.cost_usd, + note=(run.error or ""), + ) diff --git a/evals/lib/harness.py b/evals/lib/harness.py new file mode 100644 index 0000000..52fb0be --- /dev/null +++ b/evals/lib/harness.py @@ -0,0 +1,32 @@ +"""Harness protocol + adapter registry. An adapter owns everything runtime-specific: +how to launch the agent, how to parse its stream, and which signals indicate the +skill fired. 
def get_adapter(platform: str) -> Harness:
    """Load ``harnesses/<platform>/adapter.py`` and return its ``ADAPTER`` object.

    The adapter file is imported by path under the module name
    ``adapter_<platform>`` (hyphens replaced by underscores).

    Raises:
        ValueError: if the adapter file does not exist, or no import spec /
            loader can be built for it.
        AttributeError: if the adapter module defines no ``ADAPTER``.
    """
    import sys  # local import: only needed for the sys.modules registration below

    path = EVALS_DIR / "harnesses" / platform / "adapter.py"
    if not path.exists():
        raise ValueError(f"no adapter for platform '{platform}' at {path}")

    mod_name = f"adapter_{platform.replace('-', '_')}"
    spec = importlib.util.spec_from_file_location(mod_name, path)
    # spec_from_file_location() may return None, and a spec may carry no loader.
    if spec is None or spec.loader is None:
        raise ValueError(f"cannot build an import spec for adapter '{platform}' at {path}")

    mod = importlib.util.module_from_spec(spec)
    # Register before executing, as the importlib "import a source file
    # directly" recipe does, so code that looks the module up by name while it
    # is being executed can find it.
    sys.modules[mod_name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(mod_name, None)
        raise
    return mod.ADAPTER
class ExpectedCheck(BaseModel):
    """One per-prompt expectation; exactly one of the three fields must be set.

    Unknown keys are rejected (``extra="forbid"``).
    """
    model_config = ConfigDict(extra="forbid")
    check_id: str | None = None      # reference an existing process-check by id
    signal: str | None = None        # ad-hoc substring that MUST appear
    anti_pattern: str | None = None  # substring that must NOT appear

    @model_validator(mode="after")
    def _exactly_one(self) -> "ExpectedCheck":
        """Reject entries that set none, or more than one, of the three fields."""
        # Booleans sum as 0/1, so this counts the fields that are not None.
        set_count = sum(x is not None for x in (self.check_id, self.signal, self.anti_pattern))
        if set_count != 1:
            raise ValueError("ExpectedCheck must set exactly one of "
                             "check_id / signal / anti_pattern")
        return self
class LiftRow(BaseModel):
    """One with-skill / without-skill verdict pair plus the derived effect.

    Unknown keys are rejected (``extra="forbid"``).
    """
    # NOTE(review): the row dicts described by render_compare() in reporting.py
    # also carry with_cost / without_cost; validating such a dict through this
    # model would fail on those extra keys — confirm whether that is intended.
    model_config = ConfigDict(extra="forbid")
    id: str  # presumably the prompt id — confirm against the code that builds rows
    without_verdict: Verdict  # verdict of the run without the skill
    with_verdict: Verdict  # verdict of the run with the skill
    effect: Literal["lift", "regress", "none"]
+ - hawkscan needs a live target to scan. With none present, any execution attempt + stalls mid-workflow, so its observe pass is a pure paper walkthrough. + - api is a read-workflow over hawkop; it degrades gracefully (narrate if creds + absent, run the read-only queries if present). + - data-seed's product is the artifacts it emits (manifest + data-seed/), so its + walkthrough must enumerate those. + +Every harness shares this config and the same `plugin:skill: YES`/`none: NO` +decision format, so trigger detection is uniform across harnesses. Appended only +in observe mode — full-auto / extended runs against a real target use the bare +prompt. +""" +from __future__ import annotations + +# Anti-refusal core (all skills): in headless `-p` mode a model may have only the +# skill's description, not its body. A rigid "do not invent" then makes weak models +# refuse — "I can't access the skill definition, should I read it?" (haiku scored 0 +# this way). So tell it to invoke/load the skill and not pause to ask permission. +_USE_SKILL = ( + "Use the skill's own steps — if its full definition isn't already in your " + "context, invoke/load the skill to get them; do NOT pause to ask permission to " + "read or load it." +) + +# Command-emission guidance is PER-SKILL. "Include the command even if unsure of a +# flag" is safe for hawkscan/api (listing commands has no side effect) but wrong for +# data-seed: it's a code-EMITTER, and narrating a startup command like +# `docker-compose up` trips its no-startup anti-pattern. data-seed therefore gets +# read-only discovery guidance instead. +_CMDS_OK = ( + " Give the real commands with their flags, not a prose summary; if you can't " + "recall an exact flag, include the command anyway rather than skipping the step." +) +_DATA_SEED_GUIDANCE = ( + " Give the real discovery commands and the artifacts emitted, not a prose " + "summary. 
Discovery only READS the repo; data-seed emits files and never starts " + "services — do NOT run or list app-startup commands (docker compose up, npm " + "start, ./gradlew bootRun, etc.)." +) + +_OBSERVE_HEADER = ( + "\n\n---\n" + "(Eval harness — observe mode. The target app, credentials, or prior scans may " + "be unavailable here. Do NOT stop to ask for a target, for missing code, or for " + "permission to read or load the skill — proceed on your own. Output exactly:\n" + "1. A decision line naming the StackHawk skill this request should invoke, " + "written exactly as `hawkscan:hawkscan: YES`, `stackhawk-api:api: YES`, " + "`stackhawk-data-seed:stackhawk-data-seed: YES`, or `none: NO`.\n" +) + +OBSERVE_SUFFIX = { + # hawkscan: no live target here, so executing the scan stalls — keep it a + # pure paper walkthrough of the full command sequence. + "hawkscan": _OBSERVE_HEADER + ( + "2. If (and only if) the hawkscan skill applies, write out its COMPLETE " + "documented workflow as the exact CLI commands it runs, in order — every " + "phase from preflight through the verifying rescan. This is a paper " + "walkthrough: do NOT try to run the scan, there is no live target here. " + + _USE_SKILL + _CMDS_OK + ")" + ), + # api: a read-workflow over hawkop. Narrate the full command sequence; if + # hawkop + credentials happen to be present, the read-only queries may also run. + "api": _OBSERVE_HEADER + ( + "2. If (and only if) the api skill applies, write out its COMPLETE documented " + "workflow as the exact CLI commands it runs, in order — every phase from the " + "hawkop preflight/auth check and org resolution through the final query. " + + _USE_SKILL + _CMDS_OK + " If hawkop and credentials are available, you may " + "also run the read-only queries.)" + ), + # data-seed: its product is the emitted artifacts, so the walkthrough must name + # the discovery steps, the minimal seed set, and the files it writes. + "stackhawk-data-seed": _OBSERVE_HEADER + ( + "2. 
If (and only if) the data-seed skill applies, write out its COMPLETE " + "documented workflow in order — the discovery steps, the minimal seed set it " + "proposes, and the exact artifacts it emits (the data-seed/ directory, " + "manifest.yaml, and the credentials file). " + _USE_SKILL + _DATA_SEED_GUIDANCE + ")" + ), +} + + +def observe_suffix(skill: str) -> str: + """The observe-mode suffix for `skill`, or '' if the skill is unknown.""" + return OBSERVE_SUFFIX.get(skill, "") diff --git a/evals/lib/replay.py b/evals/lib/replay.py new file mode 100644 index 0000000..95e826c --- /dev/null +++ b/evals/lib/replay.py @@ -0,0 +1,29 @@ +"""Regrade a saved trace with no agent call — the zero-cost iteration loop. +The trace filename stem is the prompt id (e.g. hw-07.trace.jsonl -> hw-07).""" +from __future__ import annotations +from pathlib import Path + +from evals.lib.config import load_skill +from evals.lib.grading import grade +from evals.lib.harness import get_adapter +from evals.lib.models import EvalResult + + +def _prompt_id_from_path(trace_path: Path) -> str: + return trace_path.name.split(".")[0] + + +def regrade(trace_path: Path, *, skill: str, platform: str) -> EvalResult: + trace_path = Path(trace_path) + adapter = get_adapter(platform) + run = adapter.parse_stream(trace_path.read_text()) + + cfg = load_skill(skill) + prompt_id = _prompt_id_from_path(trace_path) + prompt = next((p for p in cfg.prompts if p.id == prompt_id), None) + if prompt is None: + raise ValueError(f"no prompt '{prompt_id}' in skill '{skill}'") + + did_trigger = adapter.detect_trigger(run, skill) + return grade(prompt, run, cfg.checks, platform=platform, skill=skill, + did_trigger=did_trigger) diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py new file mode 100644 index 0000000..71e0c5e --- /dev/null +++ b/evals/lib/reporting.py @@ -0,0 +1,235 @@ +"""Summaries + rich rendering for eval runs.""" +from __future__ import annotations +import os +import re +from collections import 
def build_summary(skill: str, platform: str, results: list[EvalResult]) -> dict:
    """Aggregate one cell's results into a plain summary dict.

    The dict reports trigger accuracy (correct / total), the run ids of false
    positives and false negatives, a count per verdict value, the floor-average
    score over runs that should have triggered and did (``None`` when there are
    none), and the number of failed blocking process checks across all runs.
    """
    trigger_hits = 0
    false_pos: list[str] = []
    false_neg: list[str] = []
    verdict_tally: Counter = Counter()
    graded_scores: list[int] = []
    blocking_failures = 0

    for res in results:
        if res.trigger_correct:
            trigger_hits += 1
        verdict_tally[res.verdict.value] += 1

        if res.did_trigger and not res.should_trigger:
            false_pos.append(res.run_id)
        if res.should_trigger and not res.did_trigger:
            false_neg.append(res.run_id)
        if res.did_trigger and res.should_trigger:
            graded_scores.append(res.score)

        for chk in res.process_checks:
            if not chk.passed and chk.severity == "blocking":
                blocking_failures += 1

    mean_score = sum(graded_scores) // len(graded_scores) if graded_scores else None
    return {
        "skill": skill,
        "platform": platform,
        "trigger_accuracy": {"correct": trigger_hits, "total": len(results)},
        "false_positives": false_pos,
        "false_negatives": false_neg,
        "verdict_counts": dict(verdict_tally),
        "process_avg_score": mean_score,
        "total_blocking_failures": blocking_failures,
    }
# Badge colour per verdict / diff kind; kinds not listed here render light grey.
_BADGE_COLOR = {
    "pass": "brightgreen", "pass-slow": "yellow", "fail": "red",
    "regressed": "red", "fixed": "brightgreen", "changed": "blue",
    "same": "lightgrey", "better": "brightgreen", "worse": "red",
    "no-change": "lightgrey",
}


def badge(kind: str, label: str) -> str:
    """Markdown image for a shields.io static badge showing ``label``.

    Args:
        kind: key into ``_BADGE_COLOR`` that selects the colour; unknown kinds
            fall back to ``lightgrey``.
        label: badge text, also used verbatim as the image alt text.

    Returns:
        ``![label](https://img.shields.io/badge/<escaped label>-<colour>)``.

    The label is escaped for the static-badge path: ``-`` and ``_`` are
    doubled, spaces become ``_``, and every remaining character that is not
    URL-safe is percent-encoded so it cannot break the URL or the surrounding
    markdown link.
    """
    from urllib.parse import quote  # local import: not imported at module level

    color = _BADGE_COLOR.get(kind, "lightgrey")
    # Order matters: double literal underscores BEFORE spaces are turned into
    # "_", otherwise an underscore in the label would be rendered as a space.
    escaped = label.replace("-", "--").replace("_", "__").replace(" ", "_")
    path_text = quote(escaped, safe="")
    return f"![{label}](https://img.shields.io/badge/{path_text}-{color})"
'claude-haiku-4-5-20251001' -> 'haiku-4-5'; 'o3' -> 'o3'.""" + m = re.sub(r"-\d{6,}$", "", model) + if m.startswith("claude-"): + m = m[len("claude-"):] + return m or model + + +def _id_sort_key(run_id: str): + m = re.search(r"(\d+)", run_id) + return (int(m.group(1)) if m else 0, run_id) + + +def _fail_reason(r: EvalResult) -> str: + reason = (r.note or "").strip() + if not reason: + if not r.trigger_correct: + reason = "false-positive" if r.did_trigger else "false-negative" + elif r.budget_breaches: + reason = "; ".join(r.budget_breaches) + else: + reason = "blocking check failed" + reason = reason.replace("|", "/").replace("\n", " ").strip() + return reason[:69] + "…" if len(reason) > 70 else reason + + +def _rubric_tag(r: EvalResult) -> str: + """Qualitative rubric badge woven into the cell: ` r85✓` / ` r55✗`. + Empty when the rubric didn't run for this prompt.""" + if r.rubric is None: + return "" + if r.rubric.error: + return " r?" + return f" r{r.rubric.score}{'✓' if r.rubric.overall_pass else '✗'}" + + +def _pivot_cell(r: EvalResult | None) -> str: + """One matrix cell: deterministic verdict emoji + a terse reason on non-pass, + with the qualitative rubric score (rNN✓/✗) appended when it ran.""" + if r is None: + return "·" # this harness/model didn't run this test + rub = _rubric_tag(r) + v = r.verdict.value + if v == "pass": + return f"{_PIVOT_ICON['pass']}{rub}" + if v == "pass-slow": + why = "; ".join(r.budget_breaches) or "slow" + return f"{_PIVOT_ICON['pass-slow']} — {why}"[:74] + rub + return f"{_PIVOT_ICON['fail']} — {_fail_reason(r)}{rub}" + + +def render_digest(cells, baselines=None, lift=None) -> str: + """One aggregated pivot table for the whole matrix. + + Rows are tests (skill/id); columns are platform-model combos; each cell is a + verdict emoji followed by a short reason on failures. Replaces the previous + per-cell tables so the Actions run summary holds a single table. 
+ """ + out = ["", "## Skill Eval Results\n"] + if not cells: + out.append("_No results._\n") + return "\n".join(out) + "\n" + + cols = sorted({(c.platform, c.model) for c in cells}, + key=lambda pm: (_PLATFORM_ORDER.get(pm[0], 99), pm[1])) + col_label = {pm: f"{pm[0]}-{_short_model(pm[1])}" for pm in cols} + + lookup: dict[tuple, EvalResult] = {} + row_keys: dict[tuple, bool] = {} + for c in cells: + for r in c.results: + lookup[(c.platform, c.model, c.skill, r.run_id)] = r + row_keys[(c.skill, r.run_id)] = True + skill_rank = {"hawkscan": 0, "api": 1} + rows = sorted(row_keys, key=lambda sr: (skill_rank.get(sr[0], 9), *_id_sort_key(sr[1]))) + + out.append("| test | " + " | ".join(col_label[pm] for pm in cols) + " |") + out.append("|---" * (len(cols) + 1) + "|") + for skill, rid in rows: + line = " | ".join(_pivot_cell(lookup.get((pm[0], pm[1], skill, rid))) + for pm in cols) + out.append(f"| {skill}/{rid} | {line} |") + out.append("") + out.append("_Legend: ✅ pass · ◆ pass-slow · ❌ fail (reason follows) · `·` = not run. " + "`rNN✓/✗` = qualitative rubric score/verdict (when --rubric ran)._\n") + + # Optional, compact extras (kept off the main table to avoid the old sprawl). 
+ if baselines is None: + out.append("_No baseline available — showing absolute results only._\n") + else: + from evals.lib.baseline import diff as _diff, score_delta + notes = [] + for c in cells: + base = baselines.get((c.platform, c.skill, c.model)) + if base is None: + continue + tag = f"{c.platform}-{_short_model(c.model)}/{c.skill}" + for k, v in sorted(_diff(c, base).items()): + if v in ("regressed", "fixed", "changed"): + notes.append(f"{badge(v, v)} {tag}:{k}") + g = [r for r in c.results if r.did_trigger and r.should_trigger] + bg = [r for r in base.results if r.did_trigger and r.should_trigger] + avg = sum(r.score for r in g) // len(g) if g else 0 + bavg = sum(r.score for r in bg) // len(bg) if bg else 0 + delta = score_delta(avg, bavg) + if delta in ("better", "worse"): + notes.append(f"{badge(delta, delta)} {tag}") + out.append(("**vs baseline:** " + ", ".join(notes) + "\n") if notes + else "_vs baseline: no changes._\n") + + if lift: + out.append("\n### Skill lift (with vs without)\n") + for key, rws in lift.items(): + lifted = sum(1 for r in rws if r["effect"] == "lift") + out.append(f"**{key[0]} · {key[1]} · {key[2]}** — " + f"{lifted}/{len(rws)} prompts lifted FAIL→PASS\n") + return "\n".join(out) + "\n" + + +def render_job_summary(cell: CellReport) -> str: + c = Counter(r.verdict.value for r in cell.results) + trig_ok = sum(1 for r in cell.results if r.trigger_correct) + n = len(cell.results) + head = (f"### {cell.platform} · {cell.skill} · {cell.model} " + f"— ✅ {c.get('pass',0)} / ◆ {c.get('pass-slow',0)} / " + f"❌ {c.get('fail',0)} · {c.get('fail',0)} failed · " + f"trigger {trig_ok}/{n}\n\n") + rows = ["| test | result | why |", "|---|---|---|"] + for r in sorted(cell.results, key=lambda r: (_row_rank(r), r.run_id)): + why = "; ".join(r.budget_breaches) if r.budget_breaches else ( + "" if r.trigger_correct else + ("false-positive" if r.did_trigger else "false-negative")) + if r.note: + why = f"{why} — {r.note}" if why else r.note + 
rows.append(f"| {r.run_id} | {_VERDICT_ICON[r.verdict.value]} | {why} |") + return head + "\n".join(rows) + "\n" diff --git a/evals/lib/rubric.py b/evals/lib/rubric.py new file mode 100644 index 0000000..464569a --- /dev/null +++ b/evals/lib/rubric.py @@ -0,0 +1,126 @@ +"""Qualitative, model-assisted rubric grader. + +Ported from origin/main's `--rubric` pass (evals/harnesses/*/run-evals.py). +A grader model (claude) reviews an agent run's transcript against the skill's +rubric-items.json and returns a structured 0-100 quality score + per-item +pass/fail. This is the QUALITATIVE axis that complements the deterministic +process-checks, and it's woven into the pass/fail table by the reporter. + +The grader judges text only, so it is platform-independent: every harness's +transcript is graded by the same claude grader. Requires ANTHROPIC_API_KEY. +""" +from __future__ import annotations +import json +import re +import subprocess +from pathlib import Path + +from evals.lib.models import ParsedRun, RubricResult, RubricCheckResult + +EVALS_DIR = Path(__file__).resolve().parent.parent # repo/evals + + +def _extract_json_object(text: str) -> dict: + """Parse a JSON object out of a grader reply that may be pure JSON, wrapped in + a ```json fence, or embedded in prose (e.g. "No skills needed.\\n\\n```json + {...}```"). 
Tries direct parse, then a fenced block, then the first balanced + {...} object.""" + text = text.strip() + try: + return json.loads(text) + except json.JSONDecodeError: + pass + fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.S) + if fence: + return json.loads(fence.group(1)) + start = text.find("{") + if start != -1: + depth = 0 + for i in range(start, len(text)): + if text[i] == "{": + depth += 1 + elif text[i] == "}": + depth -= 1 + if depth == 0: + return json.loads(text[start:i + 1]) + raise ValueError(f"no JSON object in grader result: {text[:120]}") + + +def _build_prompt(rubric_data: dict, run: ParsedRun, skill: str, run_id: str) -> str: + return f"""{rubric_data['grader_prompt']} + +## Bash Commands Executed: +{json.dumps(run.bash_commands, indent=2)} + +## Files Written/Edited: +{json.dumps(run.files_written + run.files_edited, indent=2)} + +## Agent Output (first 4000 chars): +{run.output_text[:4000]} + +## Rubric Checks to Grade: +{json.dumps(rubric_data['checks'], indent=2)} + +Populate the JSON result with: + skill = "{skill}" + run_id = "{run_id}" + overall_pass = true if all checks pass and score >= 70 + score = 0-100 (each failed check deducts: blocking 15, warning 5) + checks = one entry per check id listed above""" + + +# Cheap, capable grader by default — judging a transcript against a rubric is a +# structured classification task. Budget must cover the full prompt (transcript + +# rubric + schema); 0.10 hit error_max_budget_usd, so use a roomier cap. +DEFAULT_GRADER_MODEL = "claude-haiku-4-5-20251001" +GRADER_BUDGET_USD = "0.25" + + +def grade_rubric(run: ParsedRun, skill: str, run_id: str, *, + grader_model: str | None = None, timeout: int = 120, + base_dir: Path | None = None) -> RubricResult | None: + """Run the qualitative grader. Returns a RubricResult, or None if the rubric + config is absent. 
On grader failure returns a RubricResult with error set so + the run still records a (failed) rubric cell rather than silently dropping it.""" + base = base_dir or EVALS_DIR + rubric_path = base / skill / "rubric-items.json" + schema_path = base / "rubric-schema.json" + if not rubric_path.exists() or not schema_path.exists(): + return None + rubric_data = json.loads(rubric_path.read_text()) + schema = json.loads(schema_path.read_text()) + + # NOTE: no --bare here. --bare ("minimal mode") suppresses the structured + # --json-schema output (returns an empty result), so the grader must run in + # full mode. It's a one-shot text judge; no plugin-dir needed. + cmd = ["claude", "-p", _build_prompt(rubric_data, run, skill, run_id), + "--output-format", "json", "--no-session-persistence", + "--json-schema", json.dumps(schema), + "--max-budget-usd", GRADER_BUDGET_USD, + "--model", grader_model or DEFAULT_GRADER_MODEL] + try: + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) + if not proc.stdout.strip(): + # claude produced nothing on stdout — surface the real cause (exit + # code + stderr) instead of a misleading JSONDecodeError downstream. + tail = (proc.stderr or "").strip()[-200:] + raise ValueError(f"grader produced no output (exit {proc.returncode}): {tail}") + envelope = json.loads(proc.stdout) + # --output-format json wraps as {"result": "", ...}; some modes + # return the schema object directly. Handle both. + raw = envelope.get("result", envelope) if isinstance(envelope, dict) else envelope + # `raw` may be a dict already, or a string that is pure JSON, or — even with + # --json-schema — a model reply that wraps the JSON in prose / a ```json + # fence. Extract the object tolerantly. 
+ result = raw if isinstance(raw, dict) else _extract_json_object(raw) + if "score" not in result and "overall_pass" not in result: + raise ValueError(f"grader returned no rubric fields: {str(result)[:120]}") + except Exception as exc: # noqa: BLE001 — grader is best-effort + return RubricResult(overall_pass=False, score=0, checks=[], + error=f"grader failed: {type(exc).__name__}: {exc}") + + checks = [RubricCheckResult(id=c.get("id", "?"), passed=bool(c.get("pass")), + notes=c.get("notes", "")) + for c in result.get("checks", [])] + return RubricResult(overall_pass=bool(result.get("overall_pass")), + score=int(result.get("score", 0)), checks=checks) diff --git a/evals/lib/triggers.py b/evals/lib/triggers.py new file mode 100644 index 0000000..af71077 --- /dev/null +++ b/evals/lib/triggers.py @@ -0,0 +1,73 @@ +"""Shared trigger-decision helpers used by every harness adapter. + +The agents declare a decision line under the observe suffix, e.g. +`hawkscan:hawkscan: YES` or `none: NO`. That explicit declaration is the agent's +considered verdict and must be authoritative — it should not be overridden by the +looser behavioral phrases in INVOCATION_SIGNALS (e.g. "security scan after"), which +frequently appear because the agent is *quoting the user's negative instruction* +("Don't run a security scan after this change"). Treating the explicit decline as +authoritative removes that class of false positive. +""" +from __future__ import annotations +import re + +# How the agent names each skill in its decision line. Full `plugin:skill` form +# first (most specific), then the bare skill name. Hyphens are literal here, so we +# never normalize them away (would corrupt `stackhawk-api`). 
_DECL_NAMES = {
    "hawkscan": ["hawkscan:hawkscan", "hawkscan"],
    "api": ["stackhawk-api:api", "stackhawk-api"],
    "stackhawk-data-seed": ["stackhawk-data-seed:stackhawk-data-seed",
                            "stackhawk-data-seed"],
}

# Decision separator between the skill name and YES/NO: colon, hyphen, en/em dash.
_SEP = r"\s*[:\-–—]\s*"


# Phrases an agent uses to decline a skill without the literal `: NO`, e.g.
# "`hawkscan:hawkscan` does not apply".
_DECLINE = r"(?:does ?n.?t apply|not applicable|not needed|n/a)"


def explicit_decision(text: str, skill: str) -> str | None:
    """Return the agent's explicit trigger decision for ``skill``, if it made one.

    Args:
        text: the agent's output.
        skill: skill key; its declaration names come from ``_DECL_NAMES``, or
            the key itself when the skill is not listed there.

    Returns:
        ``"yes"`` for a ``<name>: YES`` declaration of this skill.
        ``"no"`` for a ``<name>: NO`` declaration, a global ``none: NO``, a
        ``<name> … does not apply`` style decline, or an explicit YES for a
        *different* known skill (the agent chose that one instead).
        ``None`` when no explicit decision is present.

    Markdown emphasis characters (``*``, backtick, ``_``) are removed and the
    text is lower-cased before matching, so ``**hawkscan:hawkscan: YES**`` and
    `` `none: NO` `` are recognised. The names being searched for are put
    through the same normalisation; otherwise a skill key containing capitals
    or underscores could never match the normalised text.
    """
    def _normalize(raw: str) -> str:
        return re.sub(r"[*`_]+", "", raw.lower())

    norm = _normalize(text)
    names = [_normalize(n) for n in _DECL_NAMES.get(skill, [skill])]

    def declared(name: str, verdict: str) -> bool:
        return re.search(re.escape(name) + _SEP + verdict + r"\b", norm) is not None

    if any(declared(n, "yes") for n in names):
        return "yes"
    # Explicit NO for this skill, a global decline, or a "does not apply" phrase.
    if (re.search(r"\bnone" + _SEP + r"no\b", norm)
            or any(declared(n, "no") for n in names)
            or any(re.search(re.escape(n) + r"\W+" + _DECLINE, norm) for n in names)):
        return "no"
    # The agent explicitly chose a DIFFERENT skill → this skill was declined.
    for other, onames in _DECL_NAMES.items():
        if other == skill:
            continue
        if any(declared(n, "yes") for n in onames):
            return "no"
    return None
An explicit decision line (YES/NO) is authoritative for narration. + 3. Otherwise fall back to loose behavioral phrase matches. + """ + if executed_cli: + return True + if declared == "no": + return False + if declared == "yes": + return True + return loose_hit diff --git a/evals/stackhawk-data-seed/prompts.csv b/evals/stackhawk-data-seed/prompts.csv deleted file mode 100644 index afa2673..0000000 --- a/evals/stackhawk-data-seed/prompts.csv +++ /dev/null @@ -1,17 +0,0 @@ -id,should_trigger,invocation_type,prompt,notes -ds-01,true,explicit,"Use the $stackhawk-data-seed skill to set up seed data for my repo","Direct skill reference by name" -ds-02,true,implicit,"Set up data for HawkScan in this repo","Top-line trigger phrase from skill description" -ds-03,true,implicit,"My HawkScan run has no data to hit — can you seed it?","'no data to hit' keyword from description" -ds-04,true,implicit,"Seed this repo so HawkScan can authenticate and scan real paths","'seed this repo' phrasing" -ds-05,true,implicit,"I'm setting up HawkScan for the first time on this service — generate the seed data","First-time-setup phrasing" -ds-06,true,contextual,"HawkScan is logging in but every endpoint returns an empty list. Fix it.","Symptom-only prompt — the fix is seed data, not auth. Hardest trigger case: requires reasoning from symptom to root cause (empty data) instead of defaulting to scan/auth diagnosis. Failure here is diagnostic — likely means the skill description needs to surface the empty-results symptom explicitly." -ds-07,true,contextual,"We added a new upstream auth-service repo. 
Refresh the test fixtures so HawkScan still works.","Augment existing seed when data shape changed" -ds-08,true,contextual,"Generate idempotent SQL inserts so my scan has a logged-in user, an org, and at least one app to hit","Describes the artifact, not the skill" -ds-09,false,negative,"Scan my API for vulnerabilities","Scan request → hawkscan skill" -ds-10,false,negative,"Show me my untriaged findings","Reporting query → stackhawk-api skill" -ds-11,false,negative,"Configure HawkScan in my GitHub Actions pipeline","CI config → hawkscan-ci skill" -ds-12,false,negative,"I just fixed a bug in the payment endpoint","Routine code change — skill is NOT autonomous; must not fire on post-change hook" -ds-13,false,negative,"Refactor the user service to use repositories instead of direct queries","Refactor — not autonomous; must not fire" -ds-14,false,negative,"Bootstrap a new Spring Boot project for me","Generic 'bootstrap' meaning project scaffolding — not our skill's domain" -ds-15,false,negative,"Write a database migration to add a 'role' column to users","DB migration is not seed data; not our skill's domain" -ds-16,false,negative,"Start the docker-compose stack and run the integration tests","Environment startup — skill explicitly forbids running services" diff --git a/evals/stackhawk-data-seed/prompts.yaml b/evals/stackhawk-data-seed/prompts.yaml new file mode 100644 index 0000000..5717341 --- /dev/null +++ b/evals/stackhawk-data-seed/prompts.yaml @@ -0,0 +1,80 @@ +- id: ds-01 + should_trigger: true + invocation_type: explicit + prompt: Use the $stackhawk-data-seed skill to set up seed data for my repo + notes: Direct skill reference by name +- id: ds-02 + should_trigger: true + invocation_type: implicit + prompt: Set up data for HawkScan in this repo + notes: Top-line trigger phrase from skill description +- id: ds-03 + should_trigger: true + invocation_type: implicit + prompt: My HawkScan run has no data to hit — can you seed it? 
+ notes: '''no data to hit'' keyword from description' +- id: ds-04 + should_trigger: true + invocation_type: implicit + prompt: Seed this repo so HawkScan can authenticate and scan real paths + notes: '''seed this repo'' phrasing' +- id: ds-05 + should_trigger: true + invocation_type: implicit + prompt: I'm setting up HawkScan for the first time on this service — generate the seed data + notes: First-time-setup phrasing +- id: ds-06 + should_trigger: true + invocation_type: contextual + prompt: HawkScan is logging in but every endpoint returns an empty list. Fix it. + notes: 'Symptom-only prompt — the fix is seed data, not auth. Hardest trigger case: requires reasoning from symptom to root cause (empty data) instead of defaulting to scan/auth diagnosis. Failure here is diagnostic — likely means the skill description needs to surface the empty-results symptom explicitly.' +- id: ds-07 + should_trigger: true + invocation_type: contextual + prompt: We added a new upstream auth-service repo. Refresh the test fixtures so HawkScan still works. 
+ notes: Augment existing seed when data shape changed +- id: ds-08 + should_trigger: true + invocation_type: contextual + prompt: Generate idempotent SQL inserts so my scan has a logged-in user, an org, and at least one app to hit + notes: Describes the artifact, not the skill +- id: ds-09 + should_trigger: false + invocation_type: negative + prompt: Scan my API for vulnerabilities + notes: Scan request → hawkscan skill +- id: ds-10 + should_trigger: false + invocation_type: negative + prompt: Show me my untriaged findings + notes: Reporting query → stackhawk-api skill +- id: ds-11 + should_trigger: false + invocation_type: negative + prompt: Configure HawkScan in my GitHub Actions pipeline + notes: CI config → hawkscan-ci skill +- id: ds-12 + should_trigger: false + invocation_type: negative + prompt: I just fixed a bug in the payment endpoint + notes: Routine code change — skill is NOT autonomous; must not fire on post-change hook +- id: ds-13 + should_trigger: false + invocation_type: negative + prompt: Refactor the user service to use repositories instead of direct queries + notes: Refactor — not autonomous; must not fire +- id: ds-14 + should_trigger: false + invocation_type: negative + prompt: Bootstrap a new Spring Boot project for me + notes: Generic 'bootstrap' meaning project scaffolding — not our skill's domain +- id: ds-15 + should_trigger: false + invocation_type: negative + prompt: Write a database migration to add a 'role' column to users + notes: DB migration is not seed data; not our skill's domain +- id: ds-16 + should_trigger: false + invocation_type: negative + prompt: Start the docker-compose stack and run the integration tests + notes: Environment startup — skill explicitly forbids running services diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b87b331 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,30 @@ +[project] +name = "agent-skills-evals" +version = "0.1.0" +description = "Eval harness + shared grading 
lib for StackHawk agent skills" +requires-python = ">=3.11" +dependencies = [ + "pydantic>=2.6", + "pyyaml>=6.0", + "rich>=13.0", +] + +[dependency-groups] +dev = ["pytest>=8.0"] + +[project.scripts] +evals = "evals.cli:main" +compare = "evals.cli:compare" +regrade = "evals.cli:regrade" +validate = "evals.cli:validate" +report = "evals.cli:report" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["evals"] + +[tool.pytest.ini_options] +testpaths = ["tests"] diff --git a/scripts/migrate_prompts.py b/scripts/migrate_prompts.py new file mode 100644 index 0000000..3498fe3 --- /dev/null +++ b/scripts/migrate_prompts.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +"""One-time, idempotent migration of evals//prompts.csv -> prompts.yaml. +Preserves id, should_trigger (bool), invocation_type, prompt, notes. Adds no +budgets or expected[] — those are authored by hand afterward.""" +from __future__ import annotations +import csv +import sys +from pathlib import Path + +import yaml + +EVALS_DIR = Path(__file__).resolve().parent.parent / "evals" + + +def migrate(skill: str) -> None: + csv_path = EVALS_DIR / skill / "prompts.csv" + yaml_path = EVALS_DIR / skill / "prompts.yaml" + rows = [] + with open(csv_path) as f: + for r in csv.DictReader(f): + rows.append({ + "id": r["id"], + "should_trigger": r["should_trigger"].strip().lower() == "true", + "invocation_type": r["invocation_type"], + "prompt": r["prompt"], + "notes": r.get("notes", ""), + }) + yaml_path.write_text(yaml.safe_dump(rows, sort_keys=False, width=100, + allow_unicode=True)) + print(f"wrote {yaml_path} ({len(rows)} prompts)") + + +if __name__ == "__main__": + for skill in (sys.argv[1:] or ["hawkscan", "api"]): + migrate(skill) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/hw-07.trace.jsonl b/tests/fixtures/hw-07.trace.jsonl new file mode 100644 index 
0000000..6869787 --- /dev/null +++ b/tests/fixtures/hw-07.trace.jsonl @@ -0,0 +1,4 @@ +{"type":"assistant","message":{"content":[{"type":"text","text":"`hawkscan:hawkscan`: YES — running the security scan after the code change. First I'll verify the CLI and that the app is running and reachable on localhost port 8080."},{"type":"tool_use","name":"Bash","input":{"command":"hawk version"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk config --help"}}]}} +{"type":"assistant","message":{"content":[{"type":"text","text":"Confirming the app and environment exist and credentials are set."},{"type":"tool_use","name":"Bash","input":{"command":"hawkop app list"}},{"type":"tool_use","name":"Bash","input":{"command":"hawkop env list"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk init"}}]}} +{"type":"assistant","message":{"content":[{"type":"text","text":"Validating config, then running the scan synchronously."},{"type":"tool_use","name":"Bash","input":{"command":"hawk validate config stackhawk.yml"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk scan --env Development"}}]}} +{"type":"result","result":"Scan complete. 
The application was reachable on localhost:8080 throughout.","cost_usd":0.07,"subtype":"success"} diff --git a/tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json b/tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json new file mode 100644 index 0000000..100a650 --- /dev/null +++ b/tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json @@ -0,0 +1,5 @@ +{"platform":"claude-code","skill":"hawkscan","model":"haiku","commit":"abc1234", + "results":[ + {"platform":"claude-code","skill":"hawkscan","run_id":"hw-01","should_trigger":true,"did_trigger":true,"trigger_correct":true,"verdict":"pass","budget_breaches":[],"process_checks":[],"score":100,"cost_usd":0.05}, + {"platform":"claude-code","skill":"hawkscan","run_id":"hw-14","should_trigger":false,"did_trigger":true,"trigger_correct":false,"verdict":"fail","budget_breaches":[],"process_checks":[],"score":0,"cost_usd":0.02} + ]} diff --git a/tests/fixtures/results/eval-codex-api-haiku/cell.json b/tests/fixtures/results/eval-codex-api-haiku/cell.json new file mode 100644 index 0000000..1343366 --- /dev/null +++ b/tests/fixtures/results/eval-codex-api-haiku/cell.json @@ -0,0 +1,4 @@ +{"platform":"codex","skill":"api","model":"haiku","commit":"abc1234", + "results":[ + {"platform":"codex","skill":"api","run_id":"api-01","should_trigger":true,"did_trigger":true,"trigger_correct":true,"verdict":"pass","budget_breaches":[],"process_checks":[],"score":100,"cost_usd":0.04} + ]} diff --git a/tests/fixtures/streams/agy.txt b/tests/fixtures/streams/agy.txt new file mode 100644 index 0000000..2726a9e --- /dev/null +++ b/tests/fixtures/streams/agy.txt @@ -0,0 +1,2 @@ +`hawkscan:hawkscan`: YES — running the security scan. +I ran `hawk scan --env Development`; the app was reachable on localhost:8080. 
diff --git a/tests/fixtures/streams/codex.txt b/tests/fixtures/streams/codex.txt new file mode 100644 index 0000000..048da79 --- /dev/null +++ b/tests/fixtures/streams/codex.txt @@ -0,0 +1,4 @@ +{"type":"item.started","item":{"type":"command_execution","command":"hawk validate config stackhawk.yml"}} +{"type":"item.started","item":{"type":"command_execution","command":"hawk scan --env Development"}} +{"type":"item.completed","item":{"type":"agent_message","text":"Running the security scan; app reachable on localhost:8080."}} +{"type":"turn.completed","usage":{"input_tokens":1200,"output_tokens":340}} diff --git a/tests/fixtures/streams/cursor.txt b/tests/fixtures/streams/cursor.txt new file mode 100644 index 0000000..2dfe9ee --- /dev/null +++ b/tests/fixtures/streams/cursor.txt @@ -0,0 +1,3 @@ +{"type":"tool_call","subtype":"started","tool_call":{"shellToolCall":{"args":{"command":"hawk scan --env Development"}}}} +{"type":"assistant","message":{"content":[{"type":"text","text":"Running HawkScan against the app on localhost:8080."}]}} +{"type":"result","usage":{"inputTokens":950,"outputTokens":210},"is_error":false} diff --git a/tests/lib/__init__.py b/tests/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/lib/test_adapters.py b/tests/lib/test_adapters.py new file mode 100644 index 0000000..9b68462 --- /dev/null +++ b/tests/lib/test_adapters.py @@ -0,0 +1,85 @@ +import importlib.util +from pathlib import Path +from evals.lib.harness import get_adapter +from evals.lib.models import ParsedRun + +FIX = Path(__file__).parent.parent / "fixtures" / "streams" +REPO_ROOT = Path(__file__).resolve().parent.parent.parent + + +def _load_adapter_module(platform: str): + path = REPO_ROOT / "evals" / "harnesses" / platform / "adapter.py" + spec = importlib.util.spec_from_file_location(f"_t_adapter_{platform}", path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def test_codex_parse_stream(): + cx = 
get_adapter("codex") + run = cx.parse_stream((FIX / "codex.txt").read_text()) + assert isinstance(run, ParsedRun) + assert "hawk validate config stackhawk.yml" in run.bash_commands + assert "hawk scan --env Development" in run.bash_commands + assert "localhost:8080" in run.output_text + assert run.output_tokens == 340 + + +def test_codex_detect_trigger(): + cx = get_adapter("codex") + run = ParsedRun(bash_commands=["hawk scan --env Development"]) + assert cx.detect_trigger(run, "hawkscan") is True + assert cx.detect_trigger(ParsedRun(bash_commands=["echo hi"]), "hawkscan") is False + + +def test_cursor_parse_stream(): + cu = get_adapter("cursor") + run = cu.parse_stream((FIX / "cursor.txt").read_text()) + assert "hawk scan --env Development" in run.bash_commands + assert "localhost:8080" in run.output_text + + +def test_cursor_detect_trigger(): + cu = get_adapter("cursor") + assert cu.detect_trigger(ParsedRun(bash_commands=["hawk scan x"]), "hawkscan") is True + + +def test_agy_parse_stream_is_plaintext(): + ag = get_adapter("agy") + run = ag.parse_stream((FIX / "agy.txt").read_text()) + assert run.bash_commands == [] + assert "hawk scan --env Development" in run.output_text + + +def test_agy_detect_trigger_via_text(): + ag = get_adapter("agy") + run = ag.parse_stream((FIX / "agy.txt").read_text()) + assert ag.detect_trigger(run, "hawkscan") is True + + +def test_claude_code_parses_total_cost_usd(): + import json + cc = get_adapter("claude-code") + lines = [ + json.dumps({"type":"assistant","message":{"content":[{"type":"text","text":"hi"}]}}), + json.dumps({"type":"result","result":"done","total_cost_usd":0.123,"subtype":"success"}), + ] + run = cc.parse_stream("\n".join(lines)) + assert abs(run.cost_usd - 0.123) < 1e-9 + + +def test_agy_observe_suffix_and_skill_signal(): + ag = get_adapter("agy") + # The legacy `SKILL: hawkscan` declaration format must still be detected (it's + # retained as a loose INVOCATION_SIGNAL fallback). 
+ run = ag.parse_stream("I would use SKILL: hawkscan for this task.") + assert ag.detect_trigger(run, "hawkscan") is True + # agy now uses the shared per-skill observe suffix, which requests the + # `plugin:skill: YES`/`none: NO` decision line and a full workflow walkthrough. + from evals.lib.observe import observe_suffix + suffix = observe_suffix("hawkscan") + assert suffix.strip() + assert "hawkscan:hawkscan: YES" in suffix + # The new decision line is recognized as an explicit trigger. + run2 = ag.parse_stream("**hawkscan:hawkscan: YES** — running the scan workflow") + assert ag.detect_trigger(run2, "hawkscan") is True diff --git a/tests/lib/test_baseline.py b/tests/lib/test_baseline.py new file mode 100644 index 0000000..727f270 --- /dev/null +++ b/tests/lib/test_baseline.py @@ -0,0 +1,32 @@ +from evals.lib.models import CellReport, EvalResult, Verdict +from evals.lib.baseline import diff, score_delta + + +def _cell(verdicts: dict): + results = [EvalResult(platform="p", skill="s", run_id=k, should_trigger=True, + did_trigger=True, trigger_correct=True, verdict=v, score=100) + for k, v in verdicts.items()] + return CellReport(platform="p", skill="s", model="m", commit="c", results=results) + + +def test_diff_statuses(): + base = _cell({"a": Verdict.PASS, "b": Verdict.FAIL, "c": Verdict.PASS, "d": Verdict.PASS}) + cur = _cell({"a": Verdict.FAIL, "b": Verdict.PASS, "c": Verdict.PASS, "e": Verdict.PASS}) + d = diff(cur, base) + assert d["a"] == "regressed" + assert d["b"] == "fixed" + assert d["c"] == "same" + assert d["e"] == "new" + assert d["d"] == "dropped" + + +def test_diff_changed_non_fail(): + base = _cell({"a": Verdict.PASS}) + cur = _cell({"a": Verdict.PASS_SLOW}) + assert diff(cur, base)["a"] == "changed" + + +def test_score_delta_bands(): + assert score_delta(90, 88) == "no-change" + assert score_delta(95, 88) == "better" + assert score_delta(80, 88) == "worse" diff --git a/tests/lib/test_cli_resilience.py b/tests/lib/test_cli_resilience.py new file 
mode 100644 index 0000000..9668e4f --- /dev/null +++ b/tests/lib/test_cli_resilience.py @@ -0,0 +1,41 @@ +import json +from pathlib import Path +import pytest +import evals.cli as cli_mod + + +class BoomAdapter: + platform = "boom" + + def cli_signals(self, s): + return [] + + def invocation_signals(self, s): + return [] + + def parse_stream(self, raw): + from evals.lib.models import ParsedRun + return ParsedRun() + + def detect_trigger(self, run, s): + return False + + def launch(self, *a, **k): + raise FileNotFoundError("agent: command not found") + + +def test_main_survives_launch_crash(monkeypatch, tmp_path): + # Point results at a temp dir and force the boom adapter + a tiny prompt set. + monkeypatch.setattr(cli_mod, "get_adapter", lambda p: BoomAdapter()) + monkeypatch.setattr(cli_mod, "RESULTS_ROOT", tmp_path) + monkeypatch.setattr("sys.argv", ["evals", "--harness", "claude-code", "--skill", "hawkscan"]) + with pytest.raises(SystemExit): # FP/FN cause sys.exit(1) — that's fine + cli_mod.main() + # The cell + summary were still written despite every launch crashing: + out = tmp_path / "claude-code" / "results" / "hawkscan" + assert (out / "cell.json").exists() + assert (out / "summary.json").exists() + cell = json.loads((out / "cell.json").read_text()) + assert len(cell["results"]) == 20 # all hawkscan prompts graded + # positive prompts failed with a harness note; at least one note mentions the crash + assert any("command not found" in r.get("note", "") for r in cell["results"]) diff --git a/tests/lib/test_compare.py b/tests/lib/test_compare.py new file mode 100644 index 0000000..4adb5cf --- /dev/null +++ b/tests/lib/test_compare.py @@ -0,0 +1,65 @@ +# tests/lib/test_compare.py +from evals.lib.models import ParsedRun, Verdict +from evals.lib import compare as compare_mod + + +# A realistic skill-loaded hawkscan run: preflight + step1 discovery + config +# validation + synchronous scan, with output mentioning the app is reachable. 
+# This satisfies hawkscan's blocking process-checks, the way a real run would. +_WITH_SKILL = ParsedRun( + bash_commands=[ + "hawk version", + "hawk config --help", + "hawkop app list", + "hawkop env list", + "hawk init", + "hawk validate config stackhawk.yml", + "hawk scan --env Development", + ], + output_text="The application was running and reachable on localhost:8080.", + cost_usd=0.05, +) +_WITHOUT_SKILL = ParsedRun(bash_commands=["echo idk"], cost_usd=0.02) + + +class StubAdapter: + platform = "stub" + def cli_signals(self, skill): return ["hawk scan"] + def invocation_signals(self, skill): return [] + def parse_stream(self, raw): return ParsedRun() + def detect_trigger(self, run, skill): + return any("hawk scan" in c for c in run.bash_commands) + def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill, + max_budget, bare, full_auto): + return _WITH_SKILL if load_skill else _WITHOUT_SKILL + + +def test_compare_shows_lift(monkeypatch): + monkeypatch.setattr(compare_mod, "get_adapter", lambda p: StubAdapter()) + rows = compare_mod.compare_skill("hawkscan", "stub", only_id="hw-01") + row = rows[0] + assert row["without_verdict"] == Verdict.FAIL # no skill -> blocking checks fail + assert row["with_verdict"] in (Verdict.PASS, Verdict.PASS_SLOW) # skill -> workflow satisfied + assert row["with_cost"] == 0.05 and row["without_cost"] == 0.02 + + +def test_compare_skill_returns_lift_effect(monkeypatch): + from evals.lib.models import ParsedRun, Verdict + from evals.lib import compare as compare_mod + + class Stub: + platform = "stub" + def cli_signals(self, s): return ["hawk scan"] + def invocation_signals(self, s): return [] + def parse_stream(self, raw): return ParsedRun() + def detect_trigger(self, run, s): return any("hawk scan" in c for c in run.bash_commands) + def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill, + max_budget, bare, full_auto): + return (ParsedRun(bash_commands=["hawk version","hawk config --help", + 
"hawkop app list","hawkop env list","hawk init", + "hawk validate config stackhawk.yml","hawk scan"], + output_text="reachable on localhost:8080") if load_skill + else ParsedRun(bash_commands=["echo idk"])) + monkeypatch.setattr(compare_mod, "get_adapter", lambda p: Stub()) + rows = compare_mod.compare_skill("hawkscan", "stub", only_id="hw-01") + assert rows[0]["effect"] == "lift" diff --git a/tests/lib/test_config.py b/tests/lib/test_config.py new file mode 100644 index 0000000..8f64c2e --- /dev/null +++ b/tests/lib/test_config.py @@ -0,0 +1,82 @@ +# tests/lib/test_config.py +import json +import textwrap +import pytest +from pydantic import ValidationError +from evals.lib.config import load_skill, SkillConfig + + +def _write_skill(tmp_path, prompts_yaml: str, checks: dict): + skill_dir = tmp_path / "demo" + skill_dir.mkdir() + (skill_dir / "prompts.yaml").write_text(prompts_yaml) + (skill_dir / "process-checks.json").write_text(json.dumps(checks)) + return skill_dir + + +def test_load_skill_parses_prompts_and_checks(tmp_path): + yaml_text = textwrap.dedent(""" + - id: d-01 + should_trigger: true + invocation_type: explicit + prompt: do the thing + budget: + bash_commands: 5 + expected: + - signal: "hawk scan" + """) + checks = {"skill": "demo", "checks": [ + {"id": "c1", "type": "command_executed", "signals": ["hawk scan"], + "severity": "blocking"}]} + skill_dir = _write_skill(tmp_path, yaml_text, checks) + + cfg = load_skill("demo", base_dir=skill_dir.parent) + assert isinstance(cfg, SkillConfig) + assert cfg.skill == "demo" + assert len(cfg.prompts) == 1 + assert cfg.prompts[0].budget.bash_commands == 5 + assert cfg.checks[0]["id"] == "c1" + + +def test_load_skill_rejects_bad_prompt_field(tmp_path): + yaml_text = textwrap.dedent(""" + - id: d-01 + should_trigger: true + invocation_type: explicit + prompt: x + budget_usd: 0.1 + """) + skill_dir = _write_skill(tmp_path, yaml_text, {"skill": "demo", "checks": []}) + with pytest.raises(ValidationError): + 
load_skill("demo", base_dir=skill_dir.parent) + + +def test_load_skill_rejects_duplicate_ids(tmp_path): + yaml_text = textwrap.dedent(""" + - id: dup + should_trigger: true + invocation_type: explicit + prompt: a + - id: dup + should_trigger: false + invocation_type: negative + prompt: b + """) + skill_dir = _write_skill(tmp_path, yaml_text, {"skill": "demo", "checks": []}) + with pytest.raises(ValueError, match="duplicate prompt id"): + load_skill("demo", base_dir=skill_dir.parent) + + +def test_load_skill_rejects_applies_to_unknown_prompt(tmp_path): + yaml_text = textwrap.dedent(""" + - id: d-01 + should_trigger: true + invocation_type: explicit + prompt: x + """) + checks = {"skill": "demo", "checks": [ + {"id": "c1", "type": "command_executed", "signals": ["x"], + "severity": "warning", "applies_to": ["nope"]}]} + skill_dir = _write_skill(tmp_path, yaml_text, checks) + with pytest.raises(ValueError, match="applies_to references unknown prompt"): + load_skill("demo", base_dir=skill_dir.parent) diff --git a/tests/lib/test_grading.py b/tests/lib/test_grading.py new file mode 100644 index 0000000..ce34aaf --- /dev/null +++ b/tests/lib/test_grading.py @@ -0,0 +1,243 @@ +# tests/lib/test_grading.py +from evals.lib.models import ParsedRun, PromptConfig, BudgetSpec, ExpectedCheck, Verdict +from evals.lib.grading import ( + applicable_checks, run_process_checks, run_adhoc_expected, check_budget, grade, +) + + +def _prompt(**kw): + base = dict(id="d-01", should_trigger=True, invocation_type="explicit", prompt="x") + base.update(kw) + return PromptConfig(**base) + + +def test_applicable_checks_global_and_scoped(): + checks = [ + {"id": "global", "type": "command_executed", "signals": ["a"], "severity": "warning"}, + {"id": "scoped", "type": "command_executed", "signals": ["b"], "severity": "warning", + "applies_to": ["d-02"]}, + ] + assert {c["id"] for c in applicable_checks(checks, "d-01")} == {"global"} + assert {c["id"] for c in applicable_checks(checks, "d-02")} == 
{"global", "scoped"} + + +def test_process_check_signal_hit(): + run = ParsedRun(bash_commands=["hawk scan --env test"]) + checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"], + "severity": "blocking"}] + res = run_process_checks(run, checks) + assert res[0].passed is True + assert res[0].signal_found == "hawk scan" + + +def test_process_check_anti_pattern_negative_type(): + run = ParsedRun(bash_commands=["curl https://api/v1/scan"]) + checks = [{"id": "c1", "type": "command_negative", "anti_patterns": ["curl"], + "severity": "warning"}] + res = run_process_checks(run, checks) + assert res[0].passed is False + assert res[0].anti_found == "curl" + + +def test_adhoc_expected_signal_and_anti(): + run = ParsedRun(bash_commands=["hawk validate"], output_text="done") + expected = [ExpectedCheck(signal="hawk validate"), + ExpectedCheck(anti_pattern="rm -rf")] + res = run_adhoc_expected(run, expected) + assert all(r.passed for r in res) + + +def test_adhoc_expected_missing_signal_is_blocking_fail(): + run = ParsedRun(bash_commands=["hawk scan"]) + res = run_adhoc_expected(run, [ExpectedCheck(signal="hawk validate")]) + assert res[0].passed is False + assert res[0].severity == "blocking" + + +def test_check_budget_detects_breaches(): + run = ParsedRun(bash_commands=["a", "b", "c"], cost_usd=0.30, output_tokens=9000) + budget = BudgetSpec(cost_usd=0.15, bash_commands=2, output_tokens=5000) + breaches = check_budget(run, budget) + assert any("cost_usd" in b for b in breaches) + assert any("bash_commands" in b for b in breaches) + assert any("output_tokens" in b for b in breaches) + + +def test_check_budget_ignores_unset_axes(): + run = ParsedRun(bash_commands=["a", "b", "c"]) + assert check_budget(run, BudgetSpec(cost_usd=1.0)) == [] + + +def test_grade_pass(): + run = ParsedRun(bash_commands=["hawk scan"]) + checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"], + "severity": "blocking"}] + result = grade(_prompt(), run, checks, 
platform="claude-code", skill="demo", + did_trigger=True) + assert result.verdict == Verdict.PASS + assert result.score == 100 + + +def test_grade_fail_on_blocking(): + run = ParsedRun(bash_commands=["echo nope"]) + checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"], + "severity": "blocking"}] + result = grade(_prompt(), run, checks, platform="claude-code", skill="demo", + did_trigger=True) + assert result.verdict == Verdict.FAIL + + +def test_grade_pass_slow_on_budget_breach(): + run = ParsedRun(bash_commands=["hawk scan", "a", "b", "c"]) + checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"], + "severity": "blocking"}] + p = _prompt(budget=BudgetSpec(bash_commands=2)) + result = grade(p, run, checks, platform="claude-code", skill="demo", + did_trigger=True) + assert result.verdict == Verdict.PASS_SLOW + assert any("bash_commands" in b for b in result.budget_breaches) + + +def test_process_check_conditional_command_enforced_when_keyword_present(): + run = ParsedRun(bash_commands=["cat stackhawk.yml: authentication: enabled"], + output_text="hawk validate ran") + checks = [{"id": "c1", "type": "conditional_command", + "condition": "stackhawk.yml contains 'authentication:'", + "signals": ["hawk validate"], "severity": "warning"}] + assert run_process_checks(run, checks)[0].passed is True + + +def test_process_check_conditional_command_skipped_when_keyword_absent(): + run = ParsedRun(bash_commands=["echo nothing relevant"]) + checks = [{"id": "c1", "type": "conditional_command", + "condition": "stackhawk.yml contains 'authentication:'", + "signals": ["hawk validate"], "severity": "warning"}] + # keyword not in haystack -> check is not applicable -> passes + assert run_process_checks(run, checks)[0].passed is True + + +def test_process_check_conditional_command_raises_without_quoted_keyword(): + import pytest + run = ParsedRun(bash_commands=["x"]) + checks = [{"id": "c1", "type": "conditional_command", + "condition": "no 
quotes here", "signals": ["x"], "severity": "warning"}] + with pytest.raises(ValueError, match="single-quoted keyword"): + run_process_checks(run, checks) + + +def test_process_check_command_preference_normal(): + run = ParsedRun(bash_commands=["hawkop scan get 123"]) + checks = [{"id": "c1", "type": "command_preference", + "preferred": ["hawkop scan get"], "anti_patterns": ["curl"], + "severity": "warning"}] + assert run_process_checks(run, checks)[0].passed is True + + +def test_process_check_command_preference_empty_is_unconstrained(): + run = ParsedRun(bash_commands=["anything"]) + checks = [{"id": "c1", "type": "command_preference", "preferred": [], + "anti_patterns": ["curl"], "severity": "warning"}] + assert run_process_checks(run, checks)[0].passed is True + + +def test_process_check_file_absent(): + run = ParsedRun(files_written=["stackhawk.yml"]) + present = [{"id": "c1", "type": "file_absent", "target_file": "stackhawk.yml", + "severity": "warning"}] + absent = [{"id": "c2", "type": "file_absent", "target_file": "secrets.env", + "severity": "warning"}] + assert run_process_checks(run, present)[0].passed is False + assert run_process_checks(run, absent)[0].passed is True + + +def test_adhoc_expected_check_id_is_skipped(): + run = ParsedRun(bash_commands=["x"]) + assert run_adhoc_expected(run, [ExpectedCheck(check_id="step1")]) == [] + + +def test_score_deductions(): + from evals.lib.grading import _score + from evals.lib.models import ProcessCheckResult + def pc(passed, sev): return ProcessCheckResult(id="x", passed=passed, severity=sev) + assert _score([pc(True, "blocking")]) == 100 + assert _score([pc(False, "blocking")]) == 85 + assert _score([pc(False, "warning")]) == 95 + assert _score([pc(False, "blocking"), pc(False, "warning")]) == 80 + assert _score([pc(False, "blocking")] * 8) == 0 # floored + + +def test_grade_correct_negative_passes_without_process_checks(): + # should_trigger=False, did_trigger=False -> correct -> PASS, no process checks run 
+ run = ParsedRun(bash_commands=["echo not relevant"]) + checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"], + "severity": "blocking"}] + p = _prompt(should_trigger=False) + res = grade(p, run, checks, platform="claude-code", skill="demo", did_trigger=False) + assert res.verdict == Verdict.PASS + assert res.trigger_correct is True + assert res.process_checks == [] + assert res.score == 100 + + +def test_grade_false_negative_fails(): + # should_trigger=True but did_trigger=False -> incorrect -> FAIL, no process checks + run = ParsedRun(bash_commands=["echo nothing"]) + checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"], + "severity": "blocking"}] + p = _prompt(should_trigger=True) + res = grade(p, run, checks, platform="claude-code", skill="demo", did_trigger=False) + assert res.verdict == Verdict.FAIL + assert res.trigger_correct is False + assert res.process_checks == [] + + +def test_grade_false_positive_fails_without_process_checks(): + # should_trigger=False but did_trigger=True -> incorrect -> FAIL, no process checks + run = ParsedRun(bash_commands=["hawk scan"]) + checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"], + "severity": "blocking"}] + p = _prompt(should_trigger=False) + res = grade(p, run, checks, platform="claude-code", skill="demo", did_trigger=True) + assert res.verdict == Verdict.FAIL + assert res.trigger_correct is False + assert res.process_checks == [] + + +def test_grade_propagates_harness_error_to_note(): + from evals.lib.models import ParsedRun, Verdict + from evals.lib.grading import grade + p = _prompt(should_trigger=True) # _prompt helper already in this file + run = ParsedRun(returncode=1, stderr_tail="agent: command not found", error="exit 1: agent: command not found") + res = grade(p, run, [], platform="cursor", skill="hawkscan", did_trigger=False) + assert res.verdict == Verdict.FAIL # didn't trigger + assert "command not found" in res.note # harness error 
surfaced + + +def test_file_absent_or_unchanged_passes_when_not_written(): + checks = [{"id": "no_yml", "type": "file_absent_or_unchanged", + "target_file": "stackhawk.yml", "severity": "blocking"}] + assert run_process_checks(ParsedRun(output_text="done"), checks)[0].passed is True + # ...and fails when the file IS written + bad = ParsedRun(output_text="done", files_written=["stackhawk.yml"]) + assert run_process_checks(bad, checks)[0].passed is False + + +def test_file_absent_with_anti_pattern_paths(): + checks = [{"id": "no_legacy", "type": "file_absent", + "anti_patterns": ["bootstrap/manifest.yaml"], "severity": "blocking"}] + assert run_process_checks(ParsedRun(output_text="x"), checks)[0].passed is True + bad = ParsedRun(files_written=["bootstrap/manifest.yaml"]) + assert run_process_checks(bad, checks)[0].passed is False + + +def test_file_present_via_write_or_narration(): + checks = [{"id": "emit", "type": "file_present", + "signals": ["data-seed/manifest.yaml"], "severity": "blocking"}] + # written for real (execution mode) + assert run_process_checks( + ParsedRun(files_written=["data-seed/manifest.yaml"]), checks)[0].passed is True + # only narrated (observe mode) + assert run_process_checks( + ParsedRun(output_text="I'll write data-seed/manifest.yaml"), checks)[0].passed is True + # neither -> fail + assert run_process_checks(ParsedRun(output_text="nope"), checks)[0].passed is False diff --git a/tests/lib/test_harness.py b/tests/lib/test_harness.py new file mode 100644 index 0000000..4689abb --- /dev/null +++ b/tests/lib/test_harness.py @@ -0,0 +1,31 @@ +# tests/lib/test_harness.py +import json +from evals.lib.harness import get_adapter +from evals.lib.models import ParsedRun + +CC = get_adapter("claude-code") + + +def test_parse_stream_extracts_bash_and_text(): + lines = [ + json.dumps({"type": "assistant", "message": {"content": [ + {"type": "tool_use", "name": "Bash", "input": {"command": "hawk scan"}}, + {"type": "text", "text": "scanning now"}, + 
]}}), + json.dumps({"type": "result", "result": "done", "cost_usd": 0.04}), + ] + run = CC.parse_stream("\n".join(lines)) + assert isinstance(run, ParsedRun) + assert run.bash_commands == ["hawk scan"] + assert "scanning now" in run.output_text + assert run.cost_usd == 0.04 + + +def test_detect_trigger_via_cli_signal(): + run = ParsedRun(bash_commands=["hawk scan --env test"]) + assert CC.detect_trigger(run, "hawkscan") is True + + +def test_detect_trigger_negative(): + run = ParsedRun(bash_commands=["echo hello"], output_text="nothing relevant") + assert CC.detect_trigger(run, "hawkscan") is False diff --git a/tests/lib/test_models.py b/tests/lib/test_models.py new file mode 100644 index 0000000..ff84e20 --- /dev/null +++ b/tests/lib/test_models.py @@ -0,0 +1,94 @@ +# tests/lib/test_models.py +import pytest +from pydantic import ValidationError +from evals.lib.models import ( + BudgetSpec, ExpectedCheck, PromptConfig, ParsedRun, Verdict, +) + + +def test_prompt_config_minimal(): + p = PromptConfig(id="hw-01", should_trigger=True, + invocation_type="explicit", prompt="scan it") + assert p.budget is None + assert p.expected == [] + assert p.notes == "" + + +def test_prompt_config_rejects_unknown_field(): + with pytest.raises(ValidationError): + PromptConfig(id="hw-01", should_trigger=True, + invocation_type="explicit", prompt="x", budget_usd=0.1) + + +def test_budget_spec_rejects_unknown_axis(): + with pytest.raises(ValidationError): + BudgetSpec(cost_dollars=0.1) + + +def test_expected_check_requires_exactly_one(): + ExpectedCheck(signal="hawk scan") # ok + ExpectedCheck(check_id="step1") # ok + ExpectedCheck(anti_pattern="curl") # ok + with pytest.raises(ValidationError): + ExpectedCheck() # none set + with pytest.raises(ValidationError): + ExpectedCheck(signal="a", anti_pattern="b") # two set + + +def test_invocation_type_is_constrained(): + with pytest.raises(ValidationError): + PromptConfig(id="x", should_trigger=True, + invocation_type="bogus", prompt="x") + + 
+def test_verdict_values(): + assert Verdict.PASS == "pass" + assert Verdict.PASS_SLOW == "pass-slow" + assert Verdict.FAIL == "fail" + + +def test_parsed_run_defaults(): + r = ParsedRun() + assert r.bash_commands == [] + assert r.cost_usd == 0.0 + assert r.output_tokens is None + + +def test_cellreport_roundtrips(): + from evals.lib.models import CellReport, EvalResult, Verdict + r = EvalResult(platform="codex", skill="hawkscan", run_id="hw-01", + should_trigger=True, did_trigger=True, trigger_correct=True, + verdict=Verdict.PASS, score=100) + cell = CellReport(platform="codex", skill="hawkscan", model="haiku", + commit="abc1234", results=[r]) + again = CellReport.model_validate_json(cell.model_dump_json()) + assert again.results[0].run_id == "hw-01" + assert again.model == "haiku" + + +def test_cellreport_rejects_unknown_field(): + import pytest + from pydantic import ValidationError + from evals.lib.models import CellReport + with pytest.raises(ValidationError): + CellReport(platform="x", skill="y", model="m", commit="c", results=[], extra=1) + + +def test_parsedrun_has_diagnostic_fields(): + from evals.lib.models import ParsedRun + r = ParsedRun() + assert r.returncode is None + assert r.stderr_tail == "" + r2 = ParsedRun(returncode=1, stderr_tail="boom") + assert r2.returncode == 1 and r2.stderr_tail == "boom" + + +def test_evalresult_has_note_field(): + from evals.lib.models import EvalResult, Verdict + e = EvalResult(platform="p", skill="s", run_id="r", should_trigger=True, + did_trigger=True, trigger_correct=True, verdict=Verdict.PASS, score=100) + assert e.note == "" + e2 = EvalResult(platform="p", skill="s", run_id="r", should_trigger=True, + did_trigger=False, trigger_correct=False, verdict=Verdict.FAIL, + score=0, note="harness error: agent: command not found") + assert "command not found" in e2.note diff --git a/tests/lib/test_replay.py b/tests/lib/test_replay.py new file mode 100644 index 0000000..a69f3a7 --- /dev/null +++ b/tests/lib/test_replay.py 
@@ -0,0 +1,20 @@ +# tests/lib/test_replay.py +from pathlib import Path +from evals.lib.replay import regrade +from evals.lib.models import Verdict + +FIXTURE = Path(__file__).parent.parent / "fixtures" / "hw-07.trace.jsonl" + + +def test_regrade_from_trace_passes(): + result = regrade(FIXTURE, skill="hawkscan", platform="claude-code") + assert result.did_trigger is True + assert result.verdict in (Verdict.PASS, Verdict.PASS_SLOW) + assert result.run_id == "hw-07" + + +def test_regrade_is_deterministic(): + a = regrade(FIXTURE, skill="hawkscan", platform="claude-code") + b = regrade(FIXTURE, skill="hawkscan", platform="claude-code") + assert a.verdict == b.verdict + assert a.score == b.score diff --git a/tests/lib/test_reporting.py b/tests/lib/test_reporting.py new file mode 100644 index 0000000..54707d2 --- /dev/null +++ b/tests/lib/test_reporting.py @@ -0,0 +1,20 @@ +# tests/lib/test_reporting.py +from evals.lib.models import EvalResult, Verdict +from evals.lib.reporting import build_summary + + +def _r(run_id, verdict, trigger_ok=True, should=True, did=True): + return EvalResult(platform="claude-code", skill="hawkscan", run_id=run_id, + should_trigger=should, did_trigger=did, trigger_correct=trigger_ok, + verdict=verdict, score=100 if verdict != Verdict.FAIL else 40) + + +def test_build_summary_counts(): + results = [_r("hw-01", Verdict.PASS), _r("hw-02", Verdict.PASS_SLOW), + _r("hw-03", Verdict.FAIL), + _r("hw-13", Verdict.PASS, trigger_ok=False, should=False, did=True)] + s = build_summary("hawkscan", "claude-code", results) + assert s["trigger_accuracy"]["correct"] == 3 + assert s["trigger_accuracy"]["total"] == 4 + assert s["false_positives"] == ["hw-13"] + assert s["verdict_counts"] == {"pass": 2, "pass-slow": 1, "fail": 1} diff --git a/tests/lib/test_reporting_render.py b/tests/lib/test_reporting_render.py new file mode 100644 index 0000000..f2e27c6 --- /dev/null +++ b/tests/lib/test_reporting_render.py @@ -0,0 +1,119 @@ +from evals.lib.models import 
CellReport, EvalResult, Verdict +from evals.lib.reporting import badge, render_job_summary + + +def _cell(*results): + return CellReport(platform="claude-code", skill="hawkscan", model="haiku", + commit="abc1234", results=list(results)) + + +def _r(rid, verdict, trig=True, should=True, did=True, why=""): + return EvalResult(platform="claude-code", skill="hawkscan", run_id=rid, + should_trigger=should, did_trigger=did, trigger_correct=trig, + verdict=verdict, score=100 if verdict != Verdict.FAIL else 40, + budget_breaches=[why] if (why and verdict == Verdict.PASS_SLOW) else []) + + +def test_badge_is_shields_image(): + md = badge("fail", "FAIL") + assert md.startswith("![") and "img.shields.io/badge/" in md + + +def test_job_summary_has_counts_and_all_rows_failures_first(): + cell = _cell(_r("hw-01", Verdict.PASS), _r("hw-02", Verdict.PASS), + _r("hw-14", Verdict.FAIL, trig=False, should=False, did=True)) + md = render_job_summary(cell) + assert "claude-code" in md and "hawkscan" in md and "haiku" in md + assert "1 failed" in md.lower() or "❌ 1" in md + for rid in ("hw-01", "hw-02", "hw-14"): + assert rid in md + # failing row appears before the first passing row + assert md.index("hw-14") < md.index("hw-01") + + +def test_write_github_summary_appends(tmp_path, monkeypatch): + from evals.lib.reporting import write_github_summary + f = tmp_path / "summary.md" + monkeypatch.setenv("GITHUB_STEP_SUMMARY", str(f)) + write_github_summary("## hello\n") + assert "## hello" in f.read_text() + + +def test_write_github_summary_noop_when_unset(monkeypatch): + from evals.lib.reporting import write_github_summary + monkeypatch.delenv("GITHUB_STEP_SUMMARY", raising=False) + write_github_summary("nothing") # must not raise + + +def test_digest_shows_regression_vs_baseline(): + from evals.lib.models import CellReport, EvalResult, Verdict + from evals.lib.reporting import render_digest + + def cell(v): + r = EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01", + 
should_trigger=True, did_trigger=True, trigger_correct=True, + verdict=v, score=100 if v != Verdict.FAIL else 0) + return CellReport(platform="claude-code", skill="hawkscan", model="haiku", + commit="c", results=[r]) + cur = cell(Verdict.FAIL) + base = {("claude-code", "hawkscan", "haiku"): cell(Verdict.PASS)} + md = render_digest([cur], baselines=base) + assert "regressed" in md.lower() + assert "no baseline" not in md.lower() + + +def test_render_digest_overview_and_per_cell(): + from pathlib import Path + from evals.lib.models import CellReport + from evals.lib.reporting import render_digest + root = Path(__file__).parent.parent / "fixtures" / "results" + cells = [CellReport.model_validate_json((p / "cell.json").read_text()) + for p in sorted(root.iterdir()) if (p / "cell.json").exists()] + md = render_digest(cells) + assert "Skill Eval" in md + assert "claude-code" in md and "codex" in md + assert "hw-14" in md # failing test surfaced + assert "no baseline" in md.lower() # no baseline supplied + + +def test_digest_overview_shows_score_delta_vs_baseline(): + from evals.lib.models import CellReport, EvalResult, Verdict + from evals.lib.reporting import render_digest + + def cell(score): + r = EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01", + should_trigger=True, did_trigger=True, trigger_correct=True, + verdict=Verdict.PASS, score=score) + return CellReport(platform="claude-code", skill="hawkscan", model="haiku", + commit="c", results=[r]) + cur = cell(70) + base = {("claude-code", "hawkscan", "haiku"): cell(90)} + md = render_digest([cur], baselines=base) + assert "worse" in md.lower() # 70 vs 90 -> worse + + +def test_job_summary_shows_note(): + from evals.lib.models import CellReport, EvalResult, Verdict + from evals.lib.reporting import render_job_summary + r = EvalResult(platform="cursor", skill="hawkscan", run_id="hw-01", + should_trigger=True, did_trigger=False, trigger_correct=False, + verdict=Verdict.FAIL, score=0, note="harness 
error: agent not found") + cell = CellReport(platform="cursor", skill="hawkscan", model="default", + commit="c", results=[r]) + md = render_job_summary(cell) + assert "agent not found" in md + + +def test_digest_renders_lift_section(): + from evals.lib.models import CellReport, EvalResult, Verdict + from evals.lib.reporting import render_digest + r = EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01", + should_trigger=True, did_trigger=True, trigger_correct=True, + verdict=Verdict.PASS, score=100) + cell = CellReport(platform="claude-code", skill="hawkscan", model="haiku", + commit="c", results=[r]) + lift = {("claude-code", "hawkscan", "haiku"): [ + {"id": "hw-01", "without_verdict": "fail", "with_verdict": "pass", "effect": "lift"}]} + md = render_digest([cell], lift=lift) + assert "lift" in md.lower() and "hw-01" in md + assert "1/1" in md or "1 of 1" in md.lower() diff --git a/tests/lib/test_rubric.py b/tests/lib/test_rubric.py new file mode 100644 index 0000000..b80d016 --- /dev/null +++ b/tests/lib/test_rubric.py @@ -0,0 +1,37 @@ +from pathlib import Path + +from evals.lib.models import EvalResult, Verdict, RubricResult +from evals.lib.reporting import _pivot_cell +from evals.lib.rubric import grade_rubric +from evals.lib.models import ParsedRun + + +def _res(rubric=None, verdict=Verdict.PASS): + return EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01", + should_trigger=True, did_trigger=True, trigger_correct=True, + verdict=verdict, score=100, rubric=rubric) + + +def test_rubric_tag_pass(): + cell = _pivot_cell(_res(RubricResult(overall_pass=True, score=85))) + assert cell == "✅ r85✓" + + +def test_rubric_tag_fail_shows_score(): + cell = _pivot_cell(_res(RubricResult(overall_pass=False, score=55))) + assert "r55✗" in cell and cell.startswith("✅") # deterministic pass, rubric flags quality + + +def test_no_rubric_tag_when_absent(): + assert _pivot_cell(_res(None)) == "✅" + + +def test_rubric_error_renders_question_mark(): + 
cell = _pivot_cell(_res(RubricResult(overall_pass=False, score=0, error="grader failed"))) + assert "r?" in cell + + +def test_grade_rubric_none_when_config_missing(tmp_path: Path): + # no rubric-items.json / rubric-schema.json under base_dir -> None (not an error) + assert grade_rubric(ParsedRun(output_text="x"), "hawkscan", "hw-01", + base_dir=tmp_path) is None diff --git a/tests/lib/test_rubric_extract.py b/tests/lib/test_rubric_extract.py new file mode 100644 index 0000000..d9a5002 --- /dev/null +++ b/tests/lib/test_rubric_extract.py @@ -0,0 +1,30 @@ +"""Tests for the tolerant JSON extractor in the rubric grader — the grader reply +often wraps the object in prose or a ```json fence even under --json-schema.""" +import pytest +from evals.lib.rubric import _extract_json_object + + +def test_pure_json(): + assert _extract_json_object('{"score": 85, "overall_pass": true}')["score"] == 85 + + +def test_fenced_json(): + txt = "Here is the result:\n\n```json\n{\"score\": 70, \"overall_pass\": false}\n```" + assert _extract_json_object(txt)["score"] == 70 + + +def test_prose_prefixed_json(): + # The exact failure mode reproduced locally. + txt = 'No skills needed.\n\n```json\n{\n "score": 85,\n "overall_pass": true,\n "checks": []\n}\n```' + out = _extract_json_object(txt) + assert out["score"] == 85 and out["overall_pass"] is True + + +def test_bare_object_in_prose_no_fence(): + txt = 'The verdict is {"score": 60, "overall_pass": false, "checks": []} per the rubric.' + assert _extract_json_object(txt)["score"] == 60 + + +def test_no_json_raises(): + with pytest.raises(ValueError): + _extract_json_object("there is no json here") diff --git a/tests/lib/test_triggers.py b/tests/lib/test_triggers.py new file mode 100644 index 0000000..4e90bf8 --- /dev/null +++ b/tests/lib/test_triggers.py @@ -0,0 +1,84 @@ +"""Tests for the shared trigger-decision helpers (evals/lib/triggers.py). 
+ +The motivating bug: hw-17 ("Don't run a security scan after this change — just +commit it") is a negative. The agent correctly declared `hawkscan:hawkscan: NO`, +but detect_trigger substring-matched the loose phrase "security scan after" from +the agent quoting the user's instruction, producing a false positive. +""" +from evals.lib.triggers import explicit_decision, decide_trigger + + +def test_explicit_yes_recognized(): + assert explicit_decision("hawkscan:hawkscan: YES — run a scan", "hawkscan") == "yes" + assert explicit_decision("**stackhawk-api:api: YES**", "api") == "yes" + assert explicit_decision("`stackhawk-data-seed:stackhawk-data-seed: YES`", + "stackhawk-data-seed") == "yes" + + +def test_explicit_no_recognized(): + assert explicit_decision("hawkscan:hawkscan: NO — user declined", "hawkscan") == "no" + assert explicit_decision("Decision: none: NO", "hawkscan") == "no" + assert explicit_decision("stackhawk-api:api — NO", "api") == "no" + + +def test_dash_and_emphasis_separators(): + assert explicit_decision("**hawkscan:hawkscan — YES**", "hawkscan") == "yes" + assert explicit_decision("hawkscan:hawkscan - NO", "hawkscan") == "no" + + +def test_no_decision_line_returns_none(): + assert explicit_decision("I'll run a security scan for you.", "hawkscan") is None + + +def test_hyphenated_skill_name_not_corrupted(): + # `stackhawk-api` must not be mis-split on its internal hyphen. + assert explicit_decision("stackhawk-api:api: NO", "api") == "no" + assert explicit_decision("stackhawk-data-seed:stackhawk-data-seed: NO", + "stackhawk-data-seed") == "no" + + +def test_hw17_false_positive_suppressed(): + # The exact failure mode: explicit decline + a loose phrase the agent quoted. 
+ text = ("**hawkscan:hawkscan: NO** — User explicitly requested " + '"Don\'t run a security scan after this change"\n\n**Decision: none: NO**') + declared = explicit_decision(text, "hawkscan") + assert declared == "no" + # Even though a loose behavioral phrase matched, the explicit decline wins. + assert decide_trigger(executed_cli=False, declared=declared, loose_hit=True) is False + + +def test_real_execution_overrides_declared_no(): + # If the agent actually ran the CLI, it triggered regardless of what it said. + assert decide_trigger(executed_cli=True, declared="no", loose_hit=False) is True + + +def test_loose_fallback_when_no_decision(): + assert decide_trigger(executed_cli=False, declared=None, loose_hit=True) is True + assert decide_trigger(executed_cli=False, declared=None, loose_hit=False) is False + + +def test_explicit_yes_triggers_without_loose(): + assert decide_trigger(executed_cli=False, declared="yes", loose_hit=False) is True + + +def test_does_not_apply_is_decline(): + assert explicit_decision("`hawkscan:hawkscan` does not apply here", "hawkscan") == "no" + assert explicit_decision("the api skill is not needed: stackhawk-api:api not applicable", "api") == "no" + + +def test_choosing_a_different_skill_declines_this_one(): + # hw-13: agent picks api, says hawkscan doesn't apply — must not be a hawkscan trigger. + txt = "`stackhawk-api:api: YES`\n(`hawkscan:hawkscan` does not apply — you asked for findings.)" + assert explicit_decision(txt, "hawkscan") == "no" + assert explicit_decision(txt, "api") == "yes" + + +def test_other_skill_yes_alone_declines(): + assert explicit_decision("hawkscan:hawkscan: YES", "api") == "no" + assert explicit_decision("hawkscan:hawkscan: YES", "stackhawk-data-seed") == "no" + + +def test_own_yes_not_suppressed_by_other(): + # Both declared yes — this skill is still yes. 
+ txt = "stackhawk-api:api: YES and hawkscan:hawkscan: YES" + assert explicit_decision(txt, "hawkscan") == "yes" diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..851950e --- /dev/null +++ b/uv.lock @@ -0,0 +1,325 @@ +version = 1 +revision = 2 +requires-python = ">=3.11" + +[[package]] +name = "agent-skills-evals" +version = "0.1.0" +source = { editable = "." } +dependencies = [ + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "rich" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [ + { name = "pydantic", specifier = ">=2.6" }, + { name = "pyyaml", specifier = ">=6.0" }, + { name = "rich", specifier = ">=13.0" }, +] + +[package.metadata.requires-dev] +dev = [{ name = "pytest", specifier = ">=8.0" }] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "markdown-it-py" +version = "4.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/ff/7841249c247aa650a76b9ee4bbaeae59370dc8bfd2f6c01f3630c35eb134/markdown_it_py-4.2.0.tar.gz", hash = "sha256:04a21681d6fbb623de53f6f364d352309d4094dd4194040a10fd51833e418d49", size = 82454, upload-time = "2026-05-07T12:08:28.36Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/81/4da04ced5a082363ecfa159c010d200ecbd959ae410c10c0264a38cac0f5/markdown_it_py-4.2.0-py3-none-any.whl", hash = "sha256:9f7ebbcd14fe59494226453aed97c1070d83f8d24b6fc3a3bcf9a38092641c4a", size = 91687, upload-time = "2026-05-07T12:08:27.182Z" }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", 
hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + +[[package]] +name = "packaging" +version = "26.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/f1/e7a6dd94a8d4a5626c03e4e99c87f241ba9e350cd9e6d75123f992427270/packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661", size = 228134, upload-time = "2026-04-24T20:15:23.917Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "pydantic" +version = "2.13.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name 
= "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/18/a5/b60d21ac674192f8ab0ba4e9fd860690f9b4a6e51ca5df118733b487d8d6/pydantic-2.13.4.tar.gz", hash = "sha256:c40756b57adaa8b1efeeced5c196f3f3b7c435f90e84ea7f443901bec8099ef6", size = 844775, upload-time = "2026-05-06T13:43:05.343Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/7b/122376b1fd3c62c1ed9dc80c931ace4844b3c55407b6fb2d199377c9736f/pydantic-2.13.4-py3-none-any.whl", hash = "sha256:45a282cde31d808236fd7ea9d919b128653c8b38b393d1c4ab335c62924d9aba", size = 472262, upload-time = "2026-05-06T13:43:02.641Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.46.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9d/56/921726b776ace8d8f5db44c4ef961006580d91dc52b803c489fafd1aa249/pydantic_core-2.46.4.tar.gz", hash = "sha256:62f875393d7f270851f20523dd2e29f082bcc82292d66db2b64ea71f64b6e1c1", size = 471464, upload-time = "2026-05-06T13:37:06.98Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/fa/6d7708d2cfc1a832acb6aeb0cd16e801902df8a0f583bb3b4b527fde022e/pydantic_core-2.46.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:0e96592440881c74a213e5ad528e2b24d3d4f940de2766bed9010ab1d9e51594", size = 2111872, upload-time = "2026-05-06T13:40:27.596Z" }, + { url = "https://files.pythonhosted.org/packages/ae/6f/aa064a3e74b5745afbdf250594f38e7ead05e2d651bcb35994b9417a0d4d/pydantic_core-2.46.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0d65b8c354be7fb5f720c3caa8bc940bc2d20ce749c8e06135f07f8ed95dd7c", size = 1948255, upload-time = "2026-05-06T13:39:12.574Z" }, + { url = "https://files.pythonhosted.org/packages/43/3a/41114a9f7569b84b4d84e7a018c57c56347dac30c0d4a872946ec4e36c46/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:7bfb192b3f4b9e8a89b6277b6ce787564f62cfd272055f6e685726b111dc7826", size = 1972827, upload-time = "2026-05-06T13:38:19.841Z" }, + { url = "https://files.pythonhosted.org/packages/ef/25/1ab42e8048fe551934d9884e8d64daa7e990ad386f310a15981aeb6a5b08/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9037063db01f09b09e237c282b6792bd4da634b5402c4e7f0c61effed7701a04", size = 2041051, upload-time = "2026-05-06T13:38:10.447Z" }, + { url = "https://files.pythonhosted.org/packages/94/c2/1a934597ddf08da410385b3b7aae91956a5a76c635effef456074fad7e88/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fc010ab034c8c7452522748bf937df58020d256ccae0874463d1f4d01758af8e", size = 2221314, upload-time = "2026-05-06T13:40:13.089Z" }, + { url = "https://files.pythonhosted.org/packages/02/6d/9e8ad178c9c4df27ad3c8f25d1fe2a7ab0d2ba0559fad4aee5d3d1f16771/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c5dac79fa1614d1e06ca695109c6105923bd9c7d1d6c918d4e637b7e6b32fd3", size = 2285146, upload-time = "2026-05-06T13:38:59.224Z" }, + { url = "https://files.pythonhosted.org/packages/80/50/540cd3aeefc041beb111125c4bff779831a2111fc6b15a9138cda277d32c/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9fa868638bf362d3d138ea55829cefb3d5f4b0d7f142234382a15e2485dbec4", size = 2089685, upload-time = "2026-05-06T13:38:17.762Z" }, + { url = "https://files.pythonhosted.org/packages/6b/a4/b440ad35f05f6a38f89fa0f149accb3f0e02be94ca5e15f3c449a61b4bc9/pydantic_core-2.46.4-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:17299feefe090f2caa5b8e37222bb5f663e4935a8bfa6931d4102e5df1a9f398", size = 2115420, upload-time = "2026-05-06T13:37:58.195Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/61/de4f55db8dfd57bfdfa9a12ec90fe1b57c4f41062f7ca86f08586b3e0ac0/pydantic_core-2.46.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4c63ebc82684aa89d9a3bcbd13d515b3be44250dc68dd3bd81526c1cb31286c3", size = 2165122, upload-time = "2026-05-06T13:37:01.167Z" }, + { url = "https://files.pythonhosted.org/packages/f7/52/7c529d7bdb2d1068bd52f51fe32572c8301f9a4febf1948f10639f1436f5/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:aaa2a54443eff1950ba5ddc6b6ccda0d9c84a364276a62f969bdf2a390650848", size = 2182573, upload-time = "2026-05-06T13:38:45.04Z" }, + { url = "https://files.pythonhosted.org/packages/37/b3/7c40325848ba78247f2812dcf9c7274e38cd801820ca6dd9fe63bcfb0eb4/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:18e5ceec2ab67e6d5f1a9085e5a24c9c4e2ac4545730bfe668680bca05e555f3", size = 2317139, upload-time = "2026-05-06T13:37:15.539Z" }, + { url = "https://files.pythonhosted.org/packages/d9/37/f913f81a657c865b75da6c0dbed79876073c2a43b5bd9edbe8da785e4d49/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a0f62d0a58f4e7da165457e995725421e0064f2255d8eccebc49f41bbc23b109", size = 2360433, upload-time = "2026-05-06T13:37:30.099Z" }, + { url = "https://files.pythonhosted.org/packages/c4/67/6acaa1be2567f9256b056d8477158cac7240813956ce86e49deae8e173b4/pydantic_core-2.46.4-cp311-cp311-win32.whl", hash = "sha256:041bde0a48fd37cf71cab1c9d56d3e8625a3793fef1f7dd232b3ff37e978ecda", size = 1985513, upload-time = "2026-05-06T13:38:15.669Z" }, + { url = "https://files.pythonhosted.org/packages/aa/e6/c505f83dfeda9a2e5c995cfd872949e4d05e12f7feb3dca72f633daefa94/pydantic_core-2.46.4-cp311-cp311-win_amd64.whl", hash = "sha256:6f2eeda33a839975441c86a4119e1383c50b47faf0cbb5176985565c6bb02c33", size = 2071114, upload-time = "2026-05-06T13:40:35.416Z" }, + { url = 
"https://files.pythonhosted.org/packages/0f/da/7a263a96d965d9d0df5e8de8a475f33495451117035b09acb110288c381f/pydantic_core-2.46.4-cp311-cp311-win_arm64.whl", hash = "sha256:14f4c5d6db102bd796a627bbb3a17b4cf4574b9ae861d8b7c9a9661c6dd3362d", size = 2044298, upload-time = "2026-05-06T13:38:29.754Z" }, + { url = "https://files.pythonhosted.org/packages/ce/8c/af022f0af448d7747c5154288d46b5f2bc5f17366eaa0e23e9aa04d59f3b/pydantic_core-2.46.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3245406455a5d98187ec35530fd772b1d799b26667980872c8d4614991e2c4a2", size = 2106158, upload-time = "2026-05-06T13:38:57.215Z" }, + { url = "https://files.pythonhosted.org/packages/19/95/6195171e385007300f0f5574592e467c568becce2d937a0b6804f218bc49/pydantic_core-2.46.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:962ccbab7b642487b1d8b7df90ef677e03134cf1fd8880bf698649b22a69371f", size = 1951724, upload-time = "2026-05-06T13:37:02.697Z" }, + { url = "https://files.pythonhosted.org/packages/8e/bc/f47d1ff9cbb1620e1b5b697eef06010035735f07820180e74178226b27b3/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8233f2947cf85404441fd7e0085f53b10c93e0ee78611099b5c7237e36aacbf7", size = 1975742, upload-time = "2026-05-06T13:37:09.448Z" }, + { url = "https://files.pythonhosted.org/packages/5b/11/9b9a5b0306345664a2da6410877af6e8082481b5884b3ddd78d47c6013ce/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a233125ac121aa3ffba9a2b59edfc4a985a76092dc8279586ab4b71390875e7", size = 2052418, upload-time = "2026-05-06T13:37:38.234Z" }, + { url = "https://files.pythonhosted.org/packages/f1/b7/a65fec226f5d78fc39f4a13c4cc0c768c22b113438f60c14adc9d2865038/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b712b53160b79a5850310b912a5ef8e57e56947c8ad690c227f5c9d7e561712", size = 2232274, upload-time = "2026-05-06T13:38:27.753Z" }, + { url = 
"https://files.pythonhosted.org/packages/68/f0/92039db98b907ef49269a8271f67db9cb78ae2fc68062ef7e4e77adb5f61/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9401557acd873c3a7f3eb9383edef8ac4968f9510e340f4808d427e75667e7b4", size = 2309940, upload-time = "2026-05-06T13:38:05.353Z" }, + { url = "https://files.pythonhosted.org/packages/5f/97/2aab507d3d00ca626e8e57c1eac6a79e4e5fbcc63eb99733ff55d1717f65/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:926c9541b14b12b1681dca8a0b75feb510b06c6341b70a8e500c2fdcff837cce", size = 2094516, upload-time = "2026-05-06T13:39:10.577Z" }, + { url = "https://files.pythonhosted.org/packages/22/37/a8aca44d40d737dde2bc05b3c6c07dff0de07ce6f82e9f3167aeaf4d5dea/pydantic_core-2.46.4-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:56cb4851bcaf3d117eddcef4fe66afd750a50274b0da8e22be256d10e5611987", size = 2136854, upload-time = "2026-05-06T13:40:22.59Z" }, + { url = "https://files.pythonhosted.org/packages/24/99/fcef1b79238c06a8cbec70819ac722ba76e02bc8ada9b0fd66eba40da01b/pydantic_core-2.46.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c68fcd102d71ea85c5b2dfac3f4f8476eff42a9e078fd5faefff6d145063536b", size = 2180306, upload-time = "2026-05-06T13:40:10.666Z" }, + { url = "https://files.pythonhosted.org/packages/ae/6c/fc44000918855b42779d007ae63b0532794739027b2f417321cddbc44f6a/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b2f69dec1725e79a012d920df1707de5caf7ed5e08f3be4435e25803efc47458", size = 2190044, upload-time = "2026-05-06T13:40:43.231Z" }, + { url = "https://files.pythonhosted.org/packages/6b/65/d9cadc9f1920d7a127ad2edba16c1db7916e59719285cd6c94600b0080ba/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:8d0820e8192167f80d88d64038e609c31452eeca865b4e1d9950a27a4609b00b", size = 2329133, upload-time = "2026-05-06T13:39:57.365Z" }, + { url = 
"https://files.pythonhosted.org/packages/d0/cf/c873d91679f3a30bcf5e7ac280ce5573483e72295307685120d0d5ad3416/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fbdb89b3e1c94a30cc5edfce477c6e6a5dc4d8f84665b455c27582f211a1c72c", size = 2374464, upload-time = "2026-05-06T13:38:06.976Z" }, + { url = "https://files.pythonhosted.org/packages/47/bd/6f2fc8188f31bf10590f1e98e7b306336161fac930a8c514cd7bd828c7dc/pydantic_core-2.46.4-cp312-cp312-win32.whl", hash = "sha256:9aa768456404a8bf48a4406685ac2bec8e72b62c69313734fa3b73cf33b3a894", size = 1974823, upload-time = "2026-05-06T13:40:47.985Z" }, + { url = "https://files.pythonhosted.org/packages/40/8c/985c1d41ea1107c2534abd9870e4ed5c8e7669b5c308297835c001e7a1c4/pydantic_core-2.46.4-cp312-cp312-win_amd64.whl", hash = "sha256:e9c26f834c65f5752f3f06cb08cb86a913ceb7274d0db6e267808a708b46bc89", size = 2072919, upload-time = "2026-05-06T13:39:21.153Z" }, + { url = "https://files.pythonhosted.org/packages/c4/ba/f463d006e0c47373ca7ec5e1a261c59dc01ef4d62b2657af925fb0deee3a/pydantic_core-2.46.4-cp312-cp312-win_arm64.whl", hash = "sha256:4fc73cb559bdb54b1134a706a2802a4cddd27a0633f5abb7e53056268751ac6a", size = 2027604, upload-time = "2026-05-06T13:39:03.753Z" }, + { url = "https://files.pythonhosted.org/packages/51/a2/5d30b469c5267a17b39dec53208222f76a8d351dfac4af661888c5aee77d/pydantic_core-2.46.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:5d5902252db0d3cedf8d4a1bc68f70eeb430f7e4c7104c8c476753519b423008", size = 2106306, upload-time = "2026-05-06T13:37:48.029Z" }, + { url = "https://files.pythonhosted.org/packages/c1/81/4fa520eaffa8bd7d1525e644cd6d39e7d60b1592bc5b516693c7340b50f1/pydantic_core-2.46.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c94f0688e7b8d0a67abf40e57a7eaaecd17cc9586706a31b76c031f63df052b4", size = 1951906, upload-time = "2026-05-06T13:37:17.012Z" }, + { url = 
"https://files.pythonhosted.org/packages/03/d5/fd02da45b659668b05923b17ba3a0100a0a3d5541e3bd8fcc4ecb711309e/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f027324c56cd5406ca49c124b0db10e56c69064fec039acc571c29020cc87c76", size = 1976802, upload-time = "2026-05-06T13:37:35.113Z" }, + { url = "https://files.pythonhosted.org/packages/21/f2/95727e1368be3d3ed485eaab7adbd7dda408f33f7a36e8b48e0144002b91/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e739fee756ba1010f8bcccb534252e85a35fe45ae92c295a06059ce58b74ccd3", size = 2052446, upload-time = "2026-05-06T13:37:12.313Z" }, + { url = "https://files.pythonhosted.org/packages/9c/86/5d99feea3f77c7234b8718075b23db11532773c1a0dbd9b9490215dc2eeb/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d56801be94b86a9da183e5f3766e6310752b99ff647e38b09a9500d88e46e76", size = 2232757, upload-time = "2026-05-06T13:39:01.149Z" }, + { url = "https://files.pythonhosted.org/packages/d2/3a/508ac615935ef7588cf6d9e9b91309fdc2da751af865e02a9098de88258c/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2412e734dcb48da14d4e4006b82b46b74f2518b8a26ee7e58c6844a6cd6d03c4", size = 2309275, upload-time = "2026-05-06T13:37:41.406Z" }, + { url = "https://files.pythonhosted.org/packages/07/f8/41db9de19d7987d6b04715a02b3b40aea467000275d9d758ffaa31af7d50/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9551187363ffc0de2a00b2e47c25aeaeb1020b69b668762966df15fc5659dd5a", size = 2094467, upload-time = "2026-05-06T13:39:18.847Z" }, + { url = "https://files.pythonhosted.org/packages/2c/e2/f35033184cb11d0052daf4416e8e10a502ea2ac006fc4f459aee872727d1/pydantic_core-2.46.4-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0186750b482eefa11d7f435892b09c5c606193ef3375bcf94aa00ae6bfb66262", size = 2134417, upload-time = 
"2026-05-06T13:40:17.944Z" }, + { url = "https://files.pythonhosted.org/packages/7e/7b/6ceeb1cc90e193862f444ebe373d8fdf613f0a82572dde03fb10734c6c71/pydantic_core-2.46.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5855698a4856556d86e8e6cd8434bc3ac0314ee8e12089ae0e143f64c6256e4e", size = 2179782, upload-time = "2026-05-06T13:40:32.618Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f2/c8d7773ede6af08036423a00ae0ceffce266c3c52a096c435d68c896083f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:cbaf13819775b7f769bf4a1f066cb6df7a28d4480081a589828ef190226881cd", size = 2188782, upload-time = "2026-05-06T13:36:51.018Z" }, + { url = "https://files.pythonhosted.org/packages/59/31/0c864784e31f09f05cdd87606f08923b9c9e7f6e51dd27f20f62f975ce9f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:633147d34cf4550417f12e2b1a0383973bdf5cdfde212cb09e9a581cf10820be", size = 2328334, upload-time = "2026-05-06T13:40:37.764Z" }, + { url = "https://files.pythonhosted.org/packages/c2/eb/4f6c8a41efa30baa755590f4141abf3a8c370fab610915733e74134a7270/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:82cf5301172168103724d49a1444d3378cb20cdee30b116a1bd6031236298a5d", size = 2372986, upload-time = "2026-05-06T13:39:34.152Z" }, + { url = "https://files.pythonhosted.org/packages/5b/24/b375a480d53113860c299764bfe9f349a3dc9108b3adc0d7f0d786492ebf/pydantic_core-2.46.4-cp313-cp313-win32.whl", hash = "sha256:9fa8ae11da9e2b3126c6426f147e0fba88d96d65921799bb30c6abd1cb2c97fb", size = 1973693, upload-time = "2026-05-06T13:37:55.072Z" }, + { url = "https://files.pythonhosted.org/packages/7e/e8/cff247591966f2d22ec8c003cd7587e27b7ba7b81ab2fb888e3ab75dc285/pydantic_core-2.46.4-cp313-cp313-win_amd64.whl", hash = "sha256:6b3ace8194b0e5204818c92802dcdca7fc6d88aabbb799d7c795540d9cd6d292", size = 2071819, upload-time = "2026-05-06T13:38:49.139Z" }, + { url = 
"https://files.pythonhosted.org/packages/c6/1a/f4aee670d5670e9e148e0c82c7db98d780be566c6e6a97ee8035528ca0b3/pydantic_core-2.46.4-cp313-cp313-win_arm64.whl", hash = "sha256:184c081504d17f1c1066e430e117142b2c77d9448a97f7b65c6ac9fd9aee238d", size = 2027411, upload-time = "2026-05-06T13:40:45.796Z" }, + { url = "https://files.pythonhosted.org/packages/8d/74/228a26ddad29c6672b805d9fd78e8d251cd04004fa7eed0e622096cd0250/pydantic_core-2.46.4-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:428e04521a40150c85216fc8b85e8d39fece235a9cf5e383761238c7fa9b96fb", size = 2102079, upload-time = "2026-05-06T13:38:41.019Z" }, + { url = "https://files.pythonhosted.org/packages/ad/1f/8970b150a4b4365623ae00fc88603491f763c627311ae8031e3111356d6e/pydantic_core-2.46.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:23ace664830ee0bfe014a0c7bc248b1f7f25ed7ad103852c317624a1083af462", size = 1952179, upload-time = "2026-05-06T13:36:59.812Z" }, + { url = "https://files.pythonhosted.org/packages/95/30/5211a831ae054928054b2f79731661087a2bc5c01e825c672b3a4a8f1b3e/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce5c1d2a8b27468f433ca974829c44060b8097eedc39933e3c206a90ee49c4a9", size = 1978926, upload-time = "2026-05-06T13:37:39.933Z" }, + { url = "https://files.pythonhosted.org/packages/57/e9/689668733b1eb67adeef047db3c2e8788fcf65a7fd9c9e2b46b7744fe245/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7283d57845ecf5a163403eb0702dfc220cc4fbdd18919cb5ccea4f95ee1cdab4", size = 2046785, upload-time = "2026-05-06T13:38:01.995Z" }, + { url = "https://files.pythonhosted.org/packages/60/d9/6715260422ff50a2109878fd24d948a6c3446bb2664f34ee78cd972b3acd/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8daafc69c93ee8a0204506a3b6b30f586ef54028f52aeeeb5c4cfc5184fd5914", size = 2228733, upload-time = "2026-05-06T13:40:50.371Z" }, + { url = 
"https://files.pythonhosted.org/packages/18/ae/fdb2f64316afca925640f8e70bb1a564b0ec2721c1389e25b8eb4bf9a299/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd2213145bcc2ba85884d0ac63d222fece9209678f77b9b4d76f054c561adb28", size = 2307534, upload-time = "2026-05-06T13:37:21.531Z" }, + { url = "https://files.pythonhosted.org/packages/89/1d/8eff589b45bb8190a9d12c49cfad0f176a5cbd1534908a6b5125e2886239/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a5f930472650a82629163023e630d160863fce524c616f4e5186e5de9d9a49b", size = 2099732, upload-time = "2026-05-06T13:39:31.942Z" }, + { url = "https://files.pythonhosted.org/packages/06/d5/ee5a3366637fee41dee51a1fc91562dcf12ddbc68fda34e6b253da2324bb/pydantic_core-2.46.4-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:c1b3f518abeca3aa13c712fd202306e145abf59a18b094a6bafb2d2bbf59192c", size = 2129627, upload-time = "2026-05-06T13:37:25.033Z" }, + { url = "https://files.pythonhosted.org/packages/94/33/2414be571d2c6a6c4d08be21f9292b6d3fdb08949a97b6dfe985017821db/pydantic_core-2.46.4-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1a7dd0b3ee80d90150e3495a3a13ac34dbcbfd4f012996a6a1d8900e91b5c0fb", size = 2179141, upload-time = "2026-05-06T13:37:14.046Z" }, + { url = "https://files.pythonhosted.org/packages/7b/79/7daa95be995be0eecc4cf75064cb33f9bbbfe3fe0158caf2f0d4a996a5c7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:3fb702cd90b0446a3a1c5e470bfa0dd23c0233b676a9099ddcc964fa6ca13898", size = 2184325, upload-time = "2026-05-06T13:36:53.615Z" }, + { url = "https://files.pythonhosted.org/packages/9f/cb/d0a382f5c0de8a222dc61c65348e0ce831b1f68e0a018450d31c2cace3a5/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:b8458003118a712e66286df6a707db01c52c0f52f7db8e4a38f0da1d3b94fc4e", size = 2323990, upload-time = "2026-05-06T13:40:29.971Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/db/d9ba624cc4a5aced1598e88c04fdbd8310c8a69b9d38b9a3d39ce3a61ed7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:372429a130e469c9cd698925ce5fc50940b7a1336b0d82038e63d5bbc4edc519", size = 2369978, upload-time = "2026-05-06T13:37:23.027Z" }, + { url = "https://files.pythonhosted.org/packages/f2/20/d15df15ba918c423461905802bfd2981c3af0bfa0e40d05e13edbfa48bc3/pydantic_core-2.46.4-cp314-cp314-win32.whl", hash = "sha256:85bb3611ff1802f3ee7fdd7dbff26b56f343fb432d57a4728fdd49b6ef35e2f4", size = 1966354, upload-time = "2026-05-06T13:38:03.499Z" }, + { url = "https://files.pythonhosted.org/packages/fc/b6/6b8de4c0a7d7ab3004c439c80c5c1e0a3e8d78bbae19379b01960383d9e5/pydantic_core-2.46.4-cp314-cp314-win_amd64.whl", hash = "sha256:811ff8e9c313ab425368bcbb36e5c4ebd7108c2bbf4e4089cfbb0b01eff63fac", size = 2072238, upload-time = "2026-05-06T13:39:40.807Z" }, + { url = "https://files.pythonhosted.org/packages/32/36/51eb763beec1f4cf59b1db243a7dcc39cbb41230f050a09b9d69faaf0a48/pydantic_core-2.46.4-cp314-cp314-win_arm64.whl", hash = "sha256:bfec22eab3c8cc2ceec0248aec886624116dc079afa027ecc8ad4a7e62010f8a", size = 2018251, upload-time = "2026-05-06T13:37:26.72Z" }, + { url = "https://files.pythonhosted.org/packages/e8/91/855af51d625b23aa987116a19e231d2aaef9c4a415273ddc189b79a45fee/pydantic_core-2.46.4-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:af8244b2bef6aaad6d92cda81372de7f8c8d36c9f0c3ea36e827c60e7d9467a0", size = 2099593, upload-time = "2026-05-06T13:39:47.682Z" }, + { url = "https://files.pythonhosted.org/packages/fb/1b/8784a54c65edb5f49f0a14d6977cf1b209bba85a4c77445b255c2de58ab3/pydantic_core-2.46.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a4330cdbc57162e4b3aa303f588ba752257694c9c9be3e7ebb11b4aca659b5d", size = 1935226, upload-time = "2026-05-06T13:40:40.428Z" }, + { url = 
"https://files.pythonhosted.org/packages/e8/e7/1955d28d1afc56dd4b3ad7cc0cf39df1b9852964cf16e5d13912756d6d6b/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29c61fc04a3d840155ff08e475a04809278972fe6aef51e2720554e96367e34b", size = 1974605, upload-time = "2026-05-06T13:37:32.029Z" }, + { url = "https://files.pythonhosted.org/packages/93/e2/3fedbf0ba7a22850e6e9fd78117f1c0f10f950182344d8a6c535d468fdd8/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c50f2528cf200c5eed56faf3f4e22fcd5f38c157a8b78576e6ba3168ec35f000", size = 2030777, upload-time = "2026-05-06T13:38:55.239Z" }, + { url = "https://files.pythonhosted.org/packages/f8/61/46be275fcaaba0b4f5b9669dd852267ce1ff616592dccf7a7845588df091/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0cbe8b01f948de4286c74cdd6c667aceb38f5c1e26f0693b3983d9d74887c65e", size = 2236641, upload-time = "2026-05-06T13:37:08.096Z" }, + { url = "https://files.pythonhosted.org/packages/60/db/12e93e46a8bac9988be3c016860f83293daea8c716c029c9ace279036f2f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:617d7e2ca7dcb8c5cf6bcb8c59b8832c94b36196bbf1cbd1bfb56ed341905edd", size = 2286404, upload-time = "2026-05-06T13:40:20.221Z" }, + { url = "https://files.pythonhosted.org/packages/e2/4a/4d8b19008f38d31c53b8219cfedc2e3d5de5fe99d90076b7e767de29274f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7027560ee92211647d0d34e3f7cd6f50da56399d26a9c8ad0da286d3869a53f3", size = 2109219, upload-time = "2026-05-06T13:38:12.153Z" }, + { url = "https://files.pythonhosted.org/packages/88/70/3cbc40978fefb7bb09c6708d40d4ad1a5d70fd7213c3d17f971de868ec1f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:f99626688942fb746e545232e7726926f3be91b5975f8b55327665fafda991c7", size = 2110594, upload-time = 
"2026-05-06T13:40:02.971Z" }, + { url = "https://files.pythonhosted.org/packages/9d/20/b8d36736216e29491125531685b2f9e61aa5b4b2599893f8268551da3338/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fc3e9034a63de20e15e8ade85358bc6efc614008cab72898b4b4952bea0509ff", size = 2159542, upload-time = "2026-05-06T13:39:27.506Z" }, + { url = "https://files.pythonhosted.org/packages/1d/a2/367df868eb584dacf6bf82a389272406d7178e301c4ac82545ab98bc2dd9/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:97e7cf2be5c77b7d1a9713a05605d49460d02c6078d38d8bef3cbe323c548424", size = 2168146, upload-time = "2026-05-06T13:38:31.93Z" }, + { url = "https://files.pythonhosted.org/packages/c1/b8/4460f77f7e201893f649a29ab355dddd3beee8a97bcb1a320db414f9a06e/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:3bf92c5d0e00fefaab325a4d27828fe6b6e2a21848686b5b60d2d9eeb09d76c6", size = 2306309, upload-time = "2026-05-06T13:37:44.717Z" }, + { url = "https://files.pythonhosted.org/packages/64/c4/be2639293acd87dc8ddbcec41a73cee9b2ebf996fe6d892a1a74e88ad3f7/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:3ecbc122d18468d06ca279dc26a8c2e2d5acb10943bb35e36ae92096dc3b5565", size = 2369736, upload-time = "2026-05-06T13:37:05.645Z" }, + { url = "https://files.pythonhosted.org/packages/30/a6/9f9f380dbb301f67023bf8f707aaa75daadf84f7152d95c410fd7e81d994/pydantic_core-2.46.4-cp314-cp314t-win32.whl", hash = "sha256:e846ae7835bf0703ae43f534ab79a867146dadd59dc9ca5c8b53d5c8f7c9ef02", size = 1955575, upload-time = "2026-05-06T13:38:51.116Z" }, + { url = "https://files.pythonhosted.org/packages/40/1f/f1eb9eb350e795d1af8586289746f5c5677d16043040d63710e22abc43c9/pydantic_core-2.46.4-cp314-cp314t-win_amd64.whl", hash = "sha256:2108ba5c1c1eca18030634489dc544844144ee36357f2f9f780b93e7ddbb44b5", size = 2051624, upload-time = "2026-05-06T13:38:21.672Z" }, + { url = 
"https://files.pythonhosted.org/packages/f6/d2/42dd53d0a85c27606f316d3aa5d2869c4e8470a5ed6dec30e4a1abe19192/pydantic_core-2.46.4-cp314-cp314t-win_arm64.whl", hash = "sha256:4fcbe087dbc2068af7eda3aa87634eba216dbda64d1ae73c8684b621d33f6596", size = 2017325, upload-time = "2026-05-06T13:40:52.723Z" }, + { url = "https://files.pythonhosted.org/packages/ee/a4/73995fd4ebbb46ba0ee51e6fa049b8f02c40daebb762208feda8a6b7894d/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:14d4edf427bdcf950a8a02d7cb44a08614388dd6e1bdcbf4f67504fa7887da9c", size = 2111589, upload-time = "2026-05-06T13:37:10.817Z" }, + { url = "https://files.pythonhosted.org/packages/fb/7f/f37d3a5e8bfcc2e403f5c57a730f2d815693fb42119e8ea48b3789335af1/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:0ce40cd7b21210e99342afafbd4d0f76d784eb5b1d60f3bdc566be4983c6c73b", size = 1944552, upload-time = "2026-05-06T13:36:56.717Z" }, + { url = "https://files.pythonhosted.org/packages/15/3c/d7eb777b3ff43e8433a4efb39a17aa8fd98a4ee8561a24a67ef5db07b2d6/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90884113d8b48f760e9587002789ddd741e76ab9f89518cd1e43b1f1a52ec44b", size = 1982984, upload-time = "2026-05-06T13:39:06.207Z" }, + { url = "https://files.pythonhosted.org/packages/63/87/70b9f40170a81afd55ca26c9b2acb25c20d64bcfbf888fafecb3ba077d4c/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66ce7632c22d837c95301830e111ad0128a32b8207533b60896a96c4915192ea", size = 2138417, upload-time = "2026-05-06T13:39:45.476Z" }, + { url = "https://files.pythonhosted.org/packages/9d/1d/8987ad40f65ae1432753072f214fb5c74fe47ffbd0698bb9cbbb585664f8/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:1d8ba486450b14f3b1d63bc521d410ec7565e52f887b9fb671791886436a42f7", size = 
2095527, upload-time = "2026-05-06T13:39:52.283Z" }, + { url = "https://files.pythonhosted.org/packages/64/d3/84c282a7eee1d3ac4c0377546ef5a1ea436ce26840d9ac3b7ed54a377507/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:3009f12e4e90b7f88b4f9adb1b0c4a3d58fe7820f3238c190047209d148026df", size = 1936024, upload-time = "2026-05-06T13:40:15.671Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ca/eac61596cdeb4d7e174d3dc0bd8a6238f14f75f97a24e7b7db4c7e7340a0/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad785e92e6dc634c21555edc8bd6b64957ab844541bcb96a1366c202951ae526", size = 1990696, upload-time = "2026-05-06T13:38:34.717Z" }, + { url = "https://files.pythonhosted.org/packages/fa/c3/7c8b240552251faf6b3a957db200fcfbbcec36763c050428b601e0c9b83b/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00c603d540afdd6b80eb39f078f33ebd46211f02f33e34a32d9f053bba711de0", size = 2147590, upload-time = "2026-05-06T13:39:29.883Z" }, + { url = "https://files.pythonhosted.org/packages/11/cb/428de0385b6c8d44b716feba566abfacfbd23ee3c4439faa789a1456242f/pydantic_core-2.46.4-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:0c563b08bca408dc7f65f700633d8442fffb2421fc47b8101377e9fd65051ff0", size = 2112782, upload-time = "2026-05-06T13:37:04.016Z" }, + { url = "https://files.pythonhosted.org/packages/0b/b5/6a17bdadd0fc1f170adfd05a20d37c832f52b117b4d9131da1f41bb097ce/pydantic_core-2.46.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:db06ffe51636ffe9ca531fe9023dd64bdd794be8754cb5df57c5498ae5b518a7", size = 1952146, upload-time = "2026-05-06T13:39:43.092Z" }, + { url = "https://files.pythonhosted.org/packages/2a/dc/03734d80e362cd43ef65428e9de77c730ce7f2f11c60d2b1e1b39f0fbf99/pydantic_core-2.46.4-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:133878133d271ade3d41d1bfb2a45ec38dbdbda40bc065921c6b04e4630127e2", size = 2134492, upload-time = "2026-05-06T13:36:58.124Z" }, + { url = "https://files.pythonhosted.org/packages/de/df/5e5ffc085ed07cc22d298134d3d911c63e91f6a0eb91fe646750a3209910/pydantic_core-2.46.4-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9bc519fbf2b7578398853d815009ae5e4d4603d12f4e3f91da8c06852d3da3e9", size = 2156604, upload-time = "2026-05-06T13:37:49.88Z" }, + { url = "https://files.pythonhosted.org/packages/81/44/6e112a4253e56f5705467cbab7ab5e91ee7398ba3d56d358635958893d3e/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c7a7bd4e39e8e4c12c39cd480356842b6a8a06e41b23a55a5e3e191718838ddf", size = 2183828, upload-time = "2026-05-06T13:37:43.053Z" }, + { url = "https://files.pythonhosted.org/packages/ac/ad/5565071e937d8e752842ac241463944c9eb14c87e2d269f2658a5bd05e98/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:d396ec2b979760aaf3218e76c24e65bd0aca24983298653b3a9d7a45f9e47b30", size = 2310000, upload-time = "2026-05-06T13:37:56.694Z" }, + { url = "https://files.pythonhosted.org/packages/4f/c3/66883a5cec183e7fba4d024b4cbbe61851a63750ef606b0afecc46d1f2bf/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:86e1a4418c6cd97d60c95c71164158eaf7324fae7b0923264016baa993eba6fc", size = 2361286, upload-time = "2026-05-06T13:40:05.667Z" }, + { url = "https://files.pythonhosted.org/packages/4b/2d/69abac8f838090bbecd5df894befb2c2619e7996a98ddb949db9f3b93225/pydantic_core-2.46.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:d51026d73fcfd93610abc7b27789c26b313920fcfb20e27462d74a7f8b06e983", size = 2193071, upload-time = "2026-05-06T13:38:08.682Z" }, +] + +[[package]] +name = "pygments" +version = "2.20.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" }, + { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" }, + { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" }, + { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" }, + { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" }, + { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" }, + { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" }, + { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" }, + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = 
"https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = 
"https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, + { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, + { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, + { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, + { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, + { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, + { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, + { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, + { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, + { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = 
"sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = 
"sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] + +[[package]] +name = "rich" +version = "15.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680, upload-time = "2026-04-12T08:24:00.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = 
"sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, +]