diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
new file mode 100644
index 0000000..e9a7d0e
--- /dev/null
+++ b/.github/actionlint.yaml
@@ -0,0 +1,3 @@
+self-hosted-runner:
+  labels:
+    - agent-skills-amd-4cpu
diff --git a/.github/workflows/capture-baseline.yml b/.github/workflows/capture-baseline.yml
new file mode 100644
index 0000000..0f25b26
--- /dev/null
+++ b/.github/workflows/capture-baseline.yml
@@ -0,0 +1,166 @@
+name: Capture Eval Baseline
+on:
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: "Release tag to baseline (e.g. v1.9.0)"
+        required: true
+        type: string
+permissions:
+  contents: read
+
+jobs:
+
+  # ── Claude Code — 3 models ─────────────────────────────────────────────────
+  capture-claude-code:
+    name: baseline / claude-code / ${{ matrix.skill }} / ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        skill: [hawkscan, api]
+        model: [claude-sonnet-4-6, claude-opus-4-7, claude-haiku-4-5-20251001]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.tag }}
+      - uses: astral-sh/setup-uv@v5
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+      - name: Install Claude Code CLI
+        run: npm install -g @anthropic-ai/claude-code
+      - name: Verify claude CLI
+        run: claude --version
+      - name: Run baseline eval (${{ matrix.model }})  # no --bare: baseline must run in the same mode as skill-evals.yml
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        run: |
+          uv run evals --harness claude-code --skill ${{ matrix.skill }} \
+            --model ${{ matrix.model }} --max-budget 0.15 || true
+      - name: Upload baseline artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-claude-code-${{ matrix.skill }}-${{ matrix.model }}
+          path: evals/harnesses/claude-code/results/${{ matrix.skill }}/cell.json
+          retention-days: 90
+
+  # ── Codex — 2 models ──────────────────────────────────────────────────────
+  capture-codex:
+    name: baseline / codex / ${{ matrix.skill }} / ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        skill: [hawkscan, api]
+        model: [gpt-5.5, o3]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.tag }}
+      - uses: astral-sh/setup-uv@v5
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+      - name: Install Codex CLI
+        run: npm install -g @openai/codex
+      - name: Verify codex CLI
+        run: codex --version
+      - name: Authenticate codex and install StackHawk skills (hawkscan + api)
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          printenv OPENAI_API_KEY | codex login --with-api-key  # codex exec reads stored credentials; key goes via stdin, never argv
+          codex plugin marketplace add .
+          for plugin in hawkscan stackhawk-api; do echo y | codex plugin add "${plugin}@stackhawk"; done
+      - name: Run baseline eval (${{ matrix.model }})
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          uv run evals --harness codex --skill ${{ matrix.skill }} --model ${{ matrix.model }} || true
+      - name: Upload baseline artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-codex-${{ matrix.skill }}-${{ matrix.model }}
+          path: evals/harnesses/codex/results/${{ matrix.skill }}/cell.json
+          retention-days: 90
+
+  # ── Antigravity (agy) — default model ─────────────────────────────────────
+  capture-agy:
+    name: baseline / agy / ${{ matrix.skill }} / ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        skill: [hawkscan, api]
+        model: [default]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.tag }}
+      - uses: astral-sh/setup-uv@v5
+      - name: Install agy CLI  # /cli/install.sh is the bootstrapper (/install-cli serves HTML); it drops `agy` in ~/.local/bin
+        run: curl -fsSL https://antigravity.google/cli/install.sh | bash && echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+      - name: Verify agy CLI
+        run: agy --version
+      - name: Install StackHawk plugins
+        env:
+          ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }}  # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY
+        run: |
+          echo y | agy plugin install plugins/hawkscan
+          echo y | agy plugin install plugins/api
+      - name: Run baseline eval
+        env:
+          ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }}  # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY
+        run: |
+          MODEL_ARGS=()
+          if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
+          uv run evals --harness agy --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" || true
+      - name: Upload baseline artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-agy-${{ matrix.skill }}-${{ matrix.model }}
+          path: evals/harnesses/agy/results/${{ matrix.skill }}/cell.json
+          retention-days: 90
+
+  # ── Cursor — default model ─────────────────────────────────────────────────
+  capture-cursor:
+    name: baseline / cursor / ${{ matrix.skill }} / ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        skill: [hawkscan, api]
+        model: [default]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.tag }}
+      - uses: astral-sh/setup-uv@v5
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+      - name: Install Cursor CLI  # official installer; the @cursor/cli and cursor-agent npm packages do not exist
+        run: curl https://cursor.com/install -fsS | bash && echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+        continue-on-error: true  # best-effort; a missing CLI surfaces in the eval step instead
+      - name: Verify agent CLI
+        run: agent --version
+        continue-on-error: true  # best-effort; do not abort the capture if the CLI is unavailable
+      - name: Run baseline eval
+        env:
+          CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
+        run: |
+          MODEL_ARGS=()
+          if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
+          uv run evals --harness cursor --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" || true
+        continue-on-error: true  # best-effort
+      - name: Upload baseline artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-cursor-${{ matrix.skill }}-${{ matrix.model }}
+          path: evals/harnesses/cursor/results/${{ matrix.skill }}/cell.json
+          retention-days: 90
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index aa29ba8..1843daf 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -127,6 +127,24 @@ jobs:
         if: inputs.dry_run == true
         run: echo "DRY RUN complete — all checks passed for ${{ steps.version.outputs.tag }}"
 
+  capture-baseline:
+    name: Trigger baseline capture
+    needs: release
+    if: inputs.dry_run != true
+    runs-on: ubuntu-latest
+    permissions:
+      actions: write
+    steps:
+      - uses: actions/checkout@v4
+      - name: Dispatch capture-baseline
+        # GITHUB_TOKEN can dispatch workflows in the same repo for most orgs.
+        # If org policy blocks it, swap to the TF_GITHUB_TOKEN PAT that
+        # update-marketplace pulls from SSM (aws ssm get-parameter --name TF_GITHUB_TOKEN).
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RELEASE_TAG: ${{ needs.release.outputs.tag }}
+        run: gh workflow run capture-baseline.yml --ref "$RELEASE_TAG" -f tag="$RELEASE_TAG"  # --ref pins the run's headSha to the tag commit, which skill-evals.yml uses to find the baseline run
+
   update-marketplace:
     name: Update marketplace pin
     needs: release
@@ -139,7 +157,7 @@ jobs:
       - name: Resolve cache
         run: |
           biodome ci restore-cache
-          rm -rf *.tar.lz4
+          rm -rf ./*.tar.lz4
 
       - name: Pull secrets
         run: biodome ci save-secrets
@@ -158,7 +176,7 @@ jobs:
           echo "::add-mask::${GH_PAT}"
           git clone https://github.com/stackhawk/agent-skills-marketplace.git /tmp/marketplace
           git -C /tmp/marketplace remote set-url origin \
-            https://x-access-token:${GH_PAT}@github.com/stackhawk/agent-skills-marketplace.git
+            "https://x-access-token:${GH_PAT}@github.com/stackhawk/agent-skills-marketplace.git"
 
       - name: Update marketplace.json
         run: |
diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 5cc3162..e56aaab 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -1,6 +1,9 @@
 name: Skill Evals
 
 on:
+  # Manual, on-demand only — matches origin/main's deliberate design (commit c860e47
+  # "ci: remove pull_request trigger — evals run on workflow_dispatch only"). These
+  # evals drive real agents against tool CLIs and were never an automatic PR gate.
   workflow_dispatch:
     inputs:
       skill:
@@ -10,15 +13,15 @@ on:
         type: choice
         options: [hawkscan, api, both]
       platform:
-        description: "Platform to run (all = claude-code + codex + agy + cursor)"
+        description: "Platform to run"
         required: true
         default: "all"
         type: choice
         options: [all, claude-code, codex, agy, cursor]
       rubric:
-        description: "Run qualitative rubric grader (slower, ~$0.10 extra per run)"
+        description: "Also run the qualitative rubric grader (extra ANTHROPIC_API_KEY cost)"
         required: false
-        default: false
+        default: true
         type: boolean
 
 permissions:
@@ -35,78 +38,141 @@ permissions:
 
 jobs:
 
+  # ── Config validation (no API keys; prerequisite of every eval job) ────────
+  validate-config:
+    name: validate eval config
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - name: Validate prompts.yaml + process-checks.json
+        run: uv run validate
+
+  # ── Unit tests (no API keys; runs on every manual dispatch) ───────────────
+  pytest:
+    name: pytest (lib)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - name: Run lib tests
+        run: uv run pytest -q
+
   # ── Claude Code ──────────────────────────────────────────────────────────
   eval-claude-code:
-    name: claude-code / ${{ matrix.skill }}
+    name: claude-code / ${{ matrix.skill }} / ${{ matrix.model }}
     runs-on: ubuntu-latest
+    needs: validate-config
     if: |
-      github.event_name != 'workflow_dispatch' ||
       inputs.platform == 'all' ||
       inputs.platform == 'claude-code'
     strategy:
       fail-fast: false
       matrix:
-        skill: [hawkscan, api]
+        skill: [hawkscan, api, stackhawk-data-seed]
+        model: [claude-sonnet-4-6, claude-opus-4-7, claude-haiku-4-5-20251001]
 
     steps:
       - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
 
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: "20"
-
-      - name: Install Claude Code CLI
-        run: npm install -g @anthropic-ai/claude-code
+      - name: Install Claude Code CLI (native)
+        run: |
+          curl -fsSL https://claude.ai/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
 
       - name: Verify claude CLI
         run: claude --version
 
-      - name: Run ${{ matrix.skill }} evals
+      # hawk CLI is a Java app; ensure a JDK 17+ is on PATH for it.
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: "17"
+
+      # Install the latest hawk via StackHawk's official action in install-only
+      # mode (no scan). It downloads the CLI and adds it to PATH so the hawkscan
+      # skill can follow its documented CLI path (hawk version/config/validate/
+      # scan). Without hawk the agent improvises and never emits a hawk* signal.
+      - name: Install latest hawk CLI
+        uses: stackhawk/hawkscan-action@v2.5.0
+        with:
+          apiKey: ${{ secrets.HAWK_API_KEY }}
+          version: latest
+          installCLIOnly: true
+        continue-on-error: true  # install-only shouldn't need apiKey; don't let a check abort the job
+
+      - name: Verify hawk CLI
+        run: hawk version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
+      # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup
+      # checks (hawkop app list / env list) and the entire api skill. No official
+      # action exists, so install the native Linux binary straight into
+      # /usr/local/bin (already on PATH). Version + URL per the repo's api skill
+      # reference (plugins/api/skills/api/references/hawkop-shortcuts.md).
+      - name: Install hawkop CLI
+        run: |
+          set -euo pipefail
+          HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)"
+          echo "Installing hawkop v${HAWKOP_VERSION}"
+          curl -fLo /tmp/hawkop.tar.gz \
+            "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
+          tar -xzf /tmp/hawkop.tar.gz -C /tmp
+          sudo mv /tmp/hawkop /usr/local/bin/hawkop
+        continue-on-error: true  # don't abort the job — evals records any absence
+      - name: Verify hawkop CLI
+        run: hawkop --version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
+      # No --bare: --bare is "minimal mode" where skills only resolve via an
+      # explicit /skill-name and do NOT auto-trigger from their description, so
+      # natural-language prompts never fire the skill (all false-negatives).
+      # Full plugin mode is also the realistic user experience (hooks + skill).
+      - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          API_KEY: ${{ secrets.HAWK_API_KEY }}            # hawk reads API_KEY
+          HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }}     # hawkop reads HAWKOP_API_KEY
+          HAWKOP_FORMAT: json
+        run: |
+          uv run evals --harness claude-code --skill ${{ matrix.skill }} \
+            --model ${{ matrix.model }} --max-budget 0.15 ${{ inputs.rubric && '--rubric' || '' }}
+
+      - name: Skill lift (compare with/without)
+        if: github.event_name == 'pull_request'  # NOTE(review): never true while `on:` is workflow_dispatch-only, so this step is currently dead
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
         run: |
-          RUBRIC_FLAG=""
-          if [ "${{ inputs.rubric }}" = "true" ]; then
-            RUBRIC_FLAG="--rubric"
-          fi
-          python3 evals/harnesses/claude-code/run-evals.py \
-            --skill ${{ matrix.skill }} \
-            --bare \
-            --max-budget 0.15 \
-            $RUBRIC_FLAG
+          uv run compare --harness claude-code --skill ${{ matrix.skill }} \
+            --model ${{ matrix.model }} --max-budget 0.15 || true
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: eval-claude-code-${{ matrix.skill }}
+          name: eval-claude-code-${{ matrix.skill }}-${{ matrix.model }}
           path: evals/harnesses/claude-code/results/${{ matrix.skill }}/
           retention-days: 30
 
   # ── Codex ─────────────────────────────────────────────────────────────────
   eval-codex:
-    name: codex / ${{ matrix.skill }}
+    name: codex / ${{ matrix.skill }} / ${{ matrix.model }}
     runs-on: ubuntu-latest
+    needs: validate-config
     if: |
-      github.event_name != 'workflow_dispatch' ||
       inputs.platform == 'all' ||
       inputs.platform == 'codex'
     strategy:
       fail-fast: false
       matrix:
-        skill: [hawkscan, api]
+        skill: [hawkscan, api, stackhawk-data-seed]
+        model: [gpt-5.5, o3]
 
     steps:
       - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
+      - uses: astral-sh/setup-uv@v5
       - uses: actions/setup-node@v4
         with:
           node-version: "20"
@@ -117,6 +183,18 @@ jobs:
       - name: Verify codex CLI
         run: codex --version
 
+      - name: Install Claude Code CLI (native, rubric grader)
+        run: |
+          curl -fsSL https://claude.ai/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      # codex exec reads stored credentials, not OPENAI_API_KEY directly — without
+      # this it 401s ("Missing bearer"). Pipe the key via stdin (never as an arg).
+      - name: Authenticate codex CLI
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: printenv OPENAI_API_KEY | codex login --with-api-key
+
       - name: Install StackHawk skills (hawkscan + api)
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -124,201 +202,310 @@ jobs:
           codex plugin marketplace add .
           echo y | codex plugin add hawkscan@stackhawk
           echo y | codex plugin add stackhawk-api@stackhawk
+          echo y | codex plugin add stackhawk-data-seed@stackhawk
 
-      - name: Run ${{ matrix.skill }} evals
+      # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: "17"
+      - name: Install latest hawk CLI
+        uses: stackhawk/hawkscan-action@v2.5.0
+        with:
+          apiKey: ${{ secrets.HAWK_API_KEY }}
+          version: latest
+          installCLIOnly: true
+        continue-on-error: true  # install-only shouldn't need apiKey; don't let a check abort the job
+      - name: Verify hawk CLI
+        run: hawk version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
+      # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup
+      # checks (hawkop app list / env list) and the entire api skill. No official
+      # action exists, so install the native Linux binary straight into
+      # /usr/local/bin (already on PATH). Version + URL per the repo's api skill
+      # reference (plugins/api/skills/api/references/hawkop-shortcuts.md).
+      - name: Install hawkop CLI
+        run: |
+          set -euo pipefail
+          HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)"
+          echo "Installing hawkop v${HAWKOP_VERSION}"
+          curl -fLo /tmp/hawkop.tar.gz \
+            "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
+          tar -xzf /tmp/hawkop.tar.gz -C /tmp
+          sudo mv /tmp/hawkop /usr/local/bin/hawkop
+        continue-on-error: true  # don't abort the job — evals records any absence
+      - name: Verify hawkop CLI
+        run: hawkop --version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
+      - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # rubric grader (claude)
+          HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_FORMAT: json
         run: |
-          python3 evals/harnesses/codex/run-evals.py \
-            --skill ${{ matrix.skill }}
+          uv run evals --harness codex --skill ${{ matrix.skill }} --model ${{ matrix.model }} ${{ inputs.rubric && '--rubric' || '' }}
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: eval-codex-${{ matrix.skill }}
+          name: eval-codex-${{ matrix.skill }}-${{ matrix.model }}
           path: evals/harnesses/codex/results/${{ matrix.skill }}/
           retention-days: 30
 
   # ── Antigravity (agy) — replaces Gemini ───────────────────────────────────
   eval-agy:
-    name: agy / ${{ matrix.skill }}
+    name: agy / ${{ matrix.skill }} / ${{ matrix.model }}
     runs-on: ubuntu-latest
+    needs: validate-config
     if: |
-      github.event_name != 'workflow_dispatch' ||
       inputs.platform == 'all' ||
       inputs.platform == 'agy'
     strategy:
       fail-fast: false
       matrix:
-        skill: [hawkscan, api]
+        skill: [hawkscan, api, stackhawk-data-seed]
+        model: [default]
 
     steps:
       - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
+      - uses: astral-sh/setup-uv@v5
+      - name: Install Claude Code CLI (native, rubric grader)
+        run: |
+          curl -fsSL https://claude.ai/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
 
       - name: Install agy CLI
-        run: curl -fsSL https://antigravity.google/install-cli | bash
+        run: |
+          # /cli/install.sh is the real bootstrapper; /install-cli returns the
+          # site's HTML landing page (piping that into bash is what broke before).
+          curl -fsSL https://antigravity.google/cli/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"   # installer drops `agy` here
+        continue-on-error: true  # don't abort the job — evals records any launch failure
 
       - name: Verify agy CLI
         run: agy --version
+        continue-on-error: true  # if unavailable, the eval run captures it as a per-prompt error
 
       - name: Install StackHawk plugins
         env:
-          AGY_API_KEY: ${{ secrets.AGY_API_KEY }}
+          ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }}  # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY
         run: |
           echo y | agy plugin install plugins/hawkscan
           echo y | agy plugin install plugins/api
+          echo y | agy plugin install plugins/stackhawk-data-seed
+        continue-on-error: true  # depends on agy CLI; best-effort so evals still runs
+
+      # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: "17"
+      - name: Install latest hawk CLI
+        uses: stackhawk/hawkscan-action@v2.5.0
+        with:
+          apiKey: ${{ secrets.HAWK_API_KEY }}
+          version: latest
+          installCLIOnly: true
+        continue-on-error: true  # install-only shouldn't need apiKey; don't let a check abort the job
+      - name: Verify hawk CLI
+        run: hawk version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
+      # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup
+      # checks (hawkop app list / env list) and the entire api skill. No official
+      # action exists, so install the native Linux binary straight into
+      # /usr/local/bin (already on PATH). Version + URL per the repo's api skill
+      # reference (plugins/api/skills/api/references/hawkop-shortcuts.md).
+      - name: Install hawkop CLI
+        run: |
+          set -euo pipefail
+          HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)"
+          echo "Installing hawkop v${HAWKOP_VERSION}"
+          curl -fLo /tmp/hawkop.tar.gz \
+            "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
+          tar -xzf /tmp/hawkop.tar.gz -C /tmp
+          sudo mv /tmp/hawkop /usr/local/bin/hawkop
+        continue-on-error: true  # don't abort the job — evals records any absence
+      - name: Verify hawkop CLI
+        run: hawkop --version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
 
       - name: Run ${{ matrix.skill }} evals
         env:
-          AGY_API_KEY: ${{ secrets.AGY_API_KEY }}
+          ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }}  # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # rubric grader (claude)
+          HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_FORMAT: json
         run: |
-          python3 evals/harnesses/agy/run-evals.py \
-            --skill ${{ matrix.skill }} \
-            --print-timeout 240s
+          MODEL_ARGS=()
+          if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
+          RUBRIC=""; if [ "${{ inputs.rubric }}" = "true" ]; then RUBRIC="--rubric"; fi
+          uv run evals --harness agy --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" $RUBRIC
+        continue-on-error: true  # best-effort; digest degrades gracefully (matches cursor)
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: eval-agy-${{ matrix.skill }}
+          name: eval-agy-${{ matrix.skill }}-${{ matrix.model }}
           path: evals/harnesses/agy/results/${{ matrix.skill }}/
           retention-days: 30
 
   # ── Cursor ────────────────────────────────────────────────────────────────
   eval-cursor:
-    name: cursor / ${{ matrix.skill }}
+    name: cursor / ${{ matrix.skill }} / ${{ matrix.model }}
     runs-on: ubuntu-latest
+    needs: validate-config
     if: |
-      github.event_name != 'workflow_dispatch' ||
       inputs.platform == 'all' ||
       inputs.platform == 'cursor'
     strategy:
       fail-fast: false
       matrix:
-        skill: [hawkscan, api]
+        skill: [hawkscan, api, stackhawk-data-seed]
+        model: [default]
 
     steps:
       - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: "20"
+      - uses: astral-sh/setup-uv@v5
 
       - name: Install Cursor CLI
-        run: npm install -g @cursor/cli || npm install -g cursor-agent
-        continue-on-error: true  # package name TBD; update when stable
+        run: |
+          # Official installer; symlinks the `agent` binary into ~/.local/bin.
+          # (@cursor/cli / cursor-agent npm packages don't exist — they 404'd.)
+          curl https://cursor.com/install -fsS | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+        continue-on-error: true  # best-effort; evals records any launch failure
 
       - name: Verify agent CLI
         run: agent --version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
+      - name: Install Claude Code CLI (native, rubric grader)
+        run: |
+          curl -fsSL https://claude.ai/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: "17"
+      - name: Install latest hawk CLI
+        uses: stackhawk/hawkscan-action@v2.5.0
+        with:
+          apiKey: ${{ secrets.HAWK_API_KEY }}
+          version: latest
+          installCLIOnly: true
+        continue-on-error: true  # install-only shouldn't need apiKey; don't let a check abort the job
+      - name: Verify hawk CLI
+        run: hawk version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
+      # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup
+      # checks (hawkop app list / env list) and the entire api skill. No official
+      # action exists, so install the native Linux binary straight into
+      # /usr/local/bin (already on PATH). Version + URL per the repo's api skill
+      # reference (plugins/api/skills/api/references/hawkop-shortcuts.md).
+      - name: Install hawkop CLI
+        run: |
+          set -euo pipefail
+          HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)"
+          echo "Installing hawkop v${HAWKOP_VERSION}"
+          curl -fLo /tmp/hawkop.tar.gz \
+            "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
+          tar -xzf /tmp/hawkop.tar.gz -C /tmp
+          sudo mv /tmp/hawkop /usr/local/bin/hawkop
+        continue-on-error: true  # don't abort the job — evals records any absence
+      - name: Verify hawkop CLI
+        run: hawkop --version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
 
       - name: Run ${{ matrix.skill }} evals
         env:
           CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # rubric grader (claude)
+          HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_FORMAT: json
         run: |
-          python3 evals/harnesses/cursor/run-evals.py \
-            --skill ${{ matrix.skill }}
+          MODEL_ARGS=()
+          if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
+          RUBRIC=""; if [ "${{ inputs.rubric }}" = "true" ]; then RUBRIC="--rubric"; fi
+          uv run evals --harness cursor --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" $RUBRIC
+        continue-on-error: true  # best-effort; digest degrades gracefully
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: eval-cursor-${{ matrix.skill }}
+          name: eval-cursor-${{ matrix.skill }}-${{ matrix.model }}
           path: evals/harnesses/cursor/results/${{ matrix.skill }}/
           retention-days: 30
 
   # ── PR comment ────────────────────────────────────────────────────────────
-  comment:
-    name: Post PR summary
-    needs: [eval-claude-code, eval-codex, eval-agy, eval-cursor]
-    if: always() && github.event_name == 'pull_request'
+  report:
+    name: Eval report (run summary + PR comment)
+    needs: [validate-config, eval-claude-code, eval-codex, eval-agy, eval-cursor]
+    if: always()
     runs-on: ubuntu-latest
     permissions:
       pull-requests: write
 
     steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - uses: actions/download-artifact@v4
         with:
           pattern: eval-*
           merge-multiple: false
           path: results/
-
-      - name: Build and post comment
+      - uses: astral-sh/setup-uv@v5
+      - name: Fetch released baseline (best-effort)
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set +e
+          mkdir -p baseline
+          TAG=$(gh release view --json tagName -q .tagName 2>/dev/null)
+          if [ -z "$TAG" ]; then echo "no release yet"; exit 0; fi
+          SHA=$(git rev-list -n 1 "$TAG" 2>/dev/null)
+          RUN=$(gh run list --workflow capture-baseline.yml --json databaseId,headSha \
+                 -q "map(select(.headSha==\"$SHA\")) | .[0].databaseId" 2>/dev/null)
+          if [ -z "$RUN" ] || [ "$RUN" = "null" ]; then echo "no capture run for $TAG"; exit 0; fi
+          gh run download "$RUN" -p 'baseline-*' -D baseline 2>/dev/null || echo "download failed"
+          echo "baseline fetched for $TAG (run $RUN)"
+      - name: Build digest
+        run: uv run report --pr --results-dir results --baseline-dir baseline --lift-dir results --out digest.md
+      - name: Write digest to run summary  # runs under always(), so tolerate a digest that was never built
+        if: always()
+        run: if [ -s digest.md ]; then cat digest.md; else echo "_No eval digest was produced._"; fi >> "$GITHUB_STEP_SUMMARY"
+      - name: Post digest comment
+        if: github.event_name == 'pull_request'  # NOTE(review): unreachable while the workflow is workflow_dispatch-only
         uses: actions/github-script@v7
         with:
           script: |
             const fs = require('fs');
-            const path = require('path');
-
-            const needsResult = ${{ toJSON(needs) }};
-            const allSuccess = Object.values(needsResult).every(n => n.result === 'success');
-            const overallIcon = allSuccess ? '✅' : '❌';
-
-            let body = `## ${overallIcon} Skill Eval Results\n\n`;
-
-            const platforms = ['claude-code', 'codex', 'agy', 'cursor'];
-            const skills = ['hawkscan', 'api'];
-
-            for (const platform of platforms) {
-              body += `### Platform: \`${platform}\`\n\n`;
-              for (const skill of skills) {
-                const summaryPath = path.join(
-                  'results', `eval-${platform}-${skill}`, 'summary.json'
-                );
-
-                if (!fs.existsSync(summaryPath)) {
-                  body += `**\`${skill}\`**: ⚠️ No results\n`;
-                  continue;
-                }
-
-                const s = JSON.parse(fs.readFileSync(summaryPath, 'utf8'));
-                const ta = s.trigger_accuracy;
-                const triggerIcon = ta.correct === ta.total ? '✅' : '❌';
-
-                body += `**\`${skill}\`**: ${triggerIcon} Trigger ${ta.correct}/${ta.total}`;
-                if (s.process_avg_score !== null) {
-                  const scoreIcon = s.process_avg_score >= 70 && s.total_blocking_failures === 0 ? '✅' : '⚠️';
-                  body += ` | ${scoreIcon} Process ${s.process_avg_score}/100`;
-                }
-                if (s.false_positives?.length) body += ` | ⚠️ FP: ${s.false_positives.join(', ')}`;
-                if (s.false_negatives?.length) body += ` | ⚠️ FN: ${s.false_negatives.join(', ')}`;
-                body += '\n';
-              }
-              body += '\n';
-            }
-
-            body += `---\n_Commit ${context.sha.slice(0, 7)}. `;
-            body += `[Full results](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId})_\n`;
-
+            const digest = fs.readFileSync('digest.md', 'utf8'), tag = '<!-- skill-eval-comment -->', body = digest.includes(tag) ? digest : tag + '\n' + digest;  // keep the marker so the lookup below finds and updates the comment
             const marker = '<!-- skill-eval-comment -->';
             const { data: comments } = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number,
-            });
+              owner: context.repo.owner, repo: context.repo.repo,
+              issue_number: context.issue.number });
             const existing = comments.find(c => c.body.includes(marker));
-            const fullBody = marker + '\n' + body;
-
             if (existing) {
-              await github.rest.issues.updateComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                comment_id: existing.id,
-                body: fullBody,
-              });
+              await github.rest.issues.updateComment({ owner: context.repo.owner,
+                repo: context.repo.repo, comment_id: existing.id, body });
             } else {
-              await github.rest.issues.createComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                issue_number: context.issue.number,
-                body: fullBody,
-              });
+              await github.rest.issues.createComment({ owner: context.repo.owner,
+                repo: context.repo.repo, issue_number: context.issue.number, body });
             }
diff --git a/.gitignore b/.gitignore
index 61cc84f..f5df676 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,8 @@ docs/superpowers/
 .worktrees/
 .idea/
 *.iml
+
+# Python
+__pycache__/
+*.py[cod]
+.venv/
diff --git a/evals/README.md b/evals/README.md
index 74ebac7..b9b3458 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -7,11 +7,11 @@ Evaluation assets for the `hawkscan`, `api`, and `stackhawk-data-seed` skills. T
 ```
 evals/
   hawkscan/
-    prompts.csv          # 20 trigger/no-trigger test cases for the hawkscan skill
+    prompts.yaml         # 20 trigger/no-trigger test cases for the hawkscan skill
     process-checks.json  # Deterministic checks: commands, files, and patterns that must (or must not) appear
     rubric-items.json    # Qualitative rubric check definitions for style and correctness grading
   api/
-    prompts.csv          # 16 trigger/no-trigger test cases for the api skill
+    prompts.yaml         # 16 trigger/no-trigger test cases for the api skill
     process-checks.json  # Deterministic checks
     rubric-items.json    # Qualitative rubric check definitions
   stackhawk-data-seed/
@@ -19,17 +19,19 @@ evals/
     process-checks.json  # Deterministic checks for discovery, dialog, artifact emission, and contract boundaries
     rubric-items.json    # Qualitative rubric check definitions
   rubric-schema.json     # Shared JSON Schema — constrains rubric grader output format
+  lib/                   # Shared library: models, config, grading, harness, replay, compare, reporting
+  cli.py                 # Unified CLI entrypoints (evals, compare, regrade, report, validate)
   harnesses/
-    README.md            # How to build platform-specific harnesses (Codex, Claude, Gemini, etc.)
+    README.md            # How to build platform-specific harnesses (Codex, Claude, etc.)
 ```
 
 ## Three layers of evaluation
 
-### 1. Trigger evals (`prompts.csv`)
+### 1. Trigger evals (`prompts.yaml`)
 
-Each row is a prompt with a `should_trigger` flag. Run the prompt through an agent and record whether the skill was invoked.
+Each entry is a prompt with a `should_trigger` flag. Run the prompt through an agent and record whether the skill was invoked. Each prompt may also set a `budget` (cost_usd / bash_commands / output_tokens / wall_seconds) and an `expected` list (each item has exactly one of: signal / anti_pattern / check_id).
 
-Columns: `id`, `should_trigger`, `invocation_type`, `prompt`, `notes`
+Fields: `id`, `should_trigger`, `invocation_type`, `prompt`, `notes`
 
 Invocation types:
 - `explicit` — skill named directly (e.g. `$hawkscan` or `$api`)
@@ -50,19 +52,63 @@ A second, read-only grader pass over the agent's output and generated files. The
 
 ## Running evals
 
-Harnesses are platform-specific. See `harnesses/README.md` for the contract and planned implementations.
+This is a uv project. All commands go through `uv run`.
 
-**Manual checklist:**
-1. Run the prompt in the target agent
-2. Check the output and any generated files against `process-checks.json` — look for `signals` (must appear) and `anti_patterns` (must not appear)
-3. Run a grader with the `grader_prompt` from `rubric-items.json` against the output; require JSON output conforming to `rubric-schema.json`
-4. Record results per check; track scores over time to detect regressions
+| Task | Command |
+|---|---|
+| Validate config (no keys) | `uv run validate` |
+| Run a skill | `uv run evals --harness claude-code --skill hawkscan` |
+| Single prompt | `uv run evals --harness claude-code --skill hawkscan --id hw-07` |
+| Compare with/without skill | `uv run compare --harness claude-code --skill hawkscan` |
+| Regrade a saved trace (free) | `uv run regrade <trace.jsonl> --skill hawkscan` |
+
+Per-prompt config lives in `evals/<skill>/prompts.yaml`. Each prompt may set a
+`budget` (cost_usd / bash_commands / output_tokens / wall_seconds) and an
+`expected` list (each item has exactly one of: signal / anti_pattern / check_id).
+A correct run that breaches a budget grades as PASS-SLOW. A process-check in
+`process-checks.json` may carry `applies_to: [<prompt id>]` to scope it to
+specific prompts (absent = applies to all).
+
+See `harnesses/README.md` for per-platform instructions and CI setup.
+
+### Reports
+
+**Per-job results.** Each `uv run evals` run renders a JUnit-style table:
+one row per test, failures-first ordering,
+`✅ PASS / ◆ PASS-SLOW / ❌ FAIL` verdicts. Individual runs do not write to
+`$GITHUB_STEP_SUMMARY`; each writes a `cell.json` artifact in its results
+directory so the `report` step can aggregate all jobs into a single table.
+
+**PR digest comment.** On each pull request, the `comment` CI job collects all
+`cell.json` artifacts and runs:
+
+```
+uv run report --pr [--results-dir DIR] [--baseline-dir DIR] [--lift-dir DIR] [--out FILE]
+```
+
+This produces a consolidated Markdown digest posted as a sticky PR comment.
+The digest contains:
+
+- **Matrix overview** — one row per (platform × skill × model) cell showing
+  trigger accuracy, ✅/◆/❌ verdict mix, and aggregate score.
+- **Per-cell tables** — the same failures-first rows from each job summary.
+- **Regression vs released-tag baseline** — the `comment` job fetches the
+  baseline from the most recent release's `capture-baseline.yml` run
+  (best-effort; missing baseline degrades gracefully to "no baseline
+  available"). Comparison is pure deterministic threshold math: per-test
+  verdict-flips (fixed / regressed) and aggregate score deltas with a ±3
+  band → better / worse / no-change. No AI or LLM calls are used.
+- **Skill lift section** — with-skill vs without-skill verdict comparison
+  showing how many prompts move from FAIL→PASS when the skill is active.
+
+Baselines are captured at release tags by `capture-baseline.yml`, which is
+triggered automatically from `release.yml`.
 
 ## Adding test cases
 
 When a skill bug or regression is discovered:
 
-1. Add a new row to the relevant `prompts.csv` capturing the prompt that exposed the bug
+1. Add a new entry to the relevant `prompts.yaml` capturing the prompt that exposed the bug
 2. If the bug was a missing process step, add a check to `process-checks.json`
 3. If the bug was a style or qualitative issue, add a check to the relevant `rubric-items.json`
 
diff --git a/evals/__init__.py b/evals/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/evals/api/prompts.csv b/evals/api/prompts.csv
deleted file mode 100644
index 93d89e1..0000000
--- a/evals/api/prompts.csv
+++ /dev/null
@@ -1,17 +0,0 @@
-id,should_trigger,invocation_type,prompt,notes
-api-01,true,explicit,"Use the $api skill to show me my org's security posture","Direct skill reference by name"
-api-02,true,implicit,"Show me the security posture of my StackHawk organization","'security posture' keyword from skill description"
-api-03,true,implicit,"What are my untriaged findings across all apps?","'untriaged findings' keyword"
-api-04,true,implicit,"Give me a security dashboard overview for my org","'security dashboard' keyword"
-api-05,true,implicit,"Show me the scan history for my payment-api application","'scan history' keyword"
-api-06,true,implicit,"What needs immediate security attention in my org?","'what needs attention' keyword"
-api-07,true,contextual,"Which of my apps have HIGH severity findings that haven't been addressed?","Finding severity query; no keyword match — tests whether description alone triggers"
-api-08,true,contextual,"What changed in my security findings since last week?","Delta / diff query; tests 'what's new' path in Step 4"
-api-09,true,contextual,"Which apps haven't been scanned in over 30 days?","Stale app detection; tests Step 3 posture with stale-app focus"
-api-10,true,contextual,"I need a security report for the team's weekly standup","Reporting use case with no API-specific keywords"
-api-11,true,contextual,"Pull the full finding details for the checkout-service — the PM wants a severity breakdown","App deep dive; tests Step 4 path"
-api-12,false,negative,"Run a DAST scan on my API","Scan request → hawkscan skill; 'scan' should not route to api skill"
-api-13,false,negative,"Scan my app on localhost:8080 for vulnerabilities","Explicit scan request → hawkscan"
-api-14,false,negative,"Create a stackhawk.yml for my service","Config generation → hawkscan"
-api-15,false,negative,"Fix the CORS misconfiguration that HawkScan found","Code fix → hawkscan + code change; api skill reads only"
-api-16,false,negative,"Run HawkScan against my staging environment","Scan request → hawkscan; 'StackHawk' keyword should not override scan intent"
diff --git a/evals/api/prompts.yaml b/evals/api/prompts.yaml
new file mode 100644
index 0000000..72b0534
--- /dev/null
+++ b/evals/api/prompts.yaml
@@ -0,0 +1,80 @@
+- id: api-01
+  should_trigger: true
+  invocation_type: explicit
+  prompt: Use the $api skill to show me my org's security posture
+  notes: Direct skill reference by name
+- id: api-02
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Show me the security posture of my StackHawk organization
+  notes: '''security posture'' keyword from skill description'
+- id: api-03
+  should_trigger: true
+  invocation_type: implicit
+  prompt: What are my untriaged findings across all apps?
+  notes: '''untriaged findings'' keyword'
+- id: api-04
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Give me a security dashboard overview for my org
+  notes: '''security dashboard'' keyword'
+- id: api-05
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Show me the scan history for my payment-api application
+  notes: '''scan history'' keyword'
+- id: api-06
+  should_trigger: true
+  invocation_type: implicit
+  prompt: What needs immediate security attention in my org?
+  notes: '''what needs attention'' keyword'
+- id: api-07
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Which of my apps have HIGH severity findings that haven't been addressed?
+  notes: Finding severity query; no keyword match — tests whether description alone triggers
+- id: api-08
+  should_trigger: true
+  invocation_type: contextual
+  prompt: What changed in my security findings since last week?
+  notes: Delta / diff query; tests 'what's new' path in Step 4
+- id: api-09
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Which apps haven't been scanned in over 30 days?
+  notes: Stale app detection; tests Step 3 posture with stale-app focus
+- id: api-10
+  should_trigger: true
+  invocation_type: contextual
+  prompt: I need a security report for the team's weekly standup
+  notes: Reporting use case with no API-specific keywords
+- id: api-11
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Pull the full finding details for the checkout-service — the PM wants a severity breakdown
+  notes: App deep dive; tests Step 4 path
+- id: api-12
+  should_trigger: false
+  invocation_type: negative
+  prompt: Run a DAST scan on my API
+  notes: Scan request → hawkscan skill; 'scan' should not route to api skill
+- id: api-13
+  should_trigger: false
+  invocation_type: negative
+  prompt: Scan my app on localhost:8080 for vulnerabilities
+  notes: Explicit scan request → hawkscan
+- id: api-14
+  should_trigger: false
+  invocation_type: negative
+  prompt: Create a stackhawk.yml for my service
+  notes: Config generation → hawkscan
+- id: api-15
+  should_trigger: false
+  invocation_type: negative
+  prompt: Fix the CORS misconfiguration that HawkScan found
+  notes: Code fix → hawkscan + code change; api skill reads only
+- id: api-16
+  should_trigger: false
+  invocation_type: negative
+  prompt: Run HawkScan against my staging environment
+  notes: Scan request → hawkscan; 'StackHawk' keyword should not override scan intent
diff --git a/evals/cli.py b/evals/cli.py
new file mode 100644
index 0000000..764801a
--- /dev/null
+++ b/evals/cli.py
@@ -0,0 +1,174 @@
+"""Unified eval CLI. Entry points: evals, compare, regrade, report, validate."""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+from evals.lib.config import load_skill
+from evals.lib.grading import grade
+from evals.lib.harness import get_adapter
+from evals.lib.replay import regrade as _regrade
+from evals.lib.reporting import build_summary, render_table, render_compare, console
+from evals.lib.compare import compare_skill
+
+PLATFORMS = ["claude-code", "codex", "cursor", "copilot", "agy"]
+RESULTS_ROOT = Path(__file__).resolve().parent / "harnesses"
+
+
+def _common_args(p: argparse.ArgumentParser) -> None:  # flags shared by `evals` and `compare`
+    p.add_argument("--skill", required=True, choices=["hawkscan", "api", "stackhawk-data-seed"])
+    p.add_argument("--harness", default="claude-code", choices=PLATFORMS)
+    p.add_argument("--id", dest="prompt_id")  # restrict the run to a single prompt id
+    p.add_argument("--model")
+    p.add_argument("--max-budget", type=float, default=0.20)
+    p.add_argument("--bare", action="store_true")
+    p.add_argument("--full-auto", action="store_true")
+    p.add_argument("--rubric", action="store_true",
+                   help="also run the qualitative model-graded rubric (needs ANTHROPIC_API_KEY)")
+
+
+def main() -> None:
+    """`evals` entry point: run and grade a skill's prompts, write artifacts, exit 1 on failures."""
+    ap = argparse.ArgumentParser(prog="evals")
+    _common_args(ap)
+    args = ap.parse_args()
+
+    cfg = load_skill(args.skill)
+    adapter = get_adapter(args.harness)
+    plugin_dirs = [str(Path.cwd() / "plugins" / args.skill)]
+    prompts = [p for p in cfg.prompts if not args.prompt_id or p.id == args.prompt_id]
+    if not prompts:
+        print(f"no prompt '{args.prompt_id}'", file=sys.stderr); sys.exit(1)
+
+    from evals.lib.models import EvalResult, Verdict
+    results = []
+    out_dir = RESULTS_ROOT / args.harness / "results" / args.skill
+    out_dir.mkdir(parents=True, exist_ok=True)
+    for p in prompts:
+        try:
+            run = adapter.launch(p.prompt, args.skill, p.id, plugin_dirs,
+                                 model=args.model, load_skill=True,
+                                 max_budget=args.max_budget, bare=args.bare,
+                                 full_auto=args.full_auto)
+            did = adapter.detect_trigger(run, args.skill)
+            res = grade(p, run, cfg.checks, platform=args.harness, skill=args.skill,
+                        did_trigger=did)
+            # Opt-in qualitative rubric: a claude grader scores the transcript and the
+            # result is attached for the reporter's pass/fail table. Only when the skill
+            # triggered correctly — a workflow rubric is moot for a non-triggering run.
+            if args.rubric and res.trigger_correct and did:
+                from evals.lib.rubric import grade_rubric
+                res.rubric = grade_rubric(run, args.skill, p.id)
+            # persist a trace for visibility (uploaded with the artifact)
+            trace = (f"# {p.id} (returncode={run.returncode})\n"
+                     f"## error\n{run.error or ''}\n"
+                     f"## stderr_tail\n{run.stderr_tail}\n"
+                     f"## output_text\n{run.output_text}\n"
+                     f"## bash_commands\n" + "\n".join(run.bash_commands) + "\n")
+            (out_dir / f"{p.id}.trace.txt").write_text(trace)
+        except Exception as e:  # noqa: BLE001 — never let one prompt abort the cell
+            # NOTE(review): a harness crash on a should_trigger=False prompt grades PASS/100.
+            res = EvalResult(platform=args.harness, skill=args.skill, run_id=p.id,
+                             should_trigger=p.should_trigger, did_trigger=False,
+                             trigger_correct=(not p.should_trigger),
+                             verdict=Verdict.FAIL if p.should_trigger else Verdict.PASS,
+                             score=0 if p.should_trigger else 100,
+                             note=f"harness exception: {type(e).__name__}: {e}")
+            (out_dir / f"{p.id}.trace.txt").write_text(
+                f"# {p.id}\n## harness exception\n{type(e).__name__}: {e}\n")
+        results.append(res)
+        (out_dir / f"{p.id}.result.json").write_text(res.model_dump_json(indent=2))
+
+    render_table(results)
+    summary = build_summary(args.skill, args.harness, results)
+    summary["timestamp"] = datetime.now(timezone.utc).isoformat()
+    (out_dir / "summary.json").write_text(json.dumps(summary, indent=2))
+
+    from evals.lib.models import CellReport
+    import subprocess as _sp
+    commit = _sp.run(["git", "rev-parse", "--short", "HEAD"], capture_output=True,
+                     text=True).stdout.strip() or "unknown"
+    cell = CellReport(platform=args.harness, skill=args.skill,
+                      model=args.model or "default", commit=commit, results=results)
+    (out_dir / "cell.json").write_text(cell.model_dump_json(indent=2))
+    # Cells no longer write to GITHUB_STEP_SUMMARY: the `report` job aggregates every
+    # cell.json into one pivot table (render_digest) instead of one table per cell.
+
+    if summary["false_positives"] or summary["false_negatives"] or \
+            summary["total_blocking_failures"] > 0:
+        sys.exit(1)
+
+
+def compare() -> None:
+    """Run the with/without-skill comparison and persist it as lift.json."""
+    ap = argparse.ArgumentParser(prog="compare")
+    _common_args(ap)
+    args = ap.parse_args()
+    rows = compare_skill(args.skill, args.harness, model=args.model,
+                         max_budget=args.max_budget, bare=args.bare,
+                         full_auto=args.full_auto, only_id=args.prompt_id)
+    # Same results directory `main` writes cell.json to, so `report` can pair them.
+    out_dir = RESULTS_ROOT / args.harness / "results" / args.skill
+    out_dir.mkdir(parents=True, exist_ok=True)
+    (out_dir / "lift.json").write_text(json.dumps(
+        [{**r, "with_verdict": r["with_verdict"].value,
+          "without_verdict": r["without_verdict"].value} for r in rows], indent=2))
+    render_compare(rows)
+
+
+def regrade() -> None:
+    """Re-grade a saved trace file for one skill/platform and render the result table."""
+    parser = argparse.ArgumentParser(prog="regrade")
+    parser.add_argument("trace", type=Path)
+    parser.add_argument("--skill", required=True, choices=["hawkscan", "api", "stackhawk-data-seed"])
+    parser.add_argument("--harness", default="claude-code", choices=PLATFORMS)
+    opts = parser.parse_args()
+    render_table([_regrade(opts.trace, skill=opts.skill, platform=opts.harness)])
+
+
+def report() -> None:
+    """Aggregate cell.json artifacts (plus optional baselines and lift) into a Markdown digest."""
+    from evals.lib.baseline import load_baseline_dir
+    from evals.lib.models import CellReport
+    from evals.lib.reporting import render_digest
+    ap = argparse.ArgumentParser(prog="report")
+    ap.add_argument("--pr", action="store_true")  # accepted, but not read anywhere below
+    ap.add_argument("--results-dir", type=Path, default=Path("results"))
+    ap.add_argument("--baseline-dir", type=Path, default=None)
+    ap.add_argument("--lift-dir", type=Path, default=None)
+    ap.add_argument("--out", type=Path, default=Path("digest.md"))
+    args = ap.parse_args()
+    cells = []
+    for cj in sorted(args.results_dir.rglob("cell.json")):
+        try:
+            cells.append(CellReport.model_validate_json(cj.read_text()))
+        except Exception as e:  # noqa: BLE001 — one bad artifact must not sink the digest
+            print(f"skipping {cj}: {type(e).__name__}: {e}", file=sys.stderr)
+    baselines = load_baseline_dir(args.baseline_dir) or None
+    lift = {}
+    if args.lift_dir and args.lift_dir.exists():
+        for lj in sorted(args.lift_dir.rglob("lift.json")):
+            sib = lj.parent / "cell.json"
+            if not sib.exists():
+                continue
+            try:  # same best-effort policy as the cell.json loop
+                cell = CellReport.model_validate_json(sib.read_text())
+                lift[(cell.platform, cell.skill, cell.model)] = json.loads(lj.read_text())
+            except Exception as e:  # noqa: BLE001
+                print(f"skipping {lj}: {type(e).__name__}: {e}", file=sys.stderr)
+    md = render_digest(cells, baselines=baselines, lift=lift or None)
+    args.out.write_text(md)
+    print(f"wrote {args.out} ({len(cells)} cells)")
+
+
+def validate() -> None:
+    """Load and validate one skill's eval config, or every skill when --skill is omitted."""
+    parser = argparse.ArgumentParser(prog="validate")
+    parser.add_argument("--skill", choices=["hawkscan", "api", "stackhawk-data-seed"])
+    opts = parser.parse_args()
+    for name in ([opts.skill] if opts.skill else ["hawkscan", "api", "stackhawk-data-seed"]):
+        cfg = load_skill(name)   # raises on any validation error
+        console.print(f"[green]✓[/] {name}: {len(cfg.prompts)} prompts, "
+                      f"{len(cfg.checks)} checks valid")
diff --git a/evals/harnesses/README.md b/evals/harnesses/README.md
index 16d2370..04d8b2a 100644
--- a/evals/harnesses/README.md
+++ b/evals/harnesses/README.md
@@ -16,6 +16,8 @@ Each harness connects the platform-agnostic test cases in `evals/` to a specific
 
 ### Prerequisites
 
+Install [uv](https://docs.astral.sh/uv/) if you don't have it — `uv run` handles dependency installation automatically, so no separate `uv sync` step is needed before running evals.
+
 Install the CLI for whichever platform you want to test:
 
 ```bash
@@ -30,18 +32,18 @@ curl -fsSL https://antigravity.google/install-cli | bash  # Antigravity (agy)
 
 ```bash
 # Requires: ANTHROPIC_API_KEY
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan
-python3 evals/harnesses/claude-code/run-evals.py --skill api
+uv run evals --harness claude-code --skill hawkscan
+uv run evals --harness claude-code --skill api
 
 # Override model (default: claude's configured default)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --model claude-opus-4-7
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --model claude-haiku-4-5-20251001
+uv run evals --harness claude-code --skill hawkscan --model claude-opus-4-7
+uv run evals --harness claude-code --skill hawkscan --model claude-haiku-4-5-20251001
 
 # Single prompt
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --id hw-07
+uv run evals --harness claude-code --skill hawkscan --id hw-07
 
 # Dry run (no API calls)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --dry-run
+uv run validate --skill hawkscan
 ```
 
 ### Codex
@@ -55,20 +57,20 @@ codex plugin add stackhawk-api@stackhawk
 
 ```bash
 # Requires: OPENAI_API_KEY
-python3 evals/harnesses/codex/run-evals.py --skill hawkscan
-python3 evals/harnesses/codex/run-evals.py --skill api
+uv run evals --harness codex --skill hawkscan
+uv run evals --harness codex --skill api
 
 # Override model
-python3 evals/harnesses/codex/run-evals.py --skill hawkscan --model gpt-5.5
-python3 evals/harnesses/codex/run-evals.py --skill hawkscan --model o3
+uv run evals --harness codex --skill hawkscan --model gpt-5.5
+uv run evals --harness codex --skill hawkscan --model o3
 ```
 
 ### Cursor
 
 ```bash
 # Requires: Cursor Pro account
-python3 evals/harnesses/cursor/run-evals.py --skill hawkscan
-python3 evals/harnesses/cursor/run-evals.py --skill api
+uv run evals --harness cursor --skill hawkscan
+uv run evals --harness cursor --skill api
 ```
 
 ### Copilot
@@ -76,9 +78,9 @@ python3 evals/harnesses/cursor/run-evals.py --skill api
 ```bash
 # Requires: GitHub Copilot account (gh copilot or copilot CLI)
 # No plugin setup needed — loads directly via --plugin-dir
-python3 evals/harnesses/copilot/run-evals.py --skill hawkscan
-python3 evals/harnesses/copilot/run-evals.py --skill api
-python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --model gpt-5.3-codex
+uv run evals --harness copilot --skill hawkscan
+uv run evals --harness copilot --skill api
+uv run evals --harness copilot --skill hawkscan --model gpt-5.3-codex
 ```
 
 > **Best trigger detection**: Copilot emits an explicit `skill` tool call
@@ -95,16 +97,23 @@ agy plugin install /path/to/agent-skills/plugins/api
 
 ```bash
 # Run with your main agy session idle (background tasks bleed in otherwise)
-python3 evals/harnesses/agy/run-evals.py --skill hawkscan
-python3 evals/harnesses/agy/run-evals.py --skill api
+uv run evals --harness agy --skill hawkscan
+uv run evals --harness agy --skill api
 
 # Longer timeout for slow prompts
-python3 evals/harnesses/agy/run-evals.py --skill hawkscan --print-timeout 300s
+uv run evals --harness agy --skill hawkscan --print-timeout 300s
 ```
 
+> **Shims vs adapters**: The per-platform `run-evals.py` scripts are thin
+> back-compat shims that forward to `uv run evals`. Full stream-parsing
+> adapter logic lives in `evals/harnesses/<platform>/adapter.py`;
+> **claude-code, codex, cursor, and agy** all have real `adapter.py`
+> implementations. Copilot and Gemini use the legacy shim path (Gemini is
+> frozen).
+
 ## How it works
 
-For each row in `evals/<skill>/prompts.csv`, each harness:
+For each entry in `evals/<skill>/prompts.yaml`, each harness:
 
 1. Runs `agent -p "<prompt>"` in a fresh isolated directory
 2. Captures bash commands executed and text output
@@ -122,7 +131,17 @@ For each row in `evals/<skill>/prompts.csv`, each harness:
 
 ## CI
 
-The `.github/workflows/skill-evals.yml` workflow runs Claude Code + Codex + Gemini + Cursor on every PR that touches `plugins/` or `evals/`.
+The `.github/workflows/skill-evals.yml` workflow is tiered:
+
+- **Every PR + push**: runs `uv run validate` (no API keys required), then runs
+  **all four platforms** (claude-code, codex, agy, cursor). On PRs, claude-code
+  uses the Haiku model to stay within budget; the other platforms run their
+  default model.
+- **Merge to main + manual dispatch**: runs the full multi-model matrix across
+  all platforms.
+- **PR comment job**: collects `cell.json` artifacts from all platform jobs,
+  fetches the released-tag baseline (best-effort), and posts a consolidated
+  digest comment via `uv run report --pr`.
 
 Required GitHub secrets:
 - `ANTHROPIC_API_KEY` — Claude Code
diff --git a/evals/harnesses/_manual_harness.py b/evals/harnesses/_manual_harness.py
index 7b400a9..f996e44 100644
--- a/evals/harnesses/_manual_harness.py
+++ b/evals/harnesses/_manual_harness.py
@@ -3,13 +3,13 @@
 Import this from platform-specific run-evals.py files.
 """
 
-import csv
 import json
-import os
 import sys
 from datetime import datetime, timezone
 from pathlib import Path
 
+from evals.lib.config import load_skill
+
 
 HARNESS_ROOT = Path(__file__).parent.resolve()
 EVALS_DIR    = HARNESS_ROOT.parent
@@ -36,23 +36,22 @@ def run_manual_evals(
     prompt_id: str | None,
     rubric: bool,
 ) -> None:
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
     results_dir  = HARNESS_ROOT / platform / "results" / skill
 
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
+    cfg = load_skill(skill)
+    all_prompts = cfg.prompts
+    checks = cfg.checks
     blocking_checks = [c for c in checks if c.get("severity") == "blocking"]
 
     rubric_items = None
     if rubric:
+        # rubric-items.json is not yet part of evals.lib — loaded directly for now
         rubric_path = EVALS_DIR / skill / "rubric-items.json"
         if rubric_path.exists():
             rubric_items = json.loads(rubric_path.read_text())["checks"]
 
     if prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == prompt_id]
+        prompts = [p for p in all_prompts if p.id == prompt_id]
         if not prompts:
             print(f"ERROR: No prompt with id '{prompt_id}'", file=sys.stderr)
             sys.exit(1)
@@ -70,11 +69,11 @@ def run_manual_evals(
     all_results = []
 
     for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-        notes          = row.get("notes", "")
+        run_id         = row.id
+        prompt         = row.prompt
+        should_trigger = row.should_trigger
+        itype          = row.invocation_type
+        notes          = row.notes
 
         print(f"\n{'─' * 68}")
         print(f"[{run_id}]  {itype:<12}  should_trigger={'Y' if should_trigger else 'N'}")
diff --git a/evals/harnesses/agy/adapter.py b/evals/harnesses/agy/adapter.py
new file mode 100644
index 0000000..f00e2a2
--- /dev/null
+++ b/evals/harnesses/agy/adapter.py
@@ -0,0 +1,172 @@
+"""agy Harness adapter. Plain-text output (no structured stream).
+
+Pre-shim (5472ed2~1:evals/harnesses/agy/run-evals.py) notes:
+- agy outputs plain text — no --output-format flag available.
+- Trigger detection scans output_text only; no bash_commands ever populated.
+- Skills installed globally via `agy plugin install` (done in CI); load_skill
+  toggling is a no-op here.
+- AGY_API_KEY passed via os.environ (CI sets it); no special env handling needed.
+- Launch: agy -p <prompt> --print-timeout <timeout> [--model M]
+- The pre-shim used a unified ALL_SIGNALS dict (no CLI/INVOCATION split) with
+  SKILL: prefix signals.  Those are carried in INVOCATION_SIGNALS below alongside
+  the backtick-evaluation-format signals shared by codex/cursor adapters.
+"""
+from __future__ import annotations
+import shutil
+import subprocess
+import tempfile
+
+from evals.lib.models import ParsedRun
+from evals.lib.triggers import explicit_decision, decide_trigger
+from evals.lib.observe import observe_suffix
+
+# CLI_SIGNALS: agy emits plain text — there are no shell commands to scan.
+CLI_SIGNALS: dict[str, list[str]] = {
+    "hawkscan": [],
+    "api": [],
+    "stackhawk-data-seed": [],
+}
+
+# INVOCATION_SIGNALS: checked against output_text.
+# Combines the pre-shim ALL_SIGNALS (SKILL: prefix variants) with the
+# evaluation-format backtick signals used by the shared skill prompts.
+INVOCATION_SIGNALS: dict[str, list[str]] = {
+    "hawkscan": [
+        # Pre-shim ALL_SIGNALS (verbatim from 5472ed2~1:evals/harnesses/agy/run-evals.py)
+        "skill: hawkscan",
+        "skill:hawkscan",
+        # Evaluation-format variants emitted by the shared skill evaluation suffix
+        "hawkscan:hawkscan`: yes",
+        "hawkscan:hawkscan` — yes",
+        "hawkscan:hawkscan**: yes",
+        "hawkscan:hawkscan** — yes",
+        "hawkscan:hawkscan: yes",
+        "hawkscan:hawkscan — yes",
+        # Action-intent phrases
+        "autonomous security scan",
+        "dast scan after code",
+        "dast scan triggered",
+        "dast scan required",
+        "security scan required",
+        "security scan after",
+        "run the security scan",
+        "running the hawkscan",
+        "running the security scan",
+    ],
+    "api": [
+        # Pre-shim ALL_SIGNALS (verbatim)
+        "skill: api",
+        "skill:api",
+        "skill: stackhawk-api",
+        # Evaluation-format variants
+        "stackhawk-api:api`: yes",
+        "stackhawk-api:api` — yes",
+        "stackhawk-api:api: yes",
+        "stackhawk-api:api — yes",
+    ],
+    "stackhawk-data-seed": [
+        "skill: stackhawk-data-seed",
+        "skill:stackhawk-data-seed",
+        "stackhawk-data-seed:stackhawk-data-seed`: yes",
+        "stackhawk-data-seed:stackhawk-data-seed: yes",
+        "stackhawk-data-seed:stackhawk-data-seed — yes",
+        "stackhawk-data-seed: yes", "stackhawk-data-seed — yes",
+        "seed data for hawkscan", "seed this repo", "minimum seed entities",
+        "data seed complete", "data-seed/manifest",
+    ],
+}
+
+# Pre-shim default --print-timeout was 180s; raised to 240s for safety.
+PRINT_TIMEOUT = "240s"
+
+# Prompt suffix: in --print mode agy hangs on tool approvals, so every prompt
+# gets a suffix asking the agent to declare its skill choice up front — that
+# declaration is what explicit_decision + INVOCATION_SIGNALS detect. Without it,
+# live agy runs produce no detectable trigger text (all false-negatives). The
+# pre-shim harness (5472ed2~1:evals/harnesses/agy/run-evals.py) carried its own
+# suffix; agy now uses the shared per-skill observe suffix (evals/lib/observe.py),
+# aligning its declaration format and workflow-enumeration ask with the others.
+
+
+def parse_stream(raw: str) -> ParsedRun:
+    """Wrap agy's plain-text output, whitespace-stripped, as ParsedRun.output_text; no commands are parsed."""
+    return ParsedRun(output_text=raw.strip())
+
+
+class AgyAdapter:
+    """Harness adapter for the agy CLI: launches `agy -p` and detects triggers from its text."""
+    platform = "agy"
+
+    def cli_signals(self, skill: str) -> list[str]:
+        return CLI_SIGNALS.get(skill, [])
+
+    def invocation_signals(self, skill: str) -> list[str]:
+        return INVOCATION_SIGNALS.get(skill, [])
+
+    def parse_stream(self, raw: str) -> ParsedRun:
+        return parse_stream(raw)
+
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
+        # agy is text-only; CLI signals may appear in prose too, so check both
+        # lists against the combined text. An explicit decline still overrides a
+        # loose phrase match (e.g. the agent quoting a "don't scan" instruction).
+        hay = (" ".join(run.bash_commands) + " " + run.output_text).lower()
+        cli_hit = any(s.lower() in hay for s in self.cli_signals(skill))
+        loose = any(s.lower() in hay for s in self.invocation_signals(skill))
+        return decide_trigger(executed_cli=cli_hit,
+                              declared=explicit_decision(run.output_text, skill),
+                              loose_hit=loose)
+
+    def launch(
+        self,
+        prompt: str,
+        skill: str,
+        run_id: str,
+        plugin_dirs: list[str],
+        *,
+        model: str | None,
+        load_skill: bool,
+        max_budget: float,
+        bare: bool,
+        full_auto: bool,
+    ) -> ParsedRun:
+        """Run `agy -p` on prompt + observe suffix in a temp cwd; a timeout returns error="timeout".
+        Skills are installed globally via `agy plugin install` in CI, so load_skill is a no-op."""
+        tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
+        try:
+            # --print mode hangs on tool approvals; the suffix makes agy declare
+            # its skill choice up front so detect_trigger has text to match. agy is
+            # text-only (no real execution), so observe mode is its only mode.
+            effective_prompt = prompt + observe_suffix(skill)
+            cmd = ["agy", "-p", effective_prompt, "--print-timeout", PRINT_TIMEOUT]
+            if model:
+                cmd += ["--model", model]
+            try:
+                proc = subprocess.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    timeout=420,  # outer wall-clock cap, above agy's own PRINT_TIMEOUT
+                    cwd=tmpdir,
+                )
+            except subprocess.TimeoutExpired:
+                return ParsedRun(error="timeout")
+            run = parse_stream(proc.stdout)
+            run.returncode = proc.returncode
+            run.stderr_tail = (proc.stderr or "")[-2000:]
+            # agy has no non-interactive auth (OAuth only; upstream
+            # google-antigravity/antigravity-cli#78): a browser-less CI runner prints an auth
+            # URL and times out. Label that distinctly so the digest doesn't blame our plumbing.
+            blob = (run.output_text + " " + run.stderr_tail).lower()
+            if "authentication required" in blob or "authentication timed out" in blob:
+                run.error = "agy: no headless auth (upstream antigravity-cli#78) — not runnable in CI"
+            elif proc.returncode != 0 and not run.error:
+                run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}"
+            elif not run.output_text and not run.bash_commands and not run.error:
+                run.error = f"empty output (exit {proc.returncode})"
+            return run
+        finally:
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+ADAPTER = AgyAdapter()
diff --git a/evals/harnesses/agy/run-evals.py b/evals/harnesses/agy/run-evals.py
index c485b1d..52d7fd7 100644
--- a/evals/harnesses/agy/run-evals.py
+++ b/evals/harnesses/agy/run-evals.py
@@ -1,375 +1,11 @@
 #!/usr/bin/env python3
-"""
-Antigravity (agy) eval harness for StackHawk agent skills.
-
-Uses `agy -p --print-timeout` (headless mode). Skills are installed via:
-    agy plugin install /path/to/agent-skills/plugins/hawkscan
-    agy plugin install /path/to/agent-skills/plugins/api
-
-agy outputs plain text (no --output-format stream-json), so trigger detection
-scans the full text output for CLI signals and skill-invocation phrases.
-
-Usage:
-    python3 evals/harnesses/agy/run-evals.py --skill hawkscan
-    python3 evals/harnesses/agy/run-evals.py --skill api
-    python3 evals/harnesses/agy/run-evals.py --skill hawkscan --id hw-07
-    python3 evals/harnesses/agy/run-evals.py --skill hawkscan --dry-run
-
-Requirements:
-    - agy CLI installed and authenticated
-    - StackHawk plugins installed:
-        agy plugin install /path/to/agent-skills/plugins/hawkscan
-        agy plugin install /path/to/agent-skills/plugins/api
-    - Run from the agent-skills repo root
-
-Known limitations:
-    - agy connects to a shared server process. Background tasks from your
-      main agy session can bleed into eval runs — run evals when your main
-      agy session is idle.
-    - Some contextual prompts take >180s; use --print-timeout to increase.
-    - Process check scores will be low (agy in print mode doesn't execute
-      full workflows).
-"""
-
-import argparse
-import csv
-import json
-import os
-import re
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness agy --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-import shutil
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR   = HARNESS_DIR.parent.parent
-REPO_ROOT   = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-
-# ---------------------------------------------------------------------------
-# Trigger signals
-# agy outputs plain text, so ALL signals are searched against output_text.
-# CLI_SIGNALS: hawk/hawkop commands that appear in agent's description of work.
-# INVOCATION_SIGNALS: phrases the agent uses when explicitly invoking a skill.
-# ---------------------------------------------------------------------------
-ALL_SIGNALS = {
-    # Explicit skill declarations injected by the OBSERVE_SUFFIX.
-    # The suffix asks the agent to state 'SKILL: hawkscan', 'SKILL: api', or 'SKILL: none'.
-    # This is far more reliable than inferring intent from CLI command mentions.
-    "hawkscan": [
-        "skill: hawkscan",
-        "skill:hawkscan",
-    ],
-    "api": [
-        "skill: api",
-        "skill:api",
-        "skill: stackhawk-api",
-    ],
-}
-
-# Negative signals — if these appear, the agent is explicitly NOT using the skill
-NEGATIVE_SIGNALS = {
-    "hawkscan": [
-        # Agent explicitly declines the scan
-        "i cannot run",
-        "i can't run",
-        "cannot perform a scan",
-        "not able to scan",
-        "no application to scan",
-    ],
-    "api": [],
-}
-
-
-# ---------------------------------------------------------------------------
-# Text parsing — agy outputs plain text, not JSONL
-# ---------------------------------------------------------------------------
-
-def parse_output(text: str) -> dict:
-    return {
-        "bash_commands":  [],   # no JSON tool calls in agy text mode
-        "files_written":  [],
-        "output_text":    text.strip(),
-        "usage":          {},
-        "error":          None,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Trigger detection — text-only approach
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    haystack = parsed["output_text"].lower()
-    if not haystack:
-        return False
-    return any(s.lower() in haystack for s in ALL_SIGNALS.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Process checks
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"]).lower()
-
-    results = []
-    for check in checks:
-        ctype   = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis   = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit   = next((a for a in antis   if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            m = re.search(r"'([^']+)'", check.get("condition", ""))
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id":           check["id"],
-            "pass":         passed,
-            "severity":     check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found":   anti_hit,
-        })
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    return {
-        "total":           len(results),
-        "passed":          sum(1 for r in results if r["pass"]),
-        "blocking_failed": blocking_failed,
-        "warning_failed":  warning_failed,
-        "score":           max(0, 100 - blocking_failed * 15 - warning_failed * 5),
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run agy
-# ---------------------------------------------------------------------------
-
-OBSERVE_SUFFIX = (
-    "\n\n(Eval mode: before responding, state which skill you would invoke: "
-    "'SKILL: hawkscan', 'SKILL: api', or 'SKILL: none'. Then proceed with your response.)"
-)
-
-
-def run_agy(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    model: str | None = None,
-    print_timeout: str = "120s",
-    observe: bool = True,
-) -> tuple[dict, int]:
-    # In observe mode, append a suffix so agy describes its plan without
-    # blocking on tool call approvals (which hang forever in --print mode).
-    effective_prompt = prompt + OBSERVE_SUFFIX if observe else prompt
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_"))
-    try:
-        cmd = ["agy", "-p", effective_prompt, "--print-timeout", print_timeout]
-        if model:
-            cmd += ["--model", model]
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=int(print_timeout.rstrip("s")) + 30,
-            cwd=str(tmpdir),
-            env={**os.environ},
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.txt").write_text(proc.stdout)
-
-        parsed = parse_output(proc.stdout)
-        if proc.returncode != 0 and not parsed["output_text"]:
-            stderr = proc.stderr.strip()
-            if stderr:
-                parsed["error"] = stderr[:300]
-
-        return parsed, proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {"bash_commands": [], "files_written": [], "output_text": "",
-                "usage": {}, "error": "timeout"}, 1
-    except FileNotFoundError:
-        print("ERROR: 'agy' CLI not found.", file=sys.stderr)
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Antigravity (agy) eval harness for StackHawk agent skills",
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Model override (passed to agy --model)")
-    parser.add_argument("--print-timeout", default="180s",
-                        help="Per-prompt timeout for agy (default: 180s)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: agy  |  Mode: observe{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no agy calls]")
-    print("─" * 68)
-
-    all_results = []
-
-    for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_agy(
-            prompt, skill, run_id,
-            model=args.model,
-            print_timeout=args.print_timeout,
-            observe=True,
-        )
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger = detect_trigger(parsed, skill)
-        trigger_ok  = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        result = {
-            "platform":        "agy",
-            "skill":           skill,
-            "run_id":          run_id,
-            "prompt":          prompt,
-            "should_trigger":  should_trigger,
-            "did_trigger":     did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands":   parsed["bash_commands"],
-            "files_written":   parsed["files_written"],
-            "process_checks":  process_results,
-            "scoring":         scoring,
-            "timestamp":       datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon    = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  {score_str}")
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total = len(all_results)
-    false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    proc_runs = [r for r in all_results if r["process_checks"]]
-    avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs)
-                 if proc_runs else None)
-    total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=agy")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill": skill, "platform": "agy",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives":  [r["run_id"] for r in false_pos],
-        "false_negatives":  [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "runs": [{"run_id": r["run_id"], "trigger_correct": r["trigger_correct"],
-                  "score": r["scoring"]["score"]} for r in all_results],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    if "--harness" not in sys.argv:  # legacy invocations never passed --harness; default it
+        sys.argv += ["--harness", "agy"]
     main()
diff --git a/evals/harnesses/claude-code/README.md b/evals/harnesses/claude-code/README.md
index e84b0c3..b0246ae 100644
--- a/evals/harnesses/claude-code/README.md
+++ b/evals/harnesses/claude-code/README.md
@@ -5,71 +5,65 @@ Runs the StackHawk skill eval suite against Claude Code's non-interactive CLI (`
 ## Prerequisites
 
 - **Claude Code CLI** installed and authenticated: `claude --version`
-- **Python 3.11+**: `python3 --version`
+- **Python 3.11+** with `uv`: `uv run evals --help`
 - Run from the **agent-skills repo root** (plugin dirs are auto-detected)
 
-## How it works
+## Invocation
 
-For each row in `evals/<skill>/prompts.csv`:
+```bash
+# Run all prompts for a skill (preferred)
+uv run evals --harness claude-code --skill hawkscan
+uv run evals --harness claude-code --skill api
 
-1. Runs `claude -p "<prompt>" --output-format stream-json --plugin-dir plugins/<skill>`
-   in a fresh temp directory (isolated, no state leakage between runs)
-2. Parses the JSONL event stream to extract bash commands, files written, and output text
-3. Detects whether the skill triggered (skill-specific command patterns in the trace)
-4. If the skill should have triggered and did: runs deterministic checks from
-   `evals/<skill>/process-checks.json` against the captured trace
-5. Saves `results/<skill>/<run-id>.jsonl` (raw trace) and `results/<skill>/<run-id>.result.json` (scored)
+# Run a specific model
+uv run evals --harness claude-code --skill hawkscan --model claude-haiku-4-5-20251001
 
-Optionally, `--rubric` runs a second `claude -p` call as a qualitative grader, using
-`evals/<skill>/rubric-items.json` and enforcing `evals/rubric-schema.json` via `--json-schema`.
+# Cap spend per run (default: $0.20)
+uv run evals --harness claude-code --skill hawkscan --max-budget 0.10
 
-## Usage
+# Full-auto mode: agent executes commands (--dangerously-skip-permissions)
+uv run evals --harness claude-code --skill hawkscan --full-auto
 
-```bash
-# Run all prompts for a skill
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan
-python3 evals/harnesses/claude-code/run-evals.py --skill api
+# Suppress progress UI and pass --bare to claude (ANTHROPIC_API_KEY auth only, no keychain; used in CI)
+uv run evals --harness claude-code --skill hawkscan --bare
+```
 
-# Run a single prompt by ID
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --id hw-07
+`run-evals.py` in this directory is a back-compat shim that forwards to `uv run evals --harness claude-code`. Use the `uv run evals` form going forward.
 
-# Dry run — print prompts without calling claude
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --dry-run
+## Config source
 
-# Full-auto mode: agent can actually execute commands (--dangerously-skip-permissions)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --full-auto
+Prompts and trigger labels are loaded from `evals/<skill>/prompts.yaml` (not prompts.csv — the CSV was removed during the YAML migration). Process checks come from `evals/<skill>/process-checks.json`.
 
-# Also run the qualitative rubric grader (extra cost + ~30s per run)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --rubric
+## How it works
 
-# Cap spend per run (default: $0.20)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --max-budget 0.10
-```
+For each prompt in `evals/<skill>/prompts.yaml`:
+
+1. `ClaudeCodeAdapter.launch()` runs `claude -p "<prompt>" --output-format stream-json --plugin-dir plugins/<skill>` in a fresh temp directory (isolated, no state leakage between runs). The raw stdout is parsed in-memory; no raw `.jsonl` file is persisted.
+2. `parse_stream()` extracts bash commands, files written/edited, output text, and cost from the JSONL event stream.
+3. `detect_trigger()` checks whether the skill triggered using CLI command signals (e.g. `hawk scan`) in the captured bash commands, the agent's explicit skill decision in the output text, and invocation-phrase signals in the output text.
+4. If the skill should have triggered and did, process checks from `process-checks.json` are run against the captured trace.
+5. A verdict (`pass`, `pass-slow`, or `fail`) is assigned and an `EvalResult` is written to `results/<skill>/<run-id>.result.json`.
 
 ## Two modes
 
 ### Observe mode (default)
 
-The agent runs normally but permissions are not bypassed. It will plan and narrate what
-it would do — including bash commands it intends to execute — without necessarily
-running them. Trigger detection and most process checks work because the agent names
-the commands in its output even when execution is blocked.
+Permissions are not bypassed. The agent plans and narrates what it would do — including bash commands it intends to run — without necessarily executing them. Trigger detection and most process checks still work because the agent names the commands in its output.
 
-**Use for:** trigger accuracy checks, output quality checks, rubric grading.
+**Use for:** trigger accuracy checks, output quality checks, CI.
 
 ### Full-auto mode (`--full-auto`)
 
-Passes `--dangerously-skip-permissions` so the agent can actually execute bash commands,
-write files, and run `hawk` CLI calls. Results are more accurate for process checks that
-require real execution (e.g. `hawk validate config` was actually run and passed).
+Passes `--dangerously-skip-permissions` so the agent can actually execute bash commands, write files, and run `hawk` CLI calls. Results are more accurate for process checks that require real execution.
 
-**Use for:** end-to-end process verification when `hawk` CLI is installed and a target app
-is available. Run in a trusted, isolated environment — not on a production machine.
+**Use for:** end-to-end process verification when `hawk` CLI is installed and a target app is available. Run in a trusted, isolated environment.
 
 ## Understanding results
 
 ### Per-run result file (`results/<skill>/<run-id>.result.json`)
 
+Conforms to the `EvalResult` Pydantic model (`evals/lib/models.py`):
+
 ```json
 {
   "platform": "claude-code",
@@ -78,67 +72,51 @@ is available. Run in a trusted, isolated environment — not on a production mac
   "should_trigger": true,
   "did_trigger": true,
   "trigger_correct": true,
-  "bash_commands": ["hawk version", "hawkop app list", "hawk validate config stackhawk.yml", "hawk scan --json-output"],
-  "files_written": ["stackhawk.yml"],
+  "verdict": "pass",
+  "budget_breaches": [],
   "process_checks": [
-    { "id": "preflight_version_check", "pass": true, "severity": "blocking", "signal_found": "hawk version" },
-    { "id": "step2_no_local_yml_created", "pass": true, "severity": "blocking", "signal_found": null }
+    { "id": "preflight_version_check", "passed": true, "severity": "blocking", "signal_found": "hawk version", "anti_found": null },
+    { "id": "step2_no_local_yml_created", "passed": true, "severity": "blocking", "signal_found": null, "anti_found": null }
   ],
-  "scoring": {
-    "total": 22,
-    "passed": 20,
-    "blocking_failed": 1,
-    "warning_failed": 1,
-    "score": 80
-  },
-  "rubric_result": null,
+  "score": 100,
   "cost_usd": 0.048
 }
 ```
 
 ### Summary file (`results/<skill>/summary.json`)
 
-Written after a full run. Tracks trigger accuracy, process score, false positives/negatives,
-and per-run scores — useful for comparing skill versions over time.
+Written after a full run. Tracks trigger accuracy, process score, false positives/negatives, and per-run scores.
 
 ### Scoring
 
-| Check type | Deduction per failure |
+| Check type  | Deduction per failure |
 |---|---|
-| `blocking` | −15 points |
-| `warning` | −5 points |
+| `blocking`  | −15 points |
+| `warning`   | −5 points |
 
-`overall_pass` in rubric results requires score ≥ 70 and zero blocking failures.
+Verdict is `pass` if trigger is correct and score ≥ 70 with zero blocking failures; `pass-slow` if correct but over budget; `fail` otherwise.
 
 ### Process checks only run when the skill should have triggered and did
 
-If `should_trigger=false` and the skill correctly did not fire, no process checks run —
-there is no workflow to grade. The run scores as a trigger-accuracy pass only.
+If `should_trigger=false` and the skill correctly did not fire, no process checks run — there is no workflow to grade.
 
-## Raw traces
+## adapter.py
 
-Each run saves the raw `claude --output-format stream-json` JSONL to
-`results/<skill>/<run-id>.jsonl`. Open it to debug false negatives or unexpected behavior:
+`ClaudeCodeAdapter` (`adapter.py`) implements the `HarnessAdapter` protocol for this platform:
 
-```bash
-# See all bash commands the agent attempted
-jq -r 'select(.type=="assistant") | .message.content[] | select(.type=="tool_use" and .name=="Bash") | .input.command' \
-  results/hawkscan/hw-07.jsonl
-```
+- `parse_stream(raw)` — parses `claude --output-format stream-json` JSONL into a `ParsedRun`
+- `detect_trigger(run, skill)` — checks CLI command signals, the agent's explicit decision line, and invocation-phrase signals
+- `launch(prompt, skill, run_id, ...)` — spawns `claude -p` in a temp directory, captures stdout in-memory, and returns a `ParsedRun`
 
 ## CI usage
 
-The harness exits non-zero if trigger accuracy falls below 100% or any blocking check
-fails. Wire it into CI after bumping a skill version to catch regressions:
-
 ```yaml
 - name: Run skill evals
-  run: |
-    python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan
-    python3 evals/harnesses/claude-code/run-evals.py --skill api
   env:
     ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+  run: |
+    uv run evals --harness claude-code --skill hawkscan --bare --max-budget 0.15
+    uv run evals --harness claude-code --skill api --bare --max-budget 0.15
 ```
 
-Note: CI runs are in observe mode by default (no `--full-auto`), which avoids needing
-a live `hawk` CLI or running application. Add `--full-auto` only in a dedicated sandbox.
+CI runs use observe mode by default (no `--full-auto`), which avoids needing a live `hawk` CLI or running application.
diff --git a/evals/harnesses/claude-code/adapter.py b/evals/harnesses/claude-code/adapter.py
new file mode 100644
index 0000000..3c70b79
--- /dev/null
+++ b/evals/harnesses/claude-code/adapter.py
@@ -0,0 +1,150 @@
+"""claude-code Harness adapter. Parsing + signal lists ported from run-evals.py."""
+from __future__ import annotations
+import json
+import shutil
+import subprocess
+import tempfile
+
+from evals.lib.models import ParsedRun
+from evals.lib.triggers import explicit_decision, decide_trigger
+from evals.lib.observe import observe_suffix
+
+CLI_SIGNALS = {  # skill -> substrings matched case-insensitively in captured bash commands
+    # Scan-distinctive commands only. `hawk version`/`hawk config`/`hawk init` are
+    # generic preflight an agent runs while merely *assessing* the environment (even
+    # for a non-scan request), so they over-trigger; rely on scan commands or the
+    # explicit decision line instead.
+    "hawkscan": ["hawk scan", "hawk validate", "hawk rescan",
+                 "hawk create app", "hawk perch"],
+    "api": ["hawkop scan", "hawkop app", "hawkop org", "hawkop env", "hawkop status",
+            "hawkop init", "/api/v1/scan", "/api/v2/org", "hawk_api GET"],
+    # data-seed emits checked-in artifacts rather than running a distinctive CLI;
+    # its discovery + emission paths are the executable signals.
+    "stackhawk-data-seed": ["data-seed/", "data-seed/manifest", ".data-seed-credentials",
+                            "manifest.yaml"],
+}
+
+INVOCATION_SIGNALS = {  # skill -> phrases matched case-insensitively in the agent's output text
+    "hawkscan": [
+        "hawkscan:hawkscan`: yes", "hawkscan:hawkscan` — yes", "hawkscan:hawkscan**: yes",
+        "hawkscan:hawkscan** — yes", "hawkscan:hawkscan: yes", "hawkscan:hawkscan — yes",
+        "hawkscan:hawkscan - yes", "hawkscan:hawkscan - **yes", "hawkscan**: yes",
+        "hawkscan** — yes", "hawkscan** - yes", "hawkscan: yes", "hawkscan — yes",
+        "hawkscan - yes", "autonomous security scan", "dast scan after code",
+        "dast scan triggered", "dast scan required", "security scan required",
+        "security scan after", "run the security scan", "running the hawkscan",
+    ],
+    "api": [
+        "stackhawk-api:api`: yes", "stackhawk-api:api` — yes", "stackhawk-api:api**: yes",
+        "stackhawk-api:api** — yes", "stackhawk-api:api: yes", "stackhawk-api:api — yes",
+        "stackhawk-api:api - yes", "stackhawk-api**: yes", "stackhawk-api** — yes",
+        "stackhawk-api** - yes", "stackhawk-api: yes", "stackhawk-api — yes",
+        "stackhawk-api - yes",
+    ],
+    "stackhawk-data-seed": [
+        "stackhawk-data-seed:stackhawk-data-seed`: yes",
+        "stackhawk-data-seed:stackhawk-data-seed` — yes",
+        "stackhawk-data-seed:stackhawk-data-seed**: yes",
+        "stackhawk-data-seed:stackhawk-data-seed** — yes",
+        "stackhawk-data-seed:stackhawk-data-seed: yes",
+        "stackhawk-data-seed:stackhawk-data-seed — yes",
+        "stackhawk-data-seed:stackhawk-data-seed - yes",
+        "stackhawk-data-seed**: yes", "stackhawk-data-seed** — yes",
+        "stackhawk-data-seed** - yes", "stackhawk-data-seed: yes",
+        "stackhawk-data-seed — yes", "stackhawk-data-seed - yes",
+        "seed data for hawkscan", "seed this repo", "minimum seed entities",
+        "seed entities required", "data seed complete", "data-seed/manifest",
+    ],
+}
+
+# Observe-mode suffix is shared across all harnesses (per-skill). See
+# evals/lib/observe.py for the rationale and wording.
+
+
+def parse_stream(raw: str) -> ParsedRun:
+    """Parse `claude --output-format stream-json` JSONL output into a ParsedRun."""
+    bash, written, edited, text, cost, err = [], [], [], "", 0.0, None
+    for line in raw.splitlines():
+        try:  # blank lines raise JSONDecodeError too, so they are skipped here
+            event = json.loads(line.strip())
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(event, dict):  # valid JSON but not an event object (e.g. `42`, `[]`)
+            continue
+        etype = event.get("type", "")
+        if etype == "assistant":
+            for block in event.get("message", {}).get("content", []):
+                bt = block.get("type", "")
+                if bt == "text":
+                    text += block.get("text", "") + "\n"
+                elif bt == "tool_use":
+                    name, inp = block.get("name", ""), block.get("input", {})
+                    if name == "Bash" and inp.get("command"):
+                        bash.append(inp["command"])
+                    elif name == "Write" and inp.get("file_path"):
+                        written.append(inp["file_path"])
+                    elif name == "Edit" and inp.get("file_path"):
+                        edited.append(inp["file_path"])
+        elif etype == "result":
+            cost = event.get("total_cost_usd") or event.get("cost_usd") or 0.0
+            text += event.get("result") or ""  # `result` may be missing or null
+            if event.get("subtype") == "error_during_execution":
+                err = event.get("result") or "unknown error"  # keep the error truthy
+    return ParsedRun(bash_commands=bash, files_written=written, files_edited=edited,
+                     output_text=text.strip(), cost_usd=cost, error=err)
+
+
+class ClaudeCodeAdapter:  # HarnessAdapter implementation for the `claude` CLI
+    platform = "claude-code"
+
+    def cli_signals(self, skill: str) -> list[str]: return CLI_SIGNALS.get(skill, [])
+    def invocation_signals(self, skill: str) -> list[str]: return INVOCATION_SIGNALS.get(skill, [])
+    def parse_stream(self, raw: str) -> ParsedRun: return parse_stream(raw)
+
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
+        cli = " ".join(run.bash_commands).lower()  # CLI signals are matched against bash commands only
+        executed = any(s.lower() in cli for s in self.cli_signals(skill))
+        text = run.output_text.lower()  # invocation phrases are matched against output text only
+        loose = any(s.lower() in text for s in self.invocation_signals(skill))
+        return decide_trigger(executed_cli=executed,
+                              declared=explicit_decision(run.output_text, skill),
+                              loose_hit=loose)
+
+    def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+               max_budget, bare, full_auto) -> ParsedRun:
+        tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")  # per-run cwd; removed in `finally`
+        try:
+            # Observe mode (default): append the shared observe suffix so the agent
+            # declares + outlines its workflow. Full-auto/extended runs against a real
+            # target execute for real, so they get the prompt without the suffix.
+            effective_prompt = prompt if full_auto else prompt + observe_suffix(skill)
+            cmd = ["claude", "-p", effective_prompt, "--output-format", "stream-json",
+                   "--verbose", "--no-session-persistence",
+                   "--max-budget-usd", str(max_budget)]
+            if model:
+                cmd += ["--model", model]
+            if load_skill:  # plugin dirs are only passed when the skill should be loaded
+                for pd in plugin_dirs:
+                    cmd += ["--plugin-dir", pd]
+            if full_auto:
+                cmd.append("--dangerously-skip-permissions")
+            if bare:  # forwarded verbatim to the claude CLI
+                cmd.append("--bare")
+            try:
+                proc = subprocess.run(cmd, capture_output=True, text=True,
+                                      timeout=300, cwd=tmpdir)
+            except subprocess.TimeoutExpired:
+                return ParsedRun(error="timeout")  # any partial output is discarded
+            run = parse_stream(proc.stdout)
+            run.returncode = proc.returncode
+            run.stderr_tail = (proc.stderr or "")[-2000:]  # keep only the last 2000 chars
+            if proc.returncode != 0 and not run.error:  # a parsed stream error takes precedence
+                run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}"
+            elif not run.output_text and not run.bash_commands and not run.error:
+                run.error = f"empty output (exit {proc.returncode})"
+            return run
+        finally:
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+ADAPTER = ClaudeCodeAdapter()  # module-level adapter instance
diff --git a/evals/harnesses/claude-code/run-evals.py b/evals/harnesses/claude-code/run-evals.py
index 6d8679f..9489d2b 100644
--- a/evals/harnesses/claude-code/run-evals.py
+++ b/evals/harnesses/claude-code/run-evals.py
@@ -1,650 +1,11 @@
 #!/usr/bin/env python3
-"""
-Claude Code eval harness for StackHawk agent skills.
-
-Usage:
-    python3 run-evals.py --skill hawkscan          # all prompts
-    python3 run-evals.py --skill api               # all prompts
-    python3 run-evals.py --skill hawkscan --id hw-07    # single prompt
-    python3 run-evals.py --skill hawkscan --dry-run     # print prompts, no claude calls
-    python3 run-evals.py --skill hawkscan --full-auto   # allow agent to execute commands
-    python3 run-evals.py --skill hawkscan --rubric      # also run qualitative rubric grader
-    python3 run-evals.py --skill hawkscan --bare        # CI mode: ANTHROPIC_API_KEY only, no keychain
-
-Requirements:
-    - claude CLI installed and authenticated (https://claude.ai/code)
-    - Run from the agent-skills repo root (plugin dirs are auto-detected)
-
-Output:
-    evals/harnesses/claude-code/results/<skill>/<run-id>.jsonl       raw trace
-    evals/harnesses/claude-code/results/<skill>/<run-id>.result.json scored result
-    evals/harnesses/claude-code/results/<skill>/summary.json         run summary
-"""
-
-import argparse
-import csv
-import json
-import os
-import shutil
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness claude-code --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR = HARNESS_DIR.parent.parent
-REPO_ROOT = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-
-# ---------------------------------------------------------------------------
-# Trigger signals
-# Any of these appearing in bash commands or output text means the skill fired.
-# ---------------------------------------------------------------------------
-# CLI signals — checked against bash_commands only (prevents documentation content
-# from creating false positives when the agent writes README/guides about HawkScan).
-CLI_SIGNALS = {
-    "hawkscan": [
-        "hawk scan",
-        "hawk validate",
-        "hawk rescan",
-        # "hawk version" intentionally excluded: running 'hawk version' alone is common
-        # for installation-check tasks and would cause false positives. The preflight
-        # workflow always runs 'hawk config --help' in the same command, so 'hawk config'
-        # below is sufficient to distinguish scan-intent from install-check tasks.
-        "hawk config",
-        "hawk create app",
-        "hawk init",
-        "hawk perch",
-    ],
-    "api": [
-        "hawkop scan",
-        "hawkop app",
-        "hawkop org",
-        "hawkop env",
-        "hawkop status",
-        "hawkop init",
-        "/api/v1/scan",
-        "/api/v2/org",
-        "hawk_api GET",
-    ],
-}
-
-# Invocation signals — checked against output_text only. Catches contextual prompts
-# where the agent correctly identifies the skill should trigger and says so explicitly,
-# but can't reach the CLI workflow (empty working dir, no running app, etc.).
-#
-# These are intentionally specific to action-intent phrases, NOT the generic
-# "hawkscan:hawkscan: yes" pattern (which also fires on educational/informational
-# responses where the agent answers "what does HawkScan detect?" type questions).
-INVOCATION_SIGNALS = {
-    "hawkscan": [
-        # Generic YES-evaluation signals — catch any run where the agent explicitly
-        # evaluates hawkscan as YES regardless of phrasing. Models vary in their markdown
-        # formatting: backtick (`` `hawkscan:hawkscan` ``), bold (**hawkscan:hawkscan**),
-        # or plain text. Each produces a different character sequence around `: YES`.
-        # Safe because SKILL.md now instructs NO for educational questions (hw-20),
-        # doc-only changes (hw-16/17/18), installation tasks (hw-19), and explicit skips.
-        "hawkscan:hawkscan`: yes",   # "`hawkscan:hawkscan`: YES" — backtick + colon (Sonnet/Haiku)
-        "hawkscan:hawkscan` — yes",  # "`hawkscan:hawkscan` — YES" — backtick + em-dash
-        "hawkscan:hawkscan**: yes",  # "**hawkscan:hawkscan**: YES" — bold + colon
-        "hawkscan:hawkscan** — yes", # "**hawkscan:hawkscan** — YES" — bold + em-dash
-        "hawkscan:hawkscan: yes",    # "hawkscan:hawkscan: YES" — plain colon
-        "hawkscan:hawkscan — yes",   # "hawkscan:hawkscan — YES" — em-dash
-        "hawkscan:hawkscan - yes",   # "hawkscan:hawkscan - YES" — plain hyphen (Opus 4.7)
-        "hawkscan:hawkscan - **yes", # "hawkscan:hawkscan - **YES**" — bold YES (Opus 4.7)
-        # Plugin name only — Opus 4.7 sometimes omits :hawkscan suffix
-        "hawkscan**: yes",           # "**hawkscan**: YES" — bold, no skill suffix
-        "hawkscan** — yes",          # bold + em-dash, no skill suffix
-        "hawkscan** - yes",          # "**hawkscan:hawkscan** - YES" — bold name + hyphen (Opus)
-        "hawkscan: yes",             # plain colon, no skill suffix
-        "hawkscan — yes",            # em-dash, no skill suffix
-        "hawkscan - yes",            # plain hyphen, no skill suffix
-        # Specific action-intent phrases as belt-and-suspenders for unusual formats
-        "autonomous security scan",
-        "dast scan after code",
-        "dast scan triggered",
-        "dast scan required",
-        "security scan required",
-        "security scan after",
-        "run the security scan",
-        "running the hawkscan",
-    ],
-    "api": [
-        # Full skill name (plugin:skill) — Sonnet/Haiku format
-        "stackhawk-api:api`: yes",   # backtick + colon
-        "stackhawk-api:api` — yes",  # backtick + em-dash
-        "stackhawk-api:api**: yes",  # bold + colon
-        "stackhawk-api:api** — yes", # bold + em-dash
-        "stackhawk-api:api: yes",    # plain colon
-        "stackhawk-api:api — yes",   # em-dash
-        "stackhawk-api:api - yes",   # plain hyphen (Opus 4.7)
-        # Plugin name only (Opus 4.7 sometimes omits :api suffix)
-        "stackhawk-api**: yes",      # bold + colon, no skill suffix
-        "stackhawk-api** — yes",     # bold + em-dash, no skill suffix
-        "stackhawk-api** - yes",     # bold + plain hyphen, no skill suffix (Opus)
-        "stackhawk-api: yes",        # plain colon, no skill suffix
-        "stackhawk-api — yes",       # em-dash, no skill suffix
-        "stackhawk-api - yes",       # plain hyphen, no skill suffix
-    ],
-}
-
-# ---------------------------------------------------------------------------
-# Stream-json parsing
-# ---------------------------------------------------------------------------
-
-def parse_stream(jsonl: str) -> dict:
-    """Extract structured data from a claude --output-format stream-json run."""
-    bash_commands: list[str] = []
-    files_written: list[str] = []
-    files_edited: list[str] = []
-    output_text = ""
-    cost_usd = 0.0
-    error = None
-
-    for line in jsonl.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            event = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        etype = event.get("type", "")
-
-        if etype == "assistant":
-            for block in event.get("message", {}).get("content", []):
-                btype = block.get("type", "")
-                if btype == "text":
-                    output_text += block.get("text", "") + "\n"
-                elif btype == "tool_use":
-                    name = block.get("name", "")
-                    inp = block.get("input", {})
-                    if name == "Bash":
-                        cmd = inp.get("command", "")
-                        if cmd:
-                            bash_commands.append(cmd)
-                    elif name == "Write":
-                        path = inp.get("file_path", "")
-                        if path:
-                            files_written.append(path)
-                    elif name == "Edit":
-                        path = inp.get("file_path", "")
-                        if path:
-                            files_edited.append(path)
-
-        elif etype == "result":
-            cost_usd = event.get("cost_usd") or 0.0
-            output_text += event.get("result", "")
-            if event.get("subtype") == "error_during_execution":
-                error = event.get("result", "unknown error")
-
-    return {
-        "bash_commands": bash_commands,
-        "files_written": files_written,
-        "files_edited": files_edited,
-        "output_text": output_text.strip(),
-        "cost_usd": cost_usd,
-        "error": error,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Trigger detection
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    # CLI signals are checked only against actual bash commands executed — prevents
-    # documentation content (README guides, educational answers) from triggering.
-    cli_haystack = " ".join(parsed["bash_commands"]).lower()
-    if any(s.lower() in cli_haystack for s in CLI_SIGNALS.get(skill, [])):
-        return True
-
-    # Invocation signals are checked only against output text — catches cases where
-    # the agent evaluated the skill as YES but couldn't run CLI commands (e.g. empty
-    # working dir, permission blocks on hawkop, no running app).
-    text_haystack = parsed["output_text"].lower()
-    return any(s.lower() in text_haystack for s in INVOCATION_SIGNALS.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Process checks
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"] + parsed["files_edited"]).lower()
-
-    results = []
-    for check in checks:
-        ctype = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit = next((a for a in antis if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            # Only enforce when the condition's keyword appears in the trace.
-            # Extract the keyword inside single quotes from the condition string,
-            # e.g. "stackhawk.yml contains 'authentication:'" → "authentication:"
-            import re as _re
-            condition_str = check.get("condition", "")
-            m = _re.search(r"'([^']+)'", condition_str)
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True  # condition not met — check is not applicable
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id": check["id"],
-            "pass": passed,
-            "severity": check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found": anti_hit,
-        })
-
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    score = max(0, 100 - blocking_failed * 15 - warning_failed * 5)
-    return {
-        "total": len(results),
-        "passed": sum(1 for r in results if r["pass"]),
-        "blocking_failed": blocking_failed,
-        "warning_failed": warning_failed,
-        "score": score,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run claude -p
-# ---------------------------------------------------------------------------
-
-def run_claude(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    plugin_dirs: list[str],
-    full_auto: bool = False,
-    bare: bool = False,
-    max_budget: float = 0.20,
-    model: str | None = None,
-) -> tuple[dict, int]:
-    # Each eval runs in a fresh temp dir so there is no state leakage.
-    tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
-    try:
-        cmd = [
-            "claude", "-p", prompt,
-            "--output-format", "stream-json",
-            "--verbose",
-            "--no-session-persistence",
-            "--max-budget-usd", str(max_budget),
-        ]
-        if model:
-            cmd += ["--model", model]
-        for pd in plugin_dirs:
-            cmd += ["--plugin-dir", pd]
-        if full_auto:
-            cmd.append("--dangerously-skip-permissions")
-        if bare:
-            cmd.append("--bare")
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=300,
-            cwd=tmpdir,
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout)
-
-        return parse_stream(proc.stdout), proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {
-            "bash_commands": [], "files_written": [], "files_edited": [],
-            "output_text": "", "cost_usd": 0.0, "error": "timeout",
-        }, 1
-    except FileNotFoundError:
-        print(
-            "ERROR: 'claude' CLI not found. "
-            "Install Claude Code (https://claude.ai/code) and ensure it is in PATH.",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Rubric grader (qualitative, model-assisted, optional)
-# ---------------------------------------------------------------------------
-
-def run_rubric_grader(
-    parsed: dict,
-    skill: str,
-    run_id: str,
-    plugin_dirs: list[str],
-    bare: bool = False,
-) -> dict | None:
-    rubric_path = EVALS_DIR / skill / "rubric-items.json"
-    schema_path = EVALS_DIR / "rubric-schema.json"
-    if not rubric_path.exists() or not schema_path.exists():
-        print("  [rubric] rubric-items.json or rubric-schema.json not found — skipping",
-              file=sys.stderr)
-        return None
-
-    rubric_data = json.loads(rubric_path.read_text())
-    schema = json.loads(schema_path.read_text())
-
-    grader_prompt = f"""{rubric_data['grader_prompt']}
-
-## Bash Commands Executed:
-{json.dumps(parsed['bash_commands'], indent=2)}
-
-## Files Written/Edited:
-{json.dumps(parsed['files_written'] + parsed['files_edited'], indent=2)}
-
-## Agent Output (first 4000 chars):
-{parsed['output_text'][:4000]}
-
-## Rubric Checks to Grade:
-{json.dumps(rubric_data['checks'], indent=2)}
-
-Populate the JSON result with:
-  skill = "{skill}"
-  run_id = "{run_id}"
-  overall_pass = true if all checks pass and score >= 70
-  score = 0-100
-  checks = one entry per check id listed above"""
-
-    cmd = [
-        "claude", "-p", grader_prompt,
-        "--output-format", "json",
-        "--no-session-persistence",
-        "--json-schema", json.dumps(schema),
-        "--max-budget-usd", "0.10",
-    ]
-    for pd in plugin_dirs:
-        cmd += ["--plugin-dir", pd]
-    if bare:
-        cmd.append("--bare")
-
-    try:
-        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
-        envelope = json.loads(proc.stdout)
-        # --output-format json wraps the response: {"result": "<json_string>", ...}
-        raw_result = envelope.get("result", "{}")
-        if isinstance(raw_result, dict):
-            return raw_result
-        return json.loads(raw_result)
-    except Exception as exc:
-        print(f"  [rubric] grader failed: {exc}", file=sys.stderr)
-        return None
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Claude Code eval harness for StackHawk agent skills",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID",
-                        help="Run a single prompt by id (e.g. hw-07)")
-    parser.add_argument("--dry-run", action="store_true",
-                        help="Print prompts without calling claude")
-    parser.add_argument("--rubric", action="store_true",
-                        help="Run qualitative rubric grader after process checks (extra cost + time)")
-    parser.add_argument("--full-auto", action="store_true",
-                        help="Pass --dangerously-skip-permissions so the agent can execute commands")
-    parser.add_argument("--bare", action="store_true",
-                        help="Pass --bare to claude: ANTHROPIC_API_KEY only, no keychain/hooks/CLAUDE.md (recommended for CI)")
-    parser.add_argument("--max-budget", type=float, default=0.20, metavar="USD",
-                        help="Max spend per eval run in USD (default: 0.20)")
-    parser.add_argument("--plugin-dir", action="append", dest="plugin_dirs",
-                        help="Plugin dir to load; auto-detected from repo root if omitted")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Override the Claude model (e.g. claude-haiku-4-5-20251001, claude-sonnet-4-6)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    plugin_dirs = args.plugin_dirs or [str(REPO_ROOT / "plugins" / skill)]
-
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    mode = "full-auto" if args.full_auto else "observe"
-    if args.bare:
-        mode += "+bare"
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: claude-code  |  Mode: {mode}{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no claude calls]")
-    print("─" * 68)
-
-    all_results = []
-    total_cost = 0.0
-
-    for row in prompts:
-        run_id        = row["id"]
-        prompt        = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype         = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_claude(
-            prompt, skill, run_id, plugin_dirs,
-            full_auto=args.full_auto,
-            bare=args.bare,
-            max_budget=args.max_budget,
-            model=args.model,
-        )
-        total_cost += parsed.get("cost_usd", 0.0)
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger  = detect_trigger(parsed, skill)
-        trigger_ok   = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        rubric_result = None
-        if args.rubric and should_trigger and did_trigger:
-            print("  [rubric] grading…", end=" ", flush=True)
-            rubric_result = run_rubric_grader(parsed, skill, run_id, plugin_dirs, bare=args.bare)
-            print(f"score={rubric_result.get('score', '?')}" if rubric_result else "failed")
-
-        result = {
-            "platform": "claude-code",
-            "skill": skill,
-            "run_id": run_id,
-            "prompt": prompt,
-            "should_trigger": should_trigger,
-            "did_trigger": did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands": parsed["bash_commands"],
-            "files_written": parsed["files_written"],
-            "process_checks": process_results,
-            "scoring": scoring,
-            "rubric_result": rubric_result,
-            "cost_usd": parsed.get("cost_usd", 0.0),
-            "timestamp": datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon   = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  {score_str}  ${parsed.get('cost_usd', 0):.3f}")
-
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    # ── Final summary ──────────────────────────────────────────────────────
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total = len(all_results)
-    false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    process_runs = [r for r in all_results if r["process_checks"]]
-    avg_score = (sum(r["scoring"]["score"] for r in process_runs) // len(process_runs)
-                 if process_runs else None)
-    total_blocking = (sum(r["scoring"]["blocking_failed"] for r in process_runs)
-                      if process_runs else 0)
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=claude-code")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Total cost       : ${total_cost:.3f}")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill": skill,
-        "platform": "claude-code",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives": [r["run_id"] for r in false_pos],
-        "false_negatives": [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "total_cost_usd": round(total_cost, 4),
-        "runs": [
-            {
-                "run_id": r["run_id"],
-                "trigger_correct": r["trigger_correct"],
-                "score": r["scoring"]["score"],
-                "cost_usd": r["cost_usd"],
-            }
-            for r in all_results
-        ],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    # ── GitHub Actions step summary ────────────────────────────────────────
-    step_summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
-    if step_summary_path:
-        _write_step_summary(
-            step_summary_path, skill, all_results,
-            false_pos, false_neg, avg_score, total_blocking, total_cost,
-        )
-
-    # ── Exit non-zero for CI on any regression ─────────────────────────────
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
-
-def _write_step_summary(
-    path: str,
-    skill: str,
-    results: list[dict],
-    false_pos: list[dict],
-    false_neg: list[dict],
-    avg_score: int | None,
-    total_blocking: int,
-    total_cost: float,
-) -> None:
-    correct = sum(1 for r in results if r["trigger_correct"])
-    total = len(results)
-    trigger_icon = "✅" if correct == total else "❌"
-    score_icon = "✅" if (avg_score or 0) >= 70 and total_blocking == 0 else "❌"
-
-    lines = [
-        f"## Skill Eval: `{skill}` (claude-code)\n",
-        "| Metric | Value |",
-        "|---|---|",
-        f"| Trigger accuracy | {trigger_icon} {correct}/{total} |",
-    ]
-    if false_pos:
-        lines.append(f"| False positives | ⚠️ {', '.join(r['run_id'] for r in false_pos)} |")
-    if false_neg:
-        lines.append(f"| False negatives | ⚠️ {', '.join(r['run_id'] for r in false_neg)} |")
-    if avg_score is not None:
-        lines.append(f"| Process avg score | {score_icon} {avg_score}/100 |")
-        lines.append(f"| Blocking failures | {'❌' if total_blocking else '✅'} {total_blocking} |")
-    lines.append(f"| Total cost | ${total_cost:.3f} |")
-    lines.append("")
-
-    # Per-run table
-    lines += [
-        "<details><summary>Per-run results</summary>\n",
-        "| ID | Trigger | Score | Cost |",
-        "|---|---|---|---|",
-    ]
-    for r in results:
-        t = "✅" if r["trigger_correct"] else "❌"
-        score = r["scoring"]["score"] if r["process_checks"] else "—"
-        lines.append(f"| {r['run_id']} | {t} | {score} | ${r['cost_usd']:.3f} |")
-    lines.append("\n</details>\n")
-
-    with open(path, "a") as f:
-        f.write("\n".join(lines) + "\n")
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    if not any(a.split("=", 1)[0] == "--harness" for a in sys.argv[1:]):  # also --harness=VALUE
+        sys.argv += ["--harness", "claude-code"]  # old invocations never named a harness
     main()
diff --git a/evals/harnesses/codex/adapter.py b/evals/harnesses/codex/adapter.py
new file mode 100644
index 0000000..7196d48
--- /dev/null
+++ b/evals/harnesses/codex/adapter.py
@@ -0,0 +1,185 @@
+"""codex Harness adapter. Parsing + signals ported from pre-shim run-evals.py."""
+from __future__ import annotations
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+
+from evals.lib.models import ParsedRun
+from evals.lib.triggers import explicit_decision, decide_trigger
+from evals.lib.observe import observe_suffix
+
+# CLI signals — matched case-insensitively as substrings of the executed bash_commands
+# only, so documentation the agent writes (README/guides about HawkScan) cannot trigger.
+CLI_SIGNALS = {
+    # Scan-distinctive commands only — generic preflight (hawk version/config/init)
+    # over-triggers when the agent merely assesses the environment for a non-scan
+    # request. Triggering falls back to the explicit decision line otherwise.
+    "hawkscan": [
+        "hawk scan",
+        "hawk validate",
+        "hawk rescan",
+        "hawk create app",
+        "hawk perch",
+    ],
+    # Signals specific to the api reporting workflow — avoids false positives
+    # from hawkop status/app/env commands that the hawkscan skill also runs.
+    "api": [
+        "hawkop scan get",     # api Step 4: app deep dive
+        "hawkop org get",      # api Step 1: establish orgId
+        "hawkop org set",      # api Step 1: switch org
+        "/api/v2/org",         # api Step 3: org posture endpoint (hawkop doesn't wrap it)
+        "/api/v1/scan",        # api Step 4: raw scan drill-down
+        "hawk_api GET",        # api raw API helper function
+    ],
+    # data-seed emits checked-in artifacts rather than a distinctive CLI, so these are paths.
+    "stackhawk-data-seed": ["data-seed/", "data-seed/manifest", ".data-seed-credentials",
+                            "manifest.yaml"],  # NOTE(review): "data-seed/" already subsumes "data-seed/manifest"
+}
+
+# Invocation signals — lowercase substrings checked against output_text only. In
+# full-auto mode these are belt-and-suspenders: the agent usually runs CLI commands
+# directly. They catch contextual prompts where the skill fires but the agent finds an
+# empty working dir and stops before reaching the CLI (as in the Claude Code observe mode).
+INVOCATION_SIGNALS = {
+    "hawkscan": [
+        # All markdown formatting variants the model uses around `: YES` or ` — YES`
+        "hawkscan:hawkscan`: yes",   # backtick + colon
+        "hawkscan:hawkscan` — yes",  # backtick + em-dash
+        "hawkscan:hawkscan**: yes",  # bold + colon
+        "hawkscan:hawkscan** — yes", # bold + em-dash
+        "hawkscan:hawkscan: yes",    # plain colon
+        "hawkscan:hawkscan — yes",   # plain em-dash
+        # Specific action-intent phrases
+        "autonomous security scan",
+        "dast scan after code",
+        "dast scan triggered",
+        "dast scan required",
+        "security scan required",
+        "security scan after",
+        "run the security scan",
+        "running the hawkscan",
+    ],
+    "api": [  # NOTE(review): no bold (**) variants here, unlike hawkscan — confirm intentional
+        "stackhawk-api:api`: yes",  # backtick + colon
+        "stackhawk-api:api` — yes",  # backtick + em-dash
+        "stackhawk-api:api: yes",  # plain colon
+        "stackhawk-api:api — yes",  # plain em-dash
+    ],
+    "stackhawk-data-seed": [
+        "stackhawk-data-seed:stackhawk-data-seed`: yes",  # backtick + colon
+        "stackhawk-data-seed:stackhawk-data-seed` — yes",  # backtick + em-dash
+        "stackhawk-data-seed:stackhawk-data-seed**: yes",  # bold + colon
+        "stackhawk-data-seed:stackhawk-data-seed** — yes",  # bold + em-dash
+        "stackhawk-data-seed:stackhawk-data-seed: yes",  # plain colon
+        "stackhawk-data-seed:stackhawk-data-seed — yes",  # plain em-dash
+        "stackhawk-data-seed: yes", "stackhawk-data-seed — yes",  # plugin name only
+        "seed data for hawkscan", "seed this repo", "minimum seed entities",  # intent phrases
+        "seed entities required", "data seed complete", "data-seed/manifest",
+    ],
+}
+
+
+def parse_stream(raw: str) -> ParsedRun:
+    """Parse codex JSONL stdout into a ParsedRun.
+
+    Collects distinct executed commands, completed agent-message text, summed
+    output tokens and the last error. Lines that are blank, not JSON, or not a
+    JSON object are skipped; a null item/usage/token count is treated as absent."""
+    cmds, out, otok, err, seen = [], "", 0, None, set()
+    for line in raw.splitlines():
+        try:
+            ev = json.loads(line.strip())
+        except json.JSONDecodeError:  # also raised for empty lines
+            continue
+        if not isinstance(ev, dict):  # bare scalar/array: valid JSON, but not an event
+            continue
+        t = ev.get("type", "")
+        it = ev["item"] if isinstance(ev.get("item"), dict) else {}
+        if t == "item.started" and it.get("type") == "command_execution":
+            c = it.get("command", "")
+            if c and c not in seen:  # keep each distinct command once, in first-seen order
+                cmds.append(c)
+                seen.add(c)
+        elif t == "item.completed" and it.get("type") in ("message", "agent_message"):
+            txt, content = it.get("text", ""), it.get("content", "")
+            if txt:
+                out += txt + "\n"
+            if isinstance(content, str):
+                out += content + "\n"
+            elif isinstance(content, list):
+                for b in content:
+                    if isinstance(b, dict) and b.get("type") == "text":
+                        out += b.get("text", "") + "\n"
+        elif t == "turn.completed":
+            otok += (ev.get("usage") or {}).get("output_tokens") or 0
+        elif t == "error":  # the last error event wins
+            err = ev.get("message", "unknown error")
+    return ParsedRun(bash_commands=cmds, output_text=out.strip(),
+                     output_tokens=otok or None, error=err)
+
+
+class CodexAdapter:
+    """Adapter for the Codex CLI.
+
+    Launches `codex exec --json`, parses its event stream, and gathers the evidence
+    used to decide whether a skill triggered.
+    """
+
+    platform = "codex"
+
+    def cli_signals(self, skill):
+        """Return the `CLI_SIGNALS` entries for `skill` ([] when it has none)."""
+        return CLI_SIGNALS.get(skill, [])
+
+    def invocation_signals(self, skill):
+        """Return the `INVOCATION_SIGNALS` entries for `skill` ([] when it has none)."""
+        return INVOCATION_SIGNALS.get(skill, [])
+
+    def parse_stream(self, raw):
+        """Delegate to the module-level `parse_stream`."""
+        return parse_stream(raw)
+
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
+        """Gather trigger evidence for `skill` and let `decide_trigger` rule on it.
+
+        Evidence passed on:
+          * executed_cli: a CLI signal occurs (case-insensitive substring) in the
+            space-joined `run.bash_commands`.
+          * declared: whatever `explicit_decision(run.output_text, skill)` returns.
+          * loose_hit: an invocation signal occurs (case-insensitive substring) in
+            `run.output_text`.
+        """
+        cli = " ".join(run.bash_commands).lower()
+        executed = any(s.lower() in cli for s in self.cli_signals(skill))
+        text = run.output_text.lower()
+        loose = any(s.lower() in text for s in self.invocation_signals(skill))
+        return decide_trigger(executed_cli=executed,
+                              declared=explicit_decision(run.output_text, skill),
+                              loose_hit=loose)
+
+    def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+               max_budget, bare, full_auto) -> ParsedRun:
+        """Run one prompt through `codex exec --json` in a throwaway directory.
+
+        Args:
+            prompt: Prompt text; in observe mode (`full_auto` false) it is sent with
+                `observe_suffix(skill)` appended.
+            skill: Skill under test; used here only to pick the observe suffix.
+            run_id: Embedded in the temp-directory prefix.
+            model: Optional model id, forwarded as `-m <model>` when truthy.
+            full_auto: Selects the bare prompt and, outside CI, the
+                `workspace-write` sandbox instead of `read-only`.
+            plugin_dirs, load_skill, max_budget, bare: Not used by this adapter
+                (presumably part of the shared adapter signature; confirm).
+
+        Returns:
+            The ParsedRun parsed from stdout, with `returncode` and `stderr_tail`
+            (last 2000 characters of stderr) attached. `error` is "timeout" when
+            the 300 s limit is hit, an "exit N: ..." summary for a non-zero exit
+            that produced no error event, or "empty output (exit N)" when neither
+            text nor commands were captured.
+
+        Raises:
+            FileNotFoundError: if the `codex` executable is not on PATH; it is
+                raised by `subprocess.run` and not caught here.
+        """
+        tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
+        try:
+            # In CI the bubblewrap sandbox can't initialize (Ubuntu 24.04 blocks
+            # unprivileged user namespaces), so codex exits at sandbox startup
+            # before running any command — the agent can't reach hawk. Bypass the
+            # sandbox there; it's safe on an ephemeral runner in a throwaway tmpdir,
+            # and the agent needs write+exec to run the skill workflow anyway.
+            # Locally, keep the real sandbox (workspace-write for full-auto,
+            # else read-only). Passing --sandbox twice makes codex exit 2.
+            #
+            # $CI has to be truthy, not merely set: an explicit CI=false / CI=0 on
+            # a developer machine must never switch the sandbox off.
+            in_ci = os.environ.get("CI", "").strip().lower() not in (
+                "", "0", "false", "no", "off")
+            cmd = ["codex", "exec", "--json"]
+            if in_ci:
+                cmd.append("--dangerously-bypass-approvals-and-sandbox")
+            else:
+                cmd += ["--sandbox", "workspace-write" if full_auto else "read-only"]
+            cmd.append("--skip-git-repo-check")
+            if model:
+                cmd += ["-m", model]
+            # Observe mode: append the per-skill walkthrough suffix. Full-auto /
+            # extended runs against a real target use the bare prompt.
+            cmd.append(prompt if full_auto else prompt + observe_suffix(skill))
+            try:
+                proc = subprocess.run(cmd, capture_output=True, text=True,
+                                      timeout=300, cwd=tmpdir)
+            except subprocess.TimeoutExpired:
+                return ParsedRun(error="timeout")
+            run = parse_stream(proc.stdout)
+            run.returncode = proc.returncode
+            run.stderr_tail = (proc.stderr or "")[-2000:]
+            if proc.returncode != 0 and not run.error:
+                # No error event in the stream: fall back to the end of stderr.
+                run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}"
+            elif not run.output_text and not run.bash_commands and not run.error:
+                run.error = f"empty output (exit {proc.returncode})"
+            return run
+        finally:
+            # Always discard the scratch directory, including on timeout or error.
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+# Module-level adapter instance (presumably what the shared eval CLI looks up for
+# the codex harness — confirm against evals/cli.py).
+ADAPTER = CodexAdapter()
diff --git a/evals/harnesses/codex/run-evals.py b/evals/harnesses/codex/run-evals.py
index 3c0828f..24df734 100644
--- a/evals/harnesses/codex/run-evals.py
+++ b/evals/harnesses/codex/run-evals.py
@@ -1,592 +1,11 @@
 #!/usr/bin/env python3
-"""
-Codex eval harness for StackHawk agent skills.
-
-Usage:
-    python3 run-evals.py --skill hawkscan          # all prompts
-    python3 run-evals.py --skill api               # all prompts
-    python3 run-evals.py --skill hawkscan --id hw-07    # single prompt
-    python3 run-evals.py --skill hawkscan --dry-run     # print prompts, no codex calls
-    python3 run-evals.py --skill hawkscan --rubric      # also run qualitative rubric grader
-
-Requirements:
-    - codex CLI installed and authenticated (https://openai.com/codex)
-    - Run from the agent-skills repo root
-
-Output:
-    evals/harnesses/codex/results/<skill>/<run-id>.jsonl       raw JSONL trace
-    evals/harnesses/codex/results/<skill>/<run-id>.result.json scored result
-    evals/harnesses/codex/results/<skill>/summary.json         run summary
-"""
-
-import argparse
-import csv
-import json
-import os
-import shutil
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness codex --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR   = HARNESS_DIR.parent.parent
-REPO_ROOT   = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-
-# ---------------------------------------------------------------------------
-# Trigger signals
-# ---------------------------------------------------------------------------
-# CLI signals — checked against bash_commands only (prevents documentation content
-# from creating false positives when the agent writes README/guides about HawkScan).
-CLI_SIGNALS = {
-    "hawkscan": [
-        "hawk scan",
-        "hawk validate",
-        "hawk rescan",
-        # "hawk version" excluded: running 'hawk version' alone is common for
-        # installation-check tasks and would cause false positives. The preflight
-        # workflow always also runs 'hawk config --help', so 'hawk config' below suffices.
-        "hawk config",
-        "hawk create app",
-        "hawk init",
-        "hawk perch",
-    ],
-    # Signals specific to the api reporting workflow — avoids false positives
-    # from hawkop status/app/env commands that the hawkscan skill also runs.
-    "api": [
-        "hawkop scan get",     # api Step 4: app deep dive
-        "hawkop org get",      # api Step 1: establish orgId
-        "hawkop org set",      # api Step 1: switch org
-        "/api/v2/org",         # api Step 3: org posture endpoint (hawkop doesn't wrap it)
-        "/api/v1/scan",        # api Step 4: raw scan drill-down
-        "hawk_api GET",        # api raw API helper function
-    ],
-}
-
-# Invocation signals — checked against output_text only. In full-auto mode these are
-# belt-and-suspenders: the agent usually runs CLI commands directly. They catch
-# contextual prompts where the skill fires but the agent finds an empty working dir
-# and stops before reaching the CLI (same as observe mode in Claude Code harness).
-INVOCATION_SIGNALS = {
-    "hawkscan": [
-        # All markdown formatting variants the model uses around `: YES` or ` — YES`
-        "hawkscan:hawkscan`: yes",   # backtick + colon
-        "hawkscan:hawkscan` — yes",  # backtick + dash
-        "hawkscan:hawkscan**: yes",  # bold + colon
-        "hawkscan:hawkscan** — yes", # bold + dash
-        "hawkscan:hawkscan: yes",    # plain colon
-        "hawkscan:hawkscan — yes",   # plain dash
-        # Specific action-intent phrases
-        "autonomous security scan",
-        "dast scan after code",
-        "dast scan triggered",
-        "dast scan required",
-        "security scan required",
-        "security scan after",
-        "run the security scan",
-        "running the hawkscan",
-    ],
-    "api": [
-        "stackhawk-api:api`: yes",
-        "stackhawk-api:api` — yes",
-        "stackhawk-api:api: yes",
-        "stackhawk-api:api — yes",
-    ],
-}
-
-# ---------------------------------------------------------------------------
-# JSONL parsing
-# Codex --json event stream: item.started / item.completed / turn.completed
-# ---------------------------------------------------------------------------
-
-def parse_stream(jsonl: str) -> dict:
-    commands: list[str] = []
-    output_text = ""
-    input_tokens = 0
-    output_tokens = 0
-    error = None
-
-    seen_commands: set[str] = set()
-
-    for line in jsonl.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            event = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        etype = event.get("type", "")
-
-        if etype == "item.started":
-            item = event.get("item", {})
-            if item.get("type") == "command_execution":
-                cmd = item.get("command", "")
-                # Deduplicate: item.started fires before item.completed for the same cmd
-                if cmd and cmd not in seen_commands:
-                    commands.append(cmd)
-                    seen_commands.add(cmd)
-
-        elif etype == "item.completed":
-            item = event.get("item", {})
-            # Capture any assistant message text — Codex uses "agent_message" type
-            if item.get("type") in ("message", "agent_message"):
-                text = item.get("text", "")
-                if text:
-                    output_text += text + "\n"
-                content = item.get("content", "")
-                if isinstance(content, str):
-                    output_text += content + "\n"
-                elif isinstance(content, list):
-                    for block in content:
-                        if isinstance(block, dict) and block.get("type") == "text":
-                            output_text += block.get("text", "") + "\n"
-
-        elif etype == "turn.completed":
-            usage = event.get("usage", {})
-            input_tokens  += usage.get("input_tokens", 0)
-            output_tokens += usage.get("output_tokens", 0)
-
-        elif etype == "error":
-            error = event.get("message", "unknown error")
-
-    return {
-        "bash_commands": commands,
-        "files_written": [],  # populated by scanning tmpdir after run
-        "files_edited":  [],
-        "output_text":   output_text.strip(),
-        "input_tokens":  input_tokens,
-        "output_tokens": output_tokens,
-        "error":         error,
-    }
-
-
-def _setup_skill_in_dir(skill: str, target_dir: Path) -> None:
-    """No-op: skills are installed globally via 'codex plugin add <skill>@stackhawk'.
-    Run: codex plugin marketplace add /path/to/agent-skills
-         codex plugin add hawkscan@stackhawk
-         codex plugin add stackhawk-api@stackhawk
-    """
-    pass
-
-
-# ---------------------------------------------------------------------------
-# Trigger detection
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    # CLI signals checked against actual bash commands only — prevents README/educational
-    # output text from creating false positives.
-    cli_haystack = " ".join(parsed["bash_commands"]).lower()
-    if any(s.lower() in cli_haystack for s in CLI_SIGNALS.get(skill, [])):
-        return True
-
-    # Invocation signals checked against output text only — belt-and-suspenders for
-    # contextual prompts where the skill fires but no CLI commands run (empty dir, etc.)
-    text_haystack = parsed["output_text"].lower()
-    return any(s.lower() in text_haystack for s in INVOCATION_SIGNALS.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Process checks
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"] + parsed["files_edited"]).lower()
-
-    results = []
-    for check in checks:
-        ctype   = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis   = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit   = next((a for a in antis   if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            # Only enforce when the condition's keyword appears in the trace.
-            import re as _re
-            condition_str = check.get("condition", "")
-            m = _re.search(r"'([^']+)'", condition_str)
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True  # condition not met — check not applicable
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id":           check["id"],
-            "pass":         passed,
-            "severity":     check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found":   anti_hit,
-        })
-
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    score = max(0, 100 - blocking_failed * 15 - warning_failed * 5)
-    return {
-        "total":            len(results),
-        "passed":           sum(1 for r in results if r["pass"]),
-        "blocking_failed":  blocking_failed,
-        "warning_failed":   warning_failed,
-        "score":            score,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run codex exec
-# ---------------------------------------------------------------------------
-
-def run_codex(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    full_auto: bool = True,
-    max_budget: float = 0.20,
-    model: str | None = None,
-) -> tuple[dict, int]:
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_"))
-    try:
-        _setup_skill_in_dir(skill, tmpdir)
-
-        cmd = [
-            "codex", "exec", "--json",
-            "--sandbox", "workspace-write",
-            "--skip-git-repo-check",
-        ]
-        if model:
-            cmd += ["-m", model]
-        if not full_auto:
-            cmd += ["--sandbox", "read-only"]
-        cmd.append(prompt)
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=300,
-            cwd=str(tmpdir),
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout)
-
-        parsed = parse_stream(proc.stdout)
-
-        # Scan tmpdir for files created during the run (more reliable than JSONL parsing)
-        created = [
-            str(p.relative_to(tmpdir))
-            for p in tmpdir.rglob("*")
-            if p.is_file() and not str(p).startswith(str(tmpdir / ".codex"))
-        ]
-        parsed["files_written"] = created
-
-        return parsed, proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {
-            "bash_commands": [], "files_written": [], "files_edited": [],
-            "output_text": "", "input_tokens": 0, "output_tokens": 0, "error": "timeout",
-        }, 1
-    except FileNotFoundError:
-        print(
-            "ERROR: 'codex' CLI not found. "
-            "Install the Codex CLI and ensure it is in PATH.",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Rubric grader
-# Uses: codex exec "<prompt>" --output-schema <schema> -o <output_file>
-# ---------------------------------------------------------------------------
-
-def run_rubric_grader(parsed: dict, skill: str, run_id: str) -> dict | None:
-    rubric_path = EVALS_DIR / skill / "rubric-items.json"
-    schema_path = EVALS_DIR / "rubric-schema.json"
-    if not rubric_path.exists() or not schema_path.exists():
-        return None
-
-    rubric_data = json.loads(rubric_path.read_text())
-
-    grader_prompt = f"""{rubric_data['grader_prompt']}
-
-## Commands Executed:
-{json.dumps(parsed['bash_commands'], indent=2)}
-
-## Files Created:
-{json.dumps(parsed['files_written'], indent=2)}
-
-## Agent Output (first 4000 chars):
-{parsed['output_text'][:4000]}
-
-## Rubric Checks to Grade:
-{json.dumps(rubric_data['checks'], indent=2)}
-
-Populate: skill="{skill}", run_id="{run_id}", overall_pass, score 0-100, checks array."""
-
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkrubric_{run_id}_"))
-    try:
-        output_file = tmpdir / "rubric_result.json"
-        cmd = [
-            "codex", "exec",
-            grader_prompt,
-            "--output-schema", str(schema_path),
-            "-o", str(output_file),
-        ]
-        subprocess.run(cmd, capture_output=True, text=True, timeout=120, cwd=str(tmpdir))
-
-        if output_file.exists():
-            return json.loads(output_file.read_text())
-        return None
-    except Exception as exc:
-        print(f"  [rubric] grader failed: {exc}", file=sys.stderr)
-        return None
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Codex eval harness for StackHawk agent skills",
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID",
-                        help="Run a single prompt by id (e.g. hw-07)")
-    parser.add_argument("--dry-run", action="store_true",
-                        help="Print prompts without calling codex")
-    parser.add_argument("--rubric", action="store_true",
-                        help="Run qualitative rubric grader after process checks (extra cost)")
-    parser.add_argument("--no-full-auto", action="store_true",
-                        help="Run without --full-auto (restricts filesystem access)")
-    parser.add_argument("--max-budget", type=float, default=0.20, metavar="USD",
-                        help="Max spend per eval run in USD (default: 0.20)")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Override the Codex model (e.g. o3, o4-mini, gpt-4o)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    full_auto = not args.no_full_auto
-
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    mode = "full-auto" if full_auto else "sandbox"
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: codex  |  Mode: {mode}{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no codex calls]")
-    print("─" * 68)
-
-    all_results = []
-    total_cost = 0.0
-
-    for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_codex(
-            prompt, skill, run_id,
-            full_auto=full_auto,
-            max_budget=args.max_budget,
-            model=args.model,
-        )
-
-        # Codex doesn't report USD cost directly; estimate from token usage
-        tokens = parsed.get("input_tokens", 0) + parsed.get("output_tokens", 0)
-        est_cost = tokens * 0.000015  # rough estimate
-        total_cost += est_cost
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger = detect_trigger(parsed, skill)
-        trigger_ok  = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        rubric_result = None
-        if args.rubric and should_trigger and did_trigger:
-            print("  [rubric] grading…", end=" ", flush=True)
-            rubric_result = run_rubric_grader(parsed, skill, run_id)
-            print(f"score={rubric_result.get('score', '?')}" if rubric_result else "failed")
-
-        result = {
-            "platform":       "codex",
-            "skill":          skill,
-            "run_id":         run_id,
-            "prompt":         prompt,
-            "should_trigger": should_trigger,
-            "did_trigger":    did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands":  parsed["bash_commands"],
-            "files_written":  parsed["files_written"],
-            "process_checks": process_results,
-            "scoring":        scoring,
-            "rubric_result":  rubric_result,
-            "tokens":         {"input": parsed.get("input_tokens", 0), "output": parsed.get("output_tokens", 0)},
-            "timestamp":      datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon    = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  {score_str}")
-
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    # ── Summary ────────────────────────────────────────────────────────────
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total      = len(all_results)
-    false_pos  = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg  = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    proc_runs  = [r for r in all_results if r["process_checks"]]
-    avg_score  = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs)
-                  if proc_runs else None)
-    total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=codex")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill":    skill,
-        "platform": "codex",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives":  [r["run_id"] for r in false_pos],
-        "false_negatives":  [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "runs": [
-            {"run_id": r["run_id"], "trigger_correct": r["trigger_correct"], "score": r["scoring"]["score"]}
-            for r in all_results
-        ],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    # ── GitHub Actions step summary ─────────────────────────────────────────
-    step_summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
-    if step_summary_path:
-        _write_step_summary(step_summary_path, skill, all_results, false_pos, false_neg, avg_score, total_blocking)
-
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
-
-def _write_step_summary(
-    path: str, skill: str, results: list[dict],
-    false_pos: list[dict], false_neg: list[dict],
-    avg_score: int | None, total_blocking: int,
-) -> None:
-    correct = sum(1 for r in results if r["trigger_correct"])
-    total = len(results)
-    trigger_icon = "✅" if correct == total else "❌"
-    score_icon   = "✅" if (avg_score or 0) >= 70 and total_blocking == 0 else "❌"
-
-    lines = [
-        f"## Skill Eval: `{skill}` (codex)\n",
-        "| Metric | Value |", "|---|---|",
-        f"| Trigger accuracy | {trigger_icon} {correct}/{total} |",
-    ]
-    if false_pos:
-        lines.append(f"| False positives | ⚠️ {', '.join(r['run_id'] for r in false_pos)} |")
-    if false_neg:
-        lines.append(f"| False negatives | ⚠️ {', '.join(r['run_id'] for r in false_neg)} |")
-    if avg_score is not None:
-        lines.append(f"| Process avg score | {score_icon} {avg_score}/100 |")
-        lines.append(f"| Blocking failures | {'❌' if total_blocking else '✅'} {total_blocking} |")
-    lines.append("")
-
-    lines += [
-        "<details><summary>Per-run results</summary>\n",
-        "| ID | Trigger | Score |", "|---|---|---|",
-    ]
-    for r in results:
-        t = "✅" if r["trigger_correct"] else "❌"
-        score = r["scoring"]["score"] if r["process_checks"] else "—"
-        lines.append(f"| {r['run_id']} | {t} | {score} |")
-    lines.append("\n</details>\n")
-
-    with open(path, "a") as f:
-        f.write("\n".join(lines) + "\n")
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    # Old invocations never passed --harness, so default it to codex — but only
+    # when the caller has not chosen one, in either the `--harness X` or the
+    # `--harness=X` spelling (appending after the latter would override it).
+    if not any(a == "--harness" or a.startswith("--harness=") for a in sys.argv[1:]):
+        sys.argv += ["--harness", "codex"]
     main()
diff --git a/evals/harnesses/copilot/run-evals.py b/evals/harnesses/copilot/run-evals.py
index 9779110..d04c71e 100644
--- a/evals/harnesses/copilot/run-evals.py
+++ b/evals/harnesses/copilot/run-evals.py
@@ -1,391 +1,11 @@
 #!/usr/bin/env python3
-"""
-GitHub Copilot CLI eval harness for StackHawk agent skills.
-
-Uses `copilot -p --output-format json --allow-all-tools --plugin-dir`.
-Skills are loaded from plugins/<skill>/ via --plugin-dir.
-
-The trigger detection is uniquely reliable: Copilot emits an explicit
-  tool.execution_start {"toolName":"skill","arguments":{"skill":"hawkscan"}}
-event when the skill fires. No heuristic text-matching needed.
-
-Usage:
-    python3 evals/harnesses/copilot/run-evals.py --skill hawkscan
-    python3 evals/harnesses/copilot/run-evals.py --skill api
-    python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --id hw-07
-    python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --dry-run
-    python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --model gpt-5.3-codex
-
-Requirements:
-    - GitHub Copilot CLI installed and authenticated (copilot login)
-    - Run from the agent-skills repo root
-
-Note: Copilot actually executes commands (--allow-all-tools), so process
-check scores reflect real hawk workflow completion — not just observations.
-"""
-
-import argparse
-import csv
-import json
-import os
-import re
-import shutil
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness copilot --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR   = HARNESS_DIR.parent.parent
-REPO_ROOT   = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-
-# ---------------------------------------------------------------------------
-# Trigger detection
-# Copilot emits an unambiguous tool.execution_start event when a skill fires:
-#   {"type":"tool.execution_start","data":{"toolName":"skill","arguments":{"skill":"hawkscan"}}}
-# This eliminates all heuristic signal-matching needed for other platforms.
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    # Primary: explicit skill tool call (unambiguous)
-    for call in parsed.get("skill_calls", []):
-        if call.lower() == skill.lower() or call.lower() == f"stackhawk-{skill}".lower():
-            return True
-
-    # Fallback: CLI signals in bash commands (belt-and-suspenders)
-    cli_signals = {
-        "hawkscan": ["hawk scan", "hawk validate", "hawk rescan", "hawk config",
-                     "hawk create app", "hawk init", "hawk perch"],
-        "api": ["hawkop scan get", "hawkop org get", "/api/v2/org", "/api/v1/scan"],
-    }
-    cmd_haystack = " ".join(parsed.get("bash_commands", [])).lower()
-    return any(s.lower() in cmd_haystack for s in cli_signals.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Stream-json parsing — Copilot JSONL event format:
-#   tool.execution_start  {"toolName":"bash","arguments":{"command":"..."}}
-#   tool.execution_start  {"toolName":"skill","arguments":{"skill":"hawkscan"}}
-#   tool.execution_partial_result {"partialOutput":"..."}
-#   assistant.message     {"content":"..."}
-#   result                {}
-# ---------------------------------------------------------------------------
-
-def parse_stream(jsonl: str) -> dict:
-    bash_commands: list[str] = []
-    files_written: list[str] = []
-    skill_calls:   list[str] = []
-    output_text  = ""
-    usage: dict  = {}
-    error = None
-
-    for line in jsonl.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            event = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        etype = event.get("type", "")
-        data  = event.get("data", {})
-
-        if etype == "tool.execution_start":
-            tool_name = data.get("toolName", "")
-            args      = data.get("arguments", {})
-
-            if tool_name == "bash":
-                cmd = args.get("command", "")
-                if cmd:
-                    bash_commands.append(cmd)
-
-            elif tool_name == "skill":
-                skill_name = args.get("skill", "")
-                if skill_name:
-                    skill_calls.append(skill_name)
-
-            elif tool_name in ("write_file", "create_file", "str_replace_editor"):
-                path = args.get("path") or args.get("file_path") or ""
-                if path:
-                    files_written.append(path)
-
-        elif etype == "assistant.message":
-            content = data.get("content", "")
-            if content:
-                output_text += content + "\n"
-
-        elif etype == "result":
-            usage = data.get("usage", {})
-            if data.get("error"):
-                error = str(data["error"])
-
-    return {
-        "bash_commands": bash_commands,
-        "files_written": files_written,
-        "skill_calls":   skill_calls,
-        "output_text":   output_text.strip(),
-        "usage":         usage,
-        "error":         error,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Process checks
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"]).lower()
-
-    results = []
-    for check in checks:
-        ctype   = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis   = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit   = next((a for a in antis   if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            m = re.search(r"'([^']+)'", check.get("condition", ""))
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id":           check["id"],
-            "pass":         passed,
-            "severity":     check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found":   anti_hit,
-        })
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    return {
-        "total":           len(results),
-        "passed":          sum(1 for r in results if r["pass"]),
-        "blocking_failed": blocking_failed,
-        "warning_failed":  warning_failed,
-        "score":           max(0, 100 - blocking_failed * 15 - warning_failed * 5),
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run copilot
-# ---------------------------------------------------------------------------
-
-def run_copilot(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    model: str | None = None,
-) -> tuple[dict, int]:
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_"))
-    try:
-        plugin_dir = str(REPO_ROOT / "plugins" / skill)
-
-        cmd = [
-            "copilot", "-p", prompt,
-            "--output-format", "json",
-            "--allow-all-tools",
-            "--plugin-dir", plugin_dir,
-            "--no-ask-user",
-        ]
-        if model:
-            cmd += ["--model", model]
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=600,
-            cwd=str(tmpdir),
-            env={**os.environ},
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout)
-
-        parsed = parse_stream(proc.stdout)
-        if proc.returncode != 0 and not parsed["output_text"] and not parsed["skill_calls"]:
-            stderr = proc.stderr.strip()
-            if stderr:
-                parsed["error"] = stderr[:300]
-
-        return parsed, proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {"bash_commands": [], "files_written": [], "skill_calls": [],
-                "output_text": "", "usage": {}, "error": "timeout"}, 1
-    except FileNotFoundError:
-        print("ERROR: 'copilot' CLI not found. Install GitHub Copilot CLI.", file=sys.stderr)
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="GitHub Copilot CLI eval harness for StackHawk agent skills",
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Model override (e.g. gpt-5.3-codex)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: copilot  |  Mode: full-auto{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no copilot calls]")
-    print("─" * 68)
-
-    all_results = []
-
-    for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_copilot(prompt, skill, run_id, model=args.model)
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger = detect_trigger(parsed, skill)
-        trigger_ok  = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        result = {
-            "platform":        "copilot",
-            "skill":           skill,
-            "run_id":          run_id,
-            "prompt":          prompt,
-            "should_trigger":  should_trigger,
-            "did_trigger":     did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands":   parsed["bash_commands"],
-            "files_written":   parsed["files_written"],
-            "skill_calls":     parsed["skill_calls"],
-            "process_checks":  process_results,
-            "scoring":         scoring,
-            "usage":           parsed.get("usage", {}),
-            "timestamp":       datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon    = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  skill_calls={parsed['skill_calls']}  {score_str}")
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total = len(all_results)
-    false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    proc_runs = [r for r in all_results if r["process_checks"]]
-    avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs)
-                 if proc_runs else None)
-    total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=copilot")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill": skill, "platform": "copilot",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives":  [r["run_id"] for r in false_pos],
-        "false_negatives":  [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "runs": [{"run_id": r["run_id"], "trigger_correct": r["trigger_correct"],
-                  "score": r["scoring"]["score"]} for r in all_results],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    if not any(a == "--harness" or a.startswith("--harness=") for a in sys.argv[1:]):
+        sys.argv += ["--harness", "copilot"]  # default to this shim's harness
     main()
diff --git a/evals/harnesses/cursor/adapter.py b/evals/harnesses/cursor/adapter.py
new file mode 100644
index 0000000..3d5bdcc
--- /dev/null
+++ b/evals/harnesses/cursor/adapter.py
@@ -0,0 +1,238 @@
+"""cursor Harness adapter. Parsing + signals ported from pre-shim run-evals.py."""
+from __future__ import annotations
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+from evals.lib.models import ParsedRun
+from evals.lib.triggers import explicit_decision, decide_trigger
+from evals.lib.observe import observe_suffix
+
+# adapter.py -> cursor -> harnesses -> evals -> repo root
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+# cursor/.cursor/rules/ holds the alwaysApply .mdc skill rules (pre-shim path).
+CURSOR_RULES_DIR = REPO_ROOT / "cursor" / ".cursor" / "rules"
+
+
+def _setup_skill(target_dir: str) -> None:
+    """Install every *.mdc file from CURSOR_RULES_DIR into
+    <target_dir>/.cursor/rules, creating that directory when it is missing."""
+    rules_dst = Path(target_dir, ".cursor", "rules")
+    rules_dst.mkdir(parents=True, exist_ok=True)
+    for rule_file in CURSOR_RULES_DIR.glob("*.mdc"):
+        shutil.copy2(rule_file, rules_dst / rule_file.name)
+
+# CLI signals — checked against bash_commands only.
+# Cursor goes directly into execution, so CLI signals are the primary trigger
+# indicator. Invocation signals cover narrative phrases the agent uses when
+# kicking off a skill workflow without immediately running commands.
+CLI_SIGNALS = {  # skill name -> substrings searched for in the run's shell commands
+    # Scan-distinctive commands only — generic preflight (hawk version/config/init)
+    # over-triggers when the agent merely assesses the environment for a non-scan
+    # request. Triggering falls back to the explicit decision line otherwise.
+    "hawkscan": [
+        "hawk scan",
+        "hawk validate",
+        "hawk rescan",
+        "hawk create app",
+        "hawk perch",
+    ],
+    # Cursor api: the agent runs hawkop status as its first step, then deeper
+    # hawkop commands. Broader hawkop signals are included since Cursor doesn't
+    # have the false-positive risk of Codex full-auto mode.
+    "api": [
+        "hawkop status",
+        "hawkop scan get",
+        "hawkop org get",
+        "hawkop org set",
+        "hawkop app list",
+        "/api/v2/org",
+        "/api/v1/scan",
+        "hawk_api GET",  # compared case-insensitively: both sides are lowercased
+    ],
+    "stackhawk-data-seed": ["data-seed/", "data-seed/manifest", ".data-seed-credentials",
+                            "manifest.yaml"],  # path fragments, matched as substrings
+}
+
+# Invocation signals — checked against output_text only.
+# Cursor doesn't use the Claude Code "EVALUATE: YES/NO" evaluation step, so
+# these focus on narrative phrases the agent uses when kicking off a skill workflow.
+INVOCATION_SIGNALS = {  # skill name -> phrases searched for in the lowercased output text
+    "hawkscan": [
+        "hawkscan:hawkscan`: yes", "hawkscan:hawkscan` — yes",
+        "hawkscan:hawkscan**: yes", "hawkscan:hawkscan** — yes",
+        "hawkscan:hawkscan: yes",  "hawkscan:hawkscan — yes",
+        "hawkscan:hawkscan - yes", "hawkscan:hawkscan - **yes",
+        "hawkscan** - yes", "hawkscan** — yes",
+        "hawkscan**: yes",  "hawkscan: yes",
+        "hawkscan — yes",   "hawkscan - yes",
+        "autonomous security scan",
+        "dast scan after code", "dast scan triggered", "dast scan required",
+        "security scan required", "security scan after",
+        "run the security scan",  "running the hawkscan",
+    ],
+    "api": [
+        # Claude Code evaluation-format signals (in case the model answers in that format)
+        "stackhawk-api:api`: yes", "stackhawk-api:api` — yes",
+        "stackhawk-api:api**: yes", "stackhawk-api:api** — yes",
+        "stackhawk-api:api: yes",  "stackhawk-api:api — yes",
+        "stackhawk-api:api - yes",
+        "stackhawk-api**: yes",    "stackhawk-api** — yes",
+        "stackhawk-api: yes",      "stackhawk-api — yes",
+        "stackhawk-api - yes",
+        # Cursor narrative-style signals
+        "stackhawk api skill",
+        "stackhawk api",  # substring of the entry above, so it already covers it
+        "api skill to",
+        "security posture",
+        "untriaged findings",
+        "scan history",
+        "findings across",
+    ],
+    "stackhawk-data-seed": [
+        "stackhawk-data-seed:stackhawk-data-seed`: yes",
+        "stackhawk-data-seed:stackhawk-data-seed** — yes",
+        "stackhawk-data-seed:stackhawk-data-seed: yes",
+        "stackhawk-data-seed:stackhawk-data-seed — yes",
+        "stackhawk-data-seed: yes", "stackhawk-data-seed — yes",
+        "stackhawk-data-seed - yes",
+        # Narrative-style signals
+        "seed data for hawkscan", "seed this repo", "minimum seed entities",
+        "seed entities required", "data seed complete", "data-seed/manifest",
+        "set up seed data",
+    ],
+}
+
+
+def parse_stream(raw: str) -> ParsedRun:
+    """Parse Cursor stream-json output (one JSON event per line).
+
+    Cursor event shapes (from pre-shim run-evals.py):
+      - type="assistant":  message.content[] text blocks -> output_text
+      - type="tool_call" subtype="started":
+            tool_call.shellToolCall.args.command  -> bash_commands
+            tool_call.writeToolCall.args.path     -> files_written
+      - type="result":  usage.outputTokens (summed), is_error, result
+
+    Blank lines, invalid JSON and non-object JSON values are skipped, and null
+    intermediate fields (message, content, tool_call, args, usage) are treated
+    as empty rather than raising. A zero or missing token total becomes None.
+    """
+    bash_commands: list[str] = []
+    files_written: list[str] = []
+    output_text = ""
+    output_tokens = 0
+    error = None
+
+    for line in raw.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            event = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(event, dict):
+            continue  # valid JSON but not an event object (number, string, list)
+
+        etype = event.get("type", "")
+
+        if etype == "assistant":
+            for block in (event.get("message") or {}).get("content") or []:
+                if isinstance(block, dict) and block.get("type") == "text":
+                    output_text += (block.get("text") or "") + "\n"
+
+        elif etype == "tool_call" and event.get("subtype") == "started":
+            tc = event.get("tool_call") or {}
+            # Shell commands and file writes arrive under different wrapper keys.
+            for key, field, sink in (("shellToolCall", "command", bash_commands),
+                                     ("writeToolCall", "path", files_written)):
+                value = ((tc.get(key) or {}).get("args") or {}).get(field, "")
+                if value:
+                    sink.append(value)
+
+        elif etype == "result":
+            # Token counts from every result event are summed.
+            otok = (event.get("usage") or {}).get("outputTokens")
+            if otok is not None:
+                output_tokens += int(otok)
+            if event.get("is_error"):
+                error = event.get("result", "unknown error")
+
+    return ParsedRun(
+        bash_commands=bash_commands,
+        files_written=files_written,
+        output_text=output_text.strip(),
+        output_tokens=output_tokens or None,
+        error=error,
+    )
+
+
+class CursorAdapter:
+    """Adapter for the Cursor `agent` CLI: signal lookup, stream parsing,
+    trigger detection, and launching one eval run in a scratch workspace."""
+    platform = "cursor"
+
+    def cli_signals(self, skill):
+        return CLI_SIGNALS.get(skill, [])
+
+    def invocation_signals(self, skill):
+        return INVOCATION_SIGNALS.get(skill, [])
+
+    def parse_stream(self, raw):
+        return parse_stream(raw)
+
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
+        """Feed decide_trigger three inputs: a CLI-signal hit in the shell commands,
+        the explicit decision read from the output, and a loose phrase hit."""
+        commands = " ".join(run.bash_commands).lower()
+        narrative = run.output_text.lower()
+        executed = any(sig.lower() in commands for sig in self.cli_signals(skill))
+        loose = any(sig.lower() in narrative for sig in self.invocation_signals(skill))
+        return decide_trigger(executed_cli=executed,
+                              declared=explicit_decision(run.output_text, skill),
+                              loose_hit=loose)
+
+    def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+               max_budget, bare, full_auto) -> ParsedRun:
+        """Run `agent` once inside a temporary directory and parse its stdout.
+        A timeout yields ParsedRun(error="timeout"); a non-zero exit or an empty
+        run is recorded in run.error. The directory is removed on the way out."""
+        workspace = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
+        try:
+            # Without-skill runs skip the cursor rules entirely.
+            if load_skill:
+                _setup_skill(workspace)
+            # Observe mode appends the per-skill walkthrough suffix; full-auto
+            # runs send the prompt unchanged.
+            effective_prompt = prompt if full_auto else prompt + observe_suffix(skill)
+            cmd = ["agent", "-p", effective_prompt,
+                   "--output-format", "stream-json", "--print", "--trust"]
+            if model:
+                cmd.extend(["--model", model])
+            if full_auto:
+                cmd.append("--force")
+            try:
+                # CURSOR_API_KEY travels in the inherited environment, never on
+                # the command line, where it would leak into process listings.
+                proc = subprocess.run(cmd, capture_output=True, text=True,
+                                      timeout=300, cwd=workspace, env=dict(os.environ))
+            except subprocess.TimeoutExpired:
+                return ParsedRun(error="timeout")
+            run = parse_stream(proc.stdout)
+            run.returncode = proc.returncode
+            run.stderr_tail = (proc.stderr or "")[-2000:]
+            if not run.error:
+                if proc.returncode != 0:
+                    run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}"
+                elif not (run.output_text or run.bash_commands):
+                    run.error = f"empty output (exit {proc.returncode})"
+            return run
+        finally:
+            shutil.rmtree(workspace, ignore_errors=True)
+
+
+ADAPTER = CursorAdapter()
diff --git a/evals/harnesses/cursor/run-evals.py b/evals/harnesses/cursor/run-evals.py
index 364a3f7..d83ce7a 100644
--- a/evals/harnesses/cursor/run-evals.py
+++ b/evals/harnesses/cursor/run-evals.py
@@ -1,451 +1,11 @@
 #!/usr/bin/env python3
-"""
-Cursor Agent eval harness for StackHawk agent skills.
-
-Uses `agent --print --output-format stream-json` (Cursor's headless CLI).
-Skills are loaded from cursor/.cursor/rules/*.mdc (alwaysApply rules).
-
-Usage:
-    python3 evals/harnesses/cursor/run-evals.py --skill hawkscan
-    python3 evals/harnesses/cursor/run-evals.py --skill api
-    python3 evals/harnesses/cursor/run-evals.py --skill hawkscan --id hw-07
-    python3 evals/harnesses/cursor/run-evals.py --skill hawkscan --dry-run
-    python3 evals/harnesses/cursor/run-evals.py --skill hawkscan --full-auto   # actually execute commands
-
-Requirements:
-    - Cursor CLI installed and authenticated (`agent status`)
-    - Run from the agent-skills repo root
-    - cursor/.cursor/rules/ contains generated .mdc files (run generate-cursor-rules.sh)
-"""
-
-import argparse
-import csv
-import json
-import os
-import shutil
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness cursor --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR   = HARNESS_DIR.parent.parent
-REPO_ROOT   = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-# cursor/.cursor/rules/ contains the alwaysApply .mdc skill rules
-CURSOR_RULES_DIR = REPO_ROOT / "cursor" / ".cursor" / "rules"
-
-# ---------------------------------------------------------------------------
-# Trigger signals — Cursor-specific tuning.
-# Cursor goes directly into execution without the Claude Code "EVALUATE: YES/NO"
-# evaluation step, so invocation signals focus on narrative phrases the agent
-# uses when kicking off a skill workflow.
-# CLI_SIGNALS are checked against shell commands the agent attempted to run.
-# ---------------------------------------------------------------------------
-CLI_SIGNALS = {
-    "hawkscan": [
-        "hawk scan",
-        "hawk validate",
-        "hawk rescan",
-        "hawk config",
-        "hawk create app",
-        "hawk init",
-        "hawk perch",
-    ],
-    # Cursor api: the agent runs hawkop status as its first step, then
-    # deeper hawkop commands. Include broader hawkop signals since Cursor
-    # doesn't have the false-positive risk of Codex full-auto mode.
-    "api": [
-        "hawkop status",
-        "hawkop scan get",
-        "hawkop org get",
-        "hawkop org set",
-        "hawkop app list",
-        "/api/v2/org",
-        "/api/v1/scan",
-        "hawk_api GET",
-    ],
-}
-
-INVOCATION_SIGNALS = {
-    "hawkscan": [
-        "hawkscan:hawkscan`: yes", "hawkscan:hawkscan` — yes",
-        "hawkscan:hawkscan**: yes", "hawkscan:hawkscan** — yes",
-        "hawkscan:hawkscan: yes",  "hawkscan:hawkscan — yes",
-        "hawkscan:hawkscan - yes", "hawkscan:hawkscan - **yes",
-        "hawkscan** - yes", "hawkscan** — yes",
-        "hawkscan**: yes",  "hawkscan: yes",
-        "hawkscan — yes",   "hawkscan - yes",
-        "autonomous security scan",
-        "dast scan after code", "dast scan triggered", "dast scan required",
-        "security scan required", "security scan after",
-        "run the security scan",  "running the hawkscan",
-    ],
-    "api": [
-        # Claude Code evaluation-format signals (if model uses that format)
-        "stackhawk-api:api`: yes", "stackhawk-api:api` — yes",
-        "stackhawk-api:api**: yes","stackhawk-api:api** — yes",
-        "stackhawk-api:api: yes",  "stackhawk-api:api — yes",
-        "stackhawk-api:api - yes",
-        "stackhawk-api**: yes",    "stackhawk-api** — yes",
-        "stackhawk-api: yes",      "stackhawk-api — yes",
-        "stackhawk-api - yes",
-        # Cursor narrative-style signals — agent says these instead of evaluating
-        "stackhawk api skill",          # "I'll use the StackHawk API skill"
-        "stackhawk api",                # "using the StackHawk API"
-        "api skill to",                 # "api skill to pull your org..."
-        "security posture",             # "pull your org's security posture"
-        "untriaged findings",           # "untriaged findings across all apps"
-        "scan history",                 # "scan history for"
-        "findings across",              # "findings across all apps"
-    ],
-}
-
-# ---------------------------------------------------------------------------
-# Stream-json parsing
-# Cursor events: system / user / thinking / assistant / tool_call / result
-# ---------------------------------------------------------------------------
-
-def parse_stream(jsonl: str) -> dict:
-    bash_commands: list[str] = []
-    output_text = ""
-    files_written: list[str] = []
-    usage: dict = {}
-    error = None
-
-    for line in jsonl.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            event = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        etype = event.get("type", "")
-
-        if etype == "assistant":
-            for block in event.get("message", {}).get("content", []):
-                if block.get("type") == "text":
-                    output_text += block.get("text", "") + "\n"
-
-        elif etype == "tool_call" and event.get("subtype") == "started":
-            tc = event.get("tool_call", {})
-            # Shell command
-            shell = tc.get("shellToolCall", {})
-            if shell:
-                cmd = shell.get("args", {}).get("command", "")
-                if cmd:
-                    bash_commands.append(cmd)
-            # File write
-            write = tc.get("writeToolCall", {})
-            if write:
-                path = write.get("args", {}).get("path", "")
-                if path:
-                    files_written.append(path)
-
-        elif etype == "result":
-            usage = event.get("usage", {})
-            if event.get("is_error"):
-                error = event.get("result", "unknown error")
-
-    return {
-        "bash_commands": bash_commands,
-        "files_written": files_written,
-        "output_text": output_text.strip(),
-        "usage": usage,
-        "error": error,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Trigger detection — same split-signal approach as Claude Code harness
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    cli_haystack = " ".join(parsed["bash_commands"]).lower()
-    if any(s.lower() in cli_haystack for s in CLI_SIGNALS.get(skill, [])):
-        return True
-    text_haystack = parsed["output_text"].lower()
-    return any(s.lower() in text_haystack for s in INVOCATION_SIGNALS.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Process checks — shared with Claude Code harness
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"]).lower()
-
-    results = []
-    for check in checks:
-        ctype = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis   = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit   = next((a for a in antis   if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            import re as _re
-            m = _re.search(r"'([^']+)'", check.get("condition", ""))
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id":           check["id"],
-            "pass":         passed,
-            "severity":     check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found":   anti_hit,
-        })
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    return {
-        "total":           len(results),
-        "passed":          sum(1 for r in results if r["pass"]),
-        "blocking_failed": blocking_failed,
-        "warning_failed":  warning_failed,
-        "score":           max(0, 100 - blocking_failed * 15 - warning_failed * 5),
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run agent
-# ---------------------------------------------------------------------------
-
-def _setup_workspace(skill: str, target_dir: Path) -> None:
-    """Copy cursor/.cursor/rules/ into a fresh workspace so alwaysApply rules load."""
-    dst = target_dir / ".cursor" / "rules"
-    dst.mkdir(parents=True, exist_ok=True)
-    for mdc in CURSOR_RULES_DIR.glob("*.mdc"):
-        shutil.copy2(mdc, dst / mdc.name)
-
-
-def run_cursor(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    full_auto: bool = False,
-    model: str | None = None,
-) -> tuple[dict, int]:
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_"))
-    try:
-        _setup_workspace(skill, tmpdir)
-
-        api_key = os.environ.get("CURSOR_API_KEY", "")
-        cmd = [
-            "agent", "-p", prompt,
-            "--output-format", "stream-json",
-            "--print",
-            "--trust",
-        ]
-        if api_key:
-            cmd += ["--api-key", api_key]
-        if model:
-            cmd += ["--model", model]
-        if full_auto:
-            cmd.append("--force")
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=300,
-            cwd=str(tmpdir),
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout)
-
-        return parse_stream(proc.stdout), proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {"bash_commands": [], "files_written": [], "output_text": "",
-                "usage": {}, "error": "timeout"}, 1
-    except FileNotFoundError:
-        print("ERROR: 'agent' CLI not found. Install Cursor and ensure it is in PATH.",
-              file=sys.stderr)
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Cursor Agent eval harness for StackHawk agent skills",
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--full-auto", action="store_true",
-                        help="Pass --force so the agent can execute commands")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Model override (e.g. gpt-5.5, sonnet-4)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    if not CURSOR_RULES_DIR.exists():
-        print(f"ERROR: {CURSOR_RULES_DIR} not found. Run scripts/generate-cursor-rules.sh first.",
-              file=sys.stderr)
-        sys.exit(1)
-
-    mode = "full-auto" if args.full_auto else "observe"
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: cursor  |  Mode: {mode}{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no agent calls]")
-    print("─" * 68)
-
-    all_results = []
-    total_tokens = {"input": 0, "output": 0}
-
-    for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_cursor(prompt, skill, run_id, full_auto=args.full_auto, model=args.model)
-        u = parsed.get("usage", {})
-        total_tokens["input"]  += u.get("inputTokens", 0)
-        total_tokens["output"] += u.get("outputTokens", 0)
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger = detect_trigger(parsed, skill)
-        trigger_ok  = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        result = {
-            "platform":        "cursor",
-            "skill":           skill,
-            "run_id":          run_id,
-            "prompt":          prompt,
-            "should_trigger":  should_trigger,
-            "did_trigger":     did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands":   parsed["bash_commands"],
-            "files_written":   parsed["files_written"],
-            "process_checks":  process_results,
-            "scoring":         scoring,
-            "usage":           u,
-            "timestamp":       datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon    = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  {score_str}")
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total = len(all_results)
-    false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    proc_runs = [r for r in all_results if r["process_checks"]]
-    avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs)
-                 if proc_runs else None)
-    total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=cursor")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Total tokens     : {total_tokens['input']} in / {total_tokens['output']} out")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill": skill, "platform": "cursor",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives":  [r["run_id"] for r in false_pos],
-        "false_negatives":  [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "total_tokens": total_tokens,
-        "runs": [{"run_id": r["run_id"], "trigger_correct": r["trigger_correct"],
-                  "score": r["scoring"]["score"]} for r in all_results],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    if not any(a.split("=", 1)[0] == "--harness" for a in sys.argv[1:]):  # also matches --harness=X
+        sys.argv += ["--harness", "cursor"]
     main()
diff --git a/evals/harnesses/gemini/run-evals.py b/evals/harnesses/gemini/run-evals.py
index d00c8c5..00fce99 100644
--- a/evals/harnesses/gemini/run-evals.py
+++ b/evals/harnesses/gemini/run-evals.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 """
+FROZEN/LEGACY: superseded by the agy harness and the unified 'uv run evals' CLI. Not wired into CI. References the removed prompts.csv and will not run as-is. Kept for historical reference only.
+
 Gemini CLI eval harness for StackHawk agent skills.
 
 Uses `gemini -p --output-format stream-json` (Gemini's headless CLI).
diff --git a/evals/hawkscan/prompts.csv b/evals/hawkscan/prompts.csv
deleted file mode 100644
index 20e1ebd..0000000
--- a/evals/hawkscan/prompts.csv
+++ /dev/null
@@ -1,21 +0,0 @@
-id,should_trigger,invocation_type,prompt,notes
-hw-01,true,explicit,"Use $hawkscan to scan my API running on localhost:8080","Direct skill reference by name; baseline — if this fails everything else is moot"
-hw-02,true,explicit,"Run the hawkscan skill against my running application","hawkscan keyword in natural language"
-hw-03,true,implicit,"Run a DAST scan on my running application","DAST keyword from skill description"
-hw-04,true,implicit,"Scan my app for security vulnerabilities","'security scan' + 'app' language"
-hw-05,true,implicit,"Use StackHawk to check my service on port 8080","StackHawk platform keyword"
-hw-06,true,implicit,"I need to run a security scan on my API","'security scan' + 'API' language"
-hw-07,true,contextual,"I just finished implementing the payment processing endpoint. Run a security scan before we deploy.","Explicit scan request after code change"
-hw-08,true,contextual,"I added OAuth2 support to the auth service. Check it for security issues.","Implicit security check after feature; tests autonomous trigger"
-hw-09,true,contextual,"The SQL injection fix is in. Verify the vulnerability is actually resolved.","Fix verification — should route to rescan path not a fresh scan"
-hw-10,true,contextual,"I refactored the API gateway — check it hasn't introduced any security regressions.","Post-refactor security check; tests autonomous trigger on refactor"
-hw-11,true,contextual,"My API spec is at ./openapi.yaml and the service is running on port 3000. Check for security holes before I merge.","Pre-merge gate with OpenAPI spec context"
-hw-12,true,contextual,"Feature is done. Let's make sure it's secure.","Vague post-feature prompt; tests whether autonomous trigger fires on minimal phrasing"
-hw-13,false,negative,"Show me my security findings from the last StackHawk scan","Reporting query → api skill; hawkscan must NOT trigger"
-hw-14,false,negative,"What's my security posture across all my applications?","Posture dashboard → api skill; 'security' keyword alone should not trigger hawkscan"
-hw-15,false,negative,"How many HIGH severity findings do I have untriaged?","Finding count query → api skill"
-hw-16,false,negative,"Update the README with instructions for running security scans","Doc-only change; skill explicitly excludes doc-only changes"
-hw-17,false,negative,"Don't run a security scan after this change — just commit it","Explicit user skip; skill must respect user's explicit opt-out"
-hw-18,false,negative,"Change the host in stackhawk.yml from localhost:8080 to localhost:3000","Config-only edit to stackhawk.yml; not a code change, not a scan trigger"
-hw-19,false,negative,"Install the hawk CLI on my machine","Installation task with no scan intent"
-hw-20,false,negative,"What types of vulnerabilities does HawkScan detect?","Research question; no scan intent, no code change"
diff --git a/evals/hawkscan/prompts.yaml b/evals/hawkscan/prompts.yaml
new file mode 100644
index 0000000..3c6e5dd
--- /dev/null
+++ b/evals/hawkscan/prompts.yaml
@@ -0,0 +1,107 @@
+- id: hw-01
+  should_trigger: true
+  invocation_type: explicit
+  prompt: Use $hawkscan to scan my API running on localhost:8080
+  notes: Direct skill reference by name; baseline — if this fails everything else is moot
+- id: hw-02
+  should_trigger: true
+  invocation_type: explicit
+  prompt: Run the hawkscan skill against my running application
+  notes: hawkscan keyword in natural language
+- id: hw-03
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Run a DAST scan on my running application
+  notes: DAST keyword from skill description
+- id: hw-04
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Scan my app for security vulnerabilities
+  notes: '''security scan'' + ''app'' language'
+- id: hw-05
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Use StackHawk to check my service on port 8080
+  notes: StackHawk platform keyword
+- id: hw-06
+  should_trigger: true
+  invocation_type: implicit
+  prompt: I need to run a security scan on my API
+  notes: '''security scan'' + ''API'' language'
+- id: hw-07
+  should_trigger: true
+  invocation_type: contextual
+  prompt: I just finished implementing the payment processing endpoint. Run a security scan before we
+    deploy.
+  notes: Explicit scan request after code change
+  budget:
+    bash_commands: 15
+    cost_usd: 0.15
+  expected:
+  - signal: hawk validate
+- id: hw-08
+  should_trigger: true
+  invocation_type: contextual
+  prompt: I added OAuth2 support to the auth service. Check it for security issues.
+  notes: Implicit security check after feature; tests autonomous trigger
+- id: hw-09
+  should_trigger: true
+  invocation_type: contextual
+  prompt: The SQL injection fix is in. Verify the vulnerability is actually resolved.
+  notes: Fix verification — should route to rescan path not a fresh scan
+- id: hw-10
+  should_trigger: true
+  invocation_type: contextual
+  prompt: I refactored the API gateway — check it hasn't introduced any security regressions.
+  notes: Post-refactor security check; tests autonomous trigger on refactor
+- id: hw-11
+  should_trigger: true
+  invocation_type: contextual
+  prompt: My API spec is at ./openapi.yaml and the service is running on port 3000. Check for security
+    holes before I merge.
+  notes: Pre-merge gate with OpenAPI spec context
+- id: hw-12
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Feature is done. Let's make sure it's secure.
+  notes: Vague post-feature prompt; tests whether autonomous trigger fires on minimal phrasing
+- id: hw-13
+  should_trigger: false
+  invocation_type: negative
+  prompt: Show me my security findings from the last StackHawk scan
+  notes: Reporting query → api skill; hawkscan must NOT trigger
+- id: hw-14
+  should_trigger: false
+  invocation_type: negative
+  prompt: What's my security posture across all my applications?
+  notes: Posture dashboard → api skill; 'security' keyword alone should not trigger hawkscan
+- id: hw-15
+  should_trigger: false
+  invocation_type: negative
+  prompt: How many HIGH severity findings do I have untriaged?
+  notes: Finding count query → api skill
+- id: hw-16
+  should_trigger: false
+  invocation_type: negative
+  prompt: Update the README with instructions for running security scans
+  notes: Doc-only change; skill explicitly excludes doc-only changes
+- id: hw-17
+  should_trigger: false
+  invocation_type: negative
+  prompt: Don't run a security scan after this change — just commit it
+  notes: Explicit user skip; skill must respect user's explicit opt-out
+- id: hw-18
+  should_trigger: false
+  invocation_type: negative
+  prompt: Change the host in stackhawk.yml from localhost:8080 to localhost:3000
+  notes: Config-only edit to stackhawk.yml; not a code change, not a scan trigger
+- id: hw-19
+  should_trigger: false
+  invocation_type: negative
+  prompt: Install the hawk CLI on my machine
+  notes: Installation task with no scan intent
+- id: hw-20
+  should_trigger: false
+  invocation_type: negative
+  prompt: What types of vulnerabilities does HawkScan detect?
+  notes: Research question; no scan intent, no code change
diff --git a/evals/lib/__init__.py b/evals/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/evals/lib/baseline.py b/evals/lib/baseline.py
new file mode 100644
index 0000000..a23575b
--- /dev/null
+++ b/evals/lib/baseline.py
@@ -0,0 +1,45 @@
+"""Pure-Python (no AI) comparison of a run against a baseline run."""
+from __future__ import annotations
+from pathlib import Path
+
+from evals.lib.models import CellReport
+
+
+def diff(current: CellReport, baseline: CellReport) -> dict[str, str]:
+    """Label every run id found in either report by how its verdict moved.
+
+    "new"/"dropped": present on one side only; "same": equal verdicts; "regressed":
+    current is "fail"; "fixed": baseline was "fail"; anything else is "changed".
+    """
+    now = {res.run_id: res.verdict.value for res in current.results}
+    then = {res.run_id: res.verdict.value for res in baseline.results}
+
+    def label(rid: str) -> str:
+        if rid not in then:
+            return "new"
+        if rid not in now:
+            return "dropped"
+        if now[rid] == then[rid]:
+            return "same"
+        return "regressed" if now[rid] == "fail" else "fixed" if then[rid] == "fail" else "changed"
+    return {rid: label(rid) for rid in set(now) | set(then)}
+
+
+def score_delta(current_avg: int, baseline_avg: int, band: int = 3) -> str:
+    """Classify the average-score change; a move of at most ``band`` either way is "no-change"."""
+    change = current_avg - baseline_avg
+    within_band = -band <= change <= band
+    return "no-change" if within_band else ("better" if change > 0 else "worse")
+
+
+def load_baseline_dir(path: Path | None) -> dict[tuple[str, str, str], CellReport]:
+    """Load each cell.json under path, keyed by (platform, skill, model); {} if path is unset or missing."""
+    import sys  # function-scope import: only the skip warning below needs it
+    out: dict[tuple[str, str, str], CellReport] = {}
+    for cj in sorted(Path(path).rglob("cell.json")) if path and Path(path).exists() else ():
+        try:  # read bytes so decoding never depends on the platform's locale encoding
+            cell = CellReport.model_validate_json(cj.read_bytes())
+            out[(cell.platform, cell.skill, cell.model)] = cell
+        except Exception as exc:  # stay best-effort, but report what was dropped
+            print(f"WARNING: skipping unreadable baseline {cj}: {exc}", file=sys.stderr)
+    return out
diff --git a/evals/lib/compare.py b/evals/lib/compare.py
new file mode 100644
index 0000000..5f00856
--- /dev/null
+++ b/evals/lib/compare.py
@@ -0,0 +1,46 @@
+"""Run each should_trigger prompt with and without the skill loaded; report lift."""
+from __future__ import annotations
+from pathlib import Path
+
+from evals.lib.config import load_skill
+from evals.lib.grading import grade
+from evals.lib.harness import get_adapter
+from evals.lib.models import Verdict
+
+
+def compare_skill(skill: str, platform: str, *, model: str | None = None,
+                  max_budget: float = 0.20, bare: bool = False,
+                  full_auto: bool = False, only_id: str | None = None) -> list[dict]:
+    """Launch each should_trigger prompt twice (skill loaded, then not) and grade both runs.
+
+    Returns one row per prompt with both verdicts, both costs and an "effect":
+    "lift" when only the run without the skill fails, "regress" when only the run
+    with the skill fails, "none" otherwise. only_id limits the run to one prompt id.
+    """
+    cfg = load_skill(skill)
+    adapter = get_adapter(platform)
+    plugin_dirs = [str(Path.cwd() / "plugins" / skill)]
+
+    def run_once(p, load: bool):
+        suffix = "with" if load else "without"
+        run = adapter.launch(p.prompt, skill, f"{p.id}-{suffix}", plugin_dirs,
+                             model=model, load_skill=load, max_budget=max_budget,
+                             bare=bare, full_auto=full_auto)
+        return grade(p, run, cfg.checks, platform=platform, skill=skill,
+                     did_trigger=adapter.detect_trigger(run, skill))
+
+    rows = []
+    for p in cfg.prompts:
+        if not p.should_trigger or (only_id and p.id != only_id):
+            continue
+        with_skill, without_skill = run_once(p, True), run_once(p, False)
+        with_failed = with_skill.verdict == Verdict.FAIL
+        without_failed = without_skill.verdict == Verdict.FAIL
+        effect = "none" if with_failed == without_failed else ("regress" if with_failed else "lift")
+        rows.append({"id": p.id,
+                     "with_verdict": with_skill.verdict,
+                     "without_verdict": without_skill.verdict,
+                     "with_cost": with_skill.cost_usd,
+                     "without_cost": without_skill.cost_usd,
+                     "effect": effect})
+    return rows
diff --git a/evals/lib/config.py b/evals/lib/config.py
new file mode 100644
index 0000000..4736749
--- /dev/null
+++ b/evals/lib/config.py
@@ -0,0 +1,39 @@
+"""Load and validate a skill's eval config (prompts.yaml + process-checks.json)."""
+from __future__ import annotations
+import json
+from pathlib import Path
+
+import yaml
+from pydantic import BaseModel
+
+from evals.lib.models import PromptConfig
+
+EVALS_DIR = Path(__file__).resolve().parent.parent  # repo/evals
+
+
+class SkillConfig(BaseModel):  # one skill's validated eval config, as built by load_skill()
+    skill: str  # the skill name passed to load_skill(); also its directory name
+    prompts: list[PromptConfig]  # validated rows of <skill>/prompts.yaml
+    checks: list[dict]  # raw entries of the "checks" array in <skill>/process-checks.json
+
+
+def load_skill(skill: str, base_dir: Path | None = None) -> SkillConfig:
+    """Load and validate <base_dir or EVALS_DIR>/<skill>/{prompts.yaml,process-checks.json}.
+
+    Raises ValueError for duplicate prompt ids or an applies_to naming an unknown prompt id.
+    """
+    skill_dir = (base_dir or EVALS_DIR) / skill
+    # Decode explicitly as UTF-8: read_text() otherwise falls back to the locale encoding.
+    prompts_raw = yaml.safe_load((skill_dir / "prompts.yaml").read_text(encoding="utf-8")) or []
+    prompts = [PromptConfig(**row) for row in prompts_raw]  # raises on bad fields
+    ids = [p.id for p in prompts]
+    dupes = {i for i in ids if ids.count(i) > 1}
+    if dupes:
+        raise ValueError(f"duplicate prompt id(s) in {skill}: {sorted(dupes)}")
+    checks = json.loads((skill_dir / "process-checks.json").read_text(encoding="utf-8"))["checks"]
+    id_set = set(ids)
+    for c in checks:
+        unknown = [t for t in c.get("applies_to", []) if t not in id_set]
+        if unknown:
+            raise ValueError(f"check '{c['id']}' applies_to references unknown prompt '{unknown[0]}'")
+    return SkillConfig(skill=skill, prompts=prompts, checks=checks)
diff --git a/evals/lib/grading.py b/evals/lib/grading.py
new file mode 100644
index 0000000..9b4fb3d
--- /dev/null
+++ b/evals/lib/grading.py
@@ -0,0 +1,155 @@
+"""Grading: process checks (ported from the claude-code harness), per-prompt
+ad-hoc expectations, budget scoring, and the three-state verdict."""
+from __future__ import annotations
+import re
+
+from evals.lib.models import (
+    ParsedRun, PromptConfig, BudgetSpec, ExpectedCheck, Verdict,
+    ProcessCheckResult, EvalResult,
+)
+
+
+def applicable_checks(checks: list[dict], prompt_id: str) -> list[dict]:
+    """Select the checks that govern one prompt.
+
+    A check whose applies_to is missing or empty is global; any other check is
+    kept only when prompt_id appears in its applies_to.
+    """
+    return [chk for chk in checks
+            if not chk.get("applies_to") or prompt_id in chk["applies_to"]]
+
+
+def _haystack(run: ParsedRun) -> str:
+    return " ".join((*run.bash_commands, run.output_text)).lower()  # commands + output text, lowercased
+
+
+def run_process_checks(run: ParsedRun, checks: list[dict]) -> list[ProcessCheckResult]:
+    """Evaluate each check against the run by case-insensitive substring matching; one result per check."""
+    haystack = _haystack(run)
+    all_files = " ".join(run.files_written + run.files_edited).lower()
+    results: list[ProcessCheckResult] = []
+    for check in checks:
+        ctype = check.get("type", "command_executed")  # a check without "type" gets the default rule
+        signals = [s.lower() for s in check.get("signals", [])]
+        antis = [a.lower() for a in check.get("anti_patterns", [])]
+        signal_hit = next((s for s in signals if s in haystack), None)
+        anti_hit = next((a for a in antis if a in haystack), None)
+        # Type-specific pass rule; unrecognised types fall through to the final else.
+        if ctype in ("command_negative", "file_content_negative", "output_negative"):
+            passed = anti_hit is None
+        elif ctype in ("file_absent", "file_absent_or_unchanged"):
+            # The file(s) must NOT have been written/edited. Supports either a
+            # single target_file or a list of anti_pattern paths (data-seed uses
+            # both forms). "_or_unchanged" is the same absence test here — the
+            # eval doesn't diff pre-existing content.
+            target = check.get("target_file", "").lower()
+            passed = (not target or target not in all_files) and \
+                     not any(a in all_files for a in antis)
+        elif ctype == "file_present":
+            # The artifact should exist: written/edited for real (execution mode)
+            # OR named in the agent's narration (observe mode).
+            passed = any(s in all_files or s in haystack for s in signals)
+        elif ctype == "conditional_command":
+            condition_str = check.get("condition", "")
+            m = re.search(r"'([^']+)'", condition_str)  # first single-quoted token of the condition
+            if condition_str and m is None:
+                raise ValueError(
+                    f"conditional_command check '{check['id']}': condition "
+                    f"'{condition_str}' has no single-quoted keyword")
+            keyword = m.group(1).lower() if m else None
+            passed = True if (keyword and keyword not in haystack) else signal_hit is not None
+        elif ctype == "command_preference":
+            preferred = [p.lower() for p in check.get("preferred", [])]
+            if preferred:
+                passed = any(p in haystack for p in preferred) and anti_hit is None
+            else:
+                passed = anti_hit is None  # no preference expressed; only anti-patterns matter
+        else:
+            passed = signal_hit is not None and (anti_hit is None if antis else True)
+
+        results.append(ProcessCheckResult(
+            id=check["id"], passed=passed,
+            severity=check.get("severity", "warning"),
+            signal_found=signal_hit, anti_found=anti_hit,
+        ))
+    return results
+
+
+def run_adhoc_expected(run: ParsedRun, expected: list[ExpectedCheck]) -> list[ProcessCheckResult]:
+    """Grade a prompt's inline expectations; every result produced is blocking.
+    A signal passes when found in the run's haystack, an anti_pattern when absent;
+    entries carrying a check_id produce no result here."""
+    text = _haystack(run)
+    graded: list[ProcessCheckResult] = []
+    for idx, item in enumerate(expected):
+        if item.check_id is not None:
+            continue
+        is_signal = item.signal is not None
+        needle = item.signal if is_signal else item.anti_pattern
+        if needle is None:
+            continue  # neither a signal nor an anti_pattern: nothing to grade
+        found = needle.lower() in text
+        graded.append(ProcessCheckResult(
+            id=f"expected[{idx}]:{'signal' if is_signal else 'anti'}", severity="blocking",
+            passed=found == is_signal,
+            **{"signal_found" if is_signal else "anti_found": needle if found else None}))
+    return graded
+
+
+def check_budget(run: ParsedRun, budget: BudgetSpec) -> list[str]:
+    """Return one "<metric> <actual> > <limit>" message per budget limit the run exceeded."""
+    rows = (  # (metric, value compared, value printed, limit, format spec)
+        ("cost_usd", run.cost_usd, run.cost_usd, budget.cost_usd, ".3f"),
+        ("bash_commands", len(run.bash_commands), len(run.bash_commands), budget.bash_commands, ""),
+        ("output_tokens", run.output_tokens or 0, run.output_tokens, budget.output_tokens, ""),
+        ("wall_seconds", run.wall_seconds or 0, run.wall_seconds, budget.wall_seconds, ".0f"),
+    )
+    return [f"{name} {shown:{spec}} > {limit:{spec}}"
+            for name, value, shown, limit, spec in rows
+            if limit is not None and value > limit]
+
+
+def _score(checks: list[ProcessCheckResult]) -> int:
+    """Score out of 100: minus 15 per failed blocking check, minus 5 per failed warning, floored at 0."""
+    failed = [c.severity for c in checks if not c.passed]
+    return max(0, 100 - 15 * failed.count("blocking") - 5 * failed.count("warning"))
+
+
+def grade(prompt: PromptConfig, run: ParsedRun, checks: list[dict], *,
+          platform: str, skill: str, did_trigger: bool) -> EvalResult:
+    """Turn one parsed run into an EvalResult.
+
+    Unless the skill both should have fired and did, the verdict is the trigger
+    outcome alone (score 100 or 0; no process checks, no budget). Otherwise any
+    failed blocking check means FAIL, and a PASS that breaches prompt.budget is
+    reported as PASS_SLOW.
+    """
+    # Fields that are filled in identically on both return paths.
+    shared = dict(
+        platform=platform, skill=skill, run_id=prompt.id,
+        should_trigger=prompt.should_trigger, did_trigger=did_trigger,
+        trigger_correct=(did_trigger == prompt.should_trigger),
+        cost_usd=run.cost_usd, note=(run.error or ""),
+    )
+
+    if not (prompt.should_trigger and did_trigger):
+        # Correct non-trigger, false positive or false negative: trigger outcome only.
+        ok = shared["trigger_correct"]
+        return EvalResult(verdict=Verdict.PASS if ok else Verdict.FAIL,
+                          budget_breaches=[], process_checks=[],
+                          score=100 if ok else 0, **shared)
+
+    # Skill fired as intended: grade the process checks and the prompt's inline expectations.
+    outcomes = run_process_checks(run, applicable_checks(checks, prompt.id))
+    outcomes += run_adhoc_expected(run, prompt.expected)
+
+    if any(not o.passed and o.severity == "blocking" for o in outcomes):
+        verdict, breaches = Verdict.FAIL, []
+    else:
+        # Budgets are consulted only for runs that otherwise pass.
+        breaches = check_budget(run, prompt.budget) if prompt.budget is not None else []
+        verdict = Verdict.PASS_SLOW if breaches else Verdict.PASS
+
+    return EvalResult(verdict=verdict, budget_breaches=breaches,
+                      process_checks=outcomes, score=_score(outcomes),
+                      **shared)
diff --git a/evals/lib/harness.py b/evals/lib/harness.py
new file mode 100644
index 0000000..52fb0be
--- /dev/null
+++ b/evals/lib/harness.py
@@ -0,0 +1,32 @@
+"""Harness protocol + adapter registry. An adapter owns everything runtime-specific:
+how to launch the agent, how to parse its stream, and which signals indicate the
+skill fired. Everything downstream consumes the ParsedRun it returns."""
+from __future__ import annotations
+import importlib.util
+from pathlib import Path
+from typing import Protocol
+
+from evals.lib.models import ParsedRun
+
+EVALS_DIR = Path(__file__).resolve().parent.parent
+
+
+class Harness(Protocol):
+    platform: str
+    def cli_signals(self, skill: str) -> list[str]: ...
+    def invocation_signals(self, skill: str) -> list[str]: ...
+    def parse_stream(self, raw: str) -> ParsedRun: ...
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool: ...
+    def launch(self, prompt: str, skill: str, run_id: str, plugin_dirs: list[str],
+               *, model: str | None, load_skill: bool, max_budget: float,
+               bare: bool, full_auto: bool) -> ParsedRun: ...
+
+
+def get_adapter(platform: str) -> Harness:
+    path = EVALS_DIR / "harnesses" / platform / "adapter.py"  # evals/harnesses/<platform>/adapter.py
+    if not path.exists():
+        raise ValueError(f"no adapter for platform '{platform}' at {path}")
+    spec = importlib.util.spec_from_file_location(f"adapter_{platform.replace('-', '_')}", path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)  # executes the adapter file's top-level code
+    return mod.ADAPTER  # the adapter module must define a module-level ADAPTER
diff --git a/evals/lib/models.py b/evals/lib/models.py
new file mode 100644
index 0000000..0febcc9
--- /dev/null
+++ b/evals/lib/models.py
@@ -0,0 +1,117 @@
+"""Pydantic data contracts for the eval system. extra='forbid' makes config
+typos hard load-time errors instead of silently-ignored fields."""
+from __future__ import annotations
+from enum import Enum
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, model_validator
+
+
+class BudgetSpec(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    cost_usd: float | None = None
+    bash_commands: int | None = None
+    output_tokens: int | None = None
+    wall_seconds: float | None = None
+
+
+class ExpectedCheck(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    check_id: str | None = None      # reference an existing process-check by id
+    signal: str | None = None        # ad-hoc substring that MUST appear
+    anti_pattern: str | None = None  # substring that must NOT appear
+
+    @model_validator(mode="after")
+    def _exactly_one(self) -> "ExpectedCheck":
+        set_count = sum(x is not None for x in (self.check_id, self.signal, self.anti_pattern))
+        if set_count != 1:
+            raise ValueError("ExpectedCheck must set exactly one of "
+                             "check_id / signal / anti_pattern")
+        return self
+
+
+class PromptConfig(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    id: str
+    should_trigger: bool
+    invocation_type: Literal["explicit", "implicit", "contextual", "negative"]
+    prompt: str
+    notes: str = ""
+    budget: BudgetSpec | None = None
+    expected: list[ExpectedCheck] = []
+
+
+class Verdict(str, Enum):
+    PASS = "pass"
+    PASS_SLOW = "pass-slow"
+    FAIL = "fail"
+
+
+class ParsedRun(BaseModel):
+    bash_commands: list[str] = []
+    files_written: list[str] = []
+    files_edited: list[str] = []
+    output_text: str = ""
+    cost_usd: float = 0.0
+    output_tokens: int | None = None
+    wall_seconds: float | None = None
+    error: str | None = None
+    returncode: int | None = None
+    stderr_tail: str = ""
+
+
+class ProcessCheckResult(BaseModel):
+    id: str
+    passed: bool
+    severity: Literal["blocking", "warning"]
+    signal_found: str | None = None
+    anti_found: str | None = None
+
+
+class RubricCheckResult(BaseModel):
+    id: str
+    passed: bool
+    notes: str = ""
+
+
+class RubricResult(BaseModel):
+    """Qualitative, model-graded result (ported from origin/main's --rubric pass).
+    A grader model reviews the transcript against rubric-items.json and returns
+    a 0-100 score + per-item pass/fail; overall_pass = all pass and score >= 70."""
+    overall_pass: bool
+    score: int
+    checks: list[RubricCheckResult] = []
+    error: str | None = None   # set if the grader couldn't run/parse
+
+
+class EvalResult(BaseModel):
+    platform: str
+    skill: str
+    run_id: str
+    should_trigger: bool
+    did_trigger: bool
+    trigger_correct: bool
+    verdict: Verdict
+    budget_breaches: list[str] = []
+    process_checks: list[ProcessCheckResult] = []
+    score: int
+    cost_usd: float = 0.0
+    note: str = ""
+    rubric: RubricResult | None = None   # populated only when --rubric is set
+
+
+class CellReport(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    platform: str
+    skill: str
+    model: str
+    commit: str
+    results: list[EvalResult]
+
+
+class LiftRow(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    id: str
+    without_verdict: Verdict
+    with_verdict: Verdict
+    effect: Literal["lift", "regress", "none"]
diff --git a/evals/lib/observe.py b/evals/lib/observe.py
new file mode 100644
index 0000000..95b032a
--- /dev/null
+++ b/evals/lib/observe.py
@@ -0,0 +1,94 @@
+"""Shared per-skill observe-mode prompt suffixes, used by every harness adapter.
+
+Observe mode gauges whether the right skill TRIGGERS and whether the agent knows
+its WORKFLOW, so we ask it to declare the skill and write out the commands it would
+run. The declaration matches the explicit-decision parser (evals/lib/triggers.py);
+the commands match the process-check signals (which scan bash_commands +
+output_text). We deliberately do NOT list the commands here — producing them is the
+skill's job, i.e. the test.
+
+The suffix is PER-SKILL: the three skills have different sandbox execution
+profiles, so one shared string can't serve all of them.
+  - hawkscan needs a live target to scan. With none present, any execution attempt
+    stalls mid-workflow, so its observe pass is a pure paper walkthrough.
+  - api is a read-workflow over hawkop; it degrades gracefully (narrate if creds
+    absent, run the read-only queries if present).
+  - data-seed's product is the artifacts it emits (manifest + data-seed/), so its
+    walkthrough must enumerate those.
+
+Every harness shares this config and the same `plugin:skill: YES`/`none: NO`
+decision format, so trigger detection is uniform across harnesses. Appended only
+in observe mode — full-auto / extended runs against a real target use the bare
+prompt.
+"""
+from __future__ import annotations
+
+# Anti-refusal core (all skills): in headless `-p` mode a model may have only the
+# skill's description, not its body. A rigid "do not invent" then makes weak models
+# refuse — "I can't access the skill definition, should I read it?" (haiku scored 0
+# this way). So tell it to invoke/load the skill and not pause to ask permission.
+_USE_SKILL = (
+    "Use the skill's own steps — if its full definition isn't already in your "
+    "context, invoke/load the skill to get them; do NOT pause to ask permission to "
+    "read or load it."
+)
+
+# Command-emission guidance is PER-SKILL. "Include the command even if unsure of a
+# flag" is safe for hawkscan/api (listing commands has no side effect) but wrong for
+# data-seed: it's a code-EMITTER, and narrating a startup command like
+# `docker-compose up` trips its no-startup anti-pattern. data-seed therefore gets
+# read-only discovery guidance instead.
+_CMDS_OK = (
+    " Give the real commands with their flags, not a prose summary; if you can't "
+    "recall an exact flag, include the command anyway rather than skipping the step."
+)
+_DATA_SEED_GUIDANCE = (
+    " Give the real discovery commands and the artifacts emitted, not a prose "
+    "summary. Discovery only READS the repo; data-seed emits files and never starts "
+    "services — do NOT run or list app-startup commands (docker compose up, npm "
+    "start, ./gradlew bootRun, etc.)."
+)
+
+_OBSERVE_HEADER = (
+    "\n\n---\n"
+    "(Eval harness — observe mode. The target app, credentials, or prior scans may "
+    "be unavailable here. Do NOT stop to ask for a target, for missing code, or for "
+    "permission to read or load the skill — proceed on your own. Output exactly:\n"
+    "1. A decision line naming the StackHawk skill this request should invoke, "
+    "written exactly as `hawkscan:hawkscan: YES`, `stackhawk-api:api: YES`, "
+    "`stackhawk-data-seed:stackhawk-data-seed: YES`, or `none: NO`.\n"
+)
+
+OBSERVE_SUFFIX = {
+    # hawkscan: no live target here, so executing the scan stalls — keep it a
+    # pure paper walkthrough of the full command sequence.
+    "hawkscan": _OBSERVE_HEADER + (
+        "2. If (and only if) the hawkscan skill applies, write out its COMPLETE "
+        "documented workflow as the exact CLI commands it runs, in order — every "
+        "phase from preflight through the verifying rescan. This is a paper "
+        "walkthrough: do NOT try to run the scan, there is no live target here. "
+        + _USE_SKILL + _CMDS_OK + ")"
+    ),
+    # api: a read-workflow over hawkop. Narrate the full command sequence; if
+    # hawkop + credentials happen to be present, the read-only queries may also run.
+    "api": _OBSERVE_HEADER + (
+        "2. If (and only if) the api skill applies, write out its COMPLETE documented "
+        "workflow as the exact CLI commands it runs, in order — every phase from the "
+        "hawkop preflight/auth check and org resolution through the final query. "
+        + _USE_SKILL + _CMDS_OK + " If hawkop and credentials are available, you may "
+        "also run the read-only queries.)"
+    ),
+    # data-seed: its product is the emitted artifacts, so the walkthrough must name
+    # the discovery steps, the minimal seed set, and the files it writes.
+    "stackhawk-data-seed": _OBSERVE_HEADER + (
+        "2. If (and only if) the data-seed skill applies, write out its COMPLETE "
+        "documented workflow in order — the discovery steps, the minimal seed set it "
+        "proposes, and the exact artifacts it emits (the data-seed/ directory, "
+        "manifest.yaml, and the credentials file). " + _USE_SKILL + _DATA_SEED_GUIDANCE + ")"
+    ),
+}
+
+
+def observe_suffix(skill: str) -> str:
+    """The observe-mode suffix for `skill`, or '' if the skill is unknown."""
+    return OBSERVE_SUFFIX.get(skill, "")
diff --git a/evals/lib/replay.py b/evals/lib/replay.py
new file mode 100644
index 0000000..95e826c
--- /dev/null
+++ b/evals/lib/replay.py
@@ -0,0 +1,29 @@
+"""Regrade a saved trace with no agent call — the zero-cost iteration loop.
+The trace filename stem is the prompt id (e.g. hw-07.trace.jsonl -> hw-07)."""
+from __future__ import annotations
+from pathlib import Path
+
+from evals.lib.config import load_skill
+from evals.lib.grading import grade
+from evals.lib.harness import get_adapter
+from evals.lib.models import EvalResult
+
+
+def _prompt_id_from_path(trace_path: Path) -> str:
+    return trace_path.name.partition(".")[0]  # filename text before the first dot
+
+
+def regrade(trace_path: Path, *, skill: str, platform: str) -> EvalResult:
+    """Re-parse a saved trace and grade it against the prompt in `skill` whose id is
+    the trace filename's text before the first dot. Raises ValueError if none matches."""
+    trace_path = Path(trace_path)
+    adapter = get_adapter(platform)
+    # Decode explicitly as UTF-8 so regrading does not depend on the host locale.
+    run = adapter.parse_stream(trace_path.read_text(encoding="utf-8"))
+    cfg = load_skill(skill)
+    prompt_id = _prompt_id_from_path(trace_path)
+    prompt = next((p for p in cfg.prompts if p.id == prompt_id), None)
+    if prompt is None:
+        raise ValueError(f"no prompt '{prompt_id}' in skill '{skill}'")
+    return grade(prompt, run, cfg.checks, platform=platform, skill=skill,
+                 did_trigger=adapter.detect_trigger(run, skill))
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
new file mode 100644
index 0000000..71e0c5e
--- /dev/null
+++ b/evals/lib/reporting.py
@@ -0,0 +1,235 @@
+"""Summaries + rich rendering for eval runs."""
+from __future__ import annotations
+import os
+import re
+from collections import Counter
+
+from rich.console import Console
+from rich.table import Table
+
+from evals.lib.models import CellReport, EvalResult, Verdict
+
+console = Console()
+DOT = {Verdict.PASS: "[green]● PASS[/]", Verdict.PASS_SLOW: "[yellow]◐ PASS-SLOW[/]",
+       Verdict.FAIL: "[red]○ FAIL[/]"}
+
+
+def build_summary(skill: str, platform: str, results: list[EvalResult]) -> dict:
+    correct = sum(1 for r in results if r.trigger_correct)
+    fp = [r.run_id for r in results if not r.should_trigger and r.did_trigger]
+    fn = [r.run_id for r in results if r.should_trigger and not r.did_trigger]
+    counts = Counter(r.verdict.value for r in results)
+    graded = [r for r in results if r.did_trigger and r.should_trigger]
+    avg = sum(r.score for r in graded) // len(graded) if graded else None
+    return {
+        "skill": skill, "platform": platform,
+        "trigger_accuracy": {"correct": correct, "total": len(results)},
+        "false_positives": fp, "false_negatives": fn,
+        "verdict_counts": dict(counts), "process_avg_score": avg,
+        "total_blocking_failures": sum(
+            1 for r in results for c in r.process_checks
+            if not c.passed and c.severity == "blocking"),
+    }
+
+
+def render_table(results: list[EvalResult]) -> None:
+    t = Table(show_edge=False, box=None, padding=(0, 2))
+    for col in ("ID", "Trigger", "Verdict", "Score", "Budget", "Cost"):
+        t.add_column(col)
+    for r in results:
+        trig = "[green]✓[/]" if r.trigger_correct else "[red]✗[/]"
+        budget = ", ".join(r.budget_breaches) or "—"
+        t.add_row(r.run_id, trig, DOT[r.verdict], str(r.score), budget,
+                  f"${r.cost_usd:.3f}")
+    console.print(t)
+
+
+def render_compare(rows: list[dict]) -> None:
+    """rows: {id, with_verdict, without_verdict, with_cost, without_cost}."""
+    t = Table(show_edge=False, box=None, padding=(0, 2))
+    for col in ("ID", "Without skill", "With skill", "Δ"):
+        t.add_column(col)
+    for row in rows:
+        w, wo = row["with_verdict"], row["without_verdict"]
+        delta = "[green]↑ lift[/]" if (wo == Verdict.FAIL and w != Verdict.FAIL) else (
+                "[red]↓ regress[/]" if (wo != Verdict.FAIL and w == Verdict.FAIL) else "=")
+        t.add_row(row["id"], DOT[wo], DOT[w], delta)
+    console.print(t)
+
+
+_BADGE_COLOR = {
+    "pass": "brightgreen", "pass-slow": "yellow", "fail": "red",
+    "regressed": "red", "fixed": "brightgreen", "changed": "blue",
+    "same": "lightgrey", "better": "brightgreen", "worse": "red",
+    "no-change": "lightgrey",
+}
+
+
+def badge(kind: str, label: str) -> str:
+    color = _BADGE_COLOR.get(kind, "lightgrey")  # unknown kinds fall back to grey
+    safe = label.replace("-", "--").replace("_", "__").replace(" ", "_")  # shields.io escapes
+    return f"![{label}](https://img.shields.io/badge/{safe}-{color})"
+
+
+_VERDICT_ICON = {"pass": "✅ PASS", "pass-slow": "◆ PASS-SLOW", "fail": "❌ FAIL"}
+
+
+def _row_rank(r: EvalResult) -> int:
+    # failures first (incl. trigger-incorrect), then slow, then pass
+    if r.verdict.value == "fail" or not r.trigger_correct:
+        return 0
+    if r.verdict.value == "pass-slow":
+        return 1
+    return 2
+
+
+def write_github_summary(md: str) -> None:
+    """Append `md` to the file named by $GITHUB_STEP_SUMMARY; no-op if unset or empty."""
+    target = os.environ.get("GITHUB_STEP_SUMMARY", "")
+    if target:
+        with open(target, "a", encoding="utf-8") as summary:
+            summary.write(md)
+
+
+_PLATFORM_ORDER = {p: i for i, p in
+                   enumerate(["claude-code", "codex", "cursor", "agy", "copilot"])}
+_PIVOT_ICON = {"pass": "✅", "pass-slow": "◆", "fail": "❌"}
+
+
+def _short_model(model: str) -> str:
+    """Column-header form of a model id: strip a trailing `-<6+ digits>` stamp and a
+    leading 'claude-'. 'claude-haiku-4-5-20251001' -> 'haiku-4-5'; 'o3' -> 'o3'."""
+    undated = re.sub(r"-\d{6,}$", "", model)
+    prefix = "claude-"
+    label = undated[len(prefix):] if undated.startswith(prefix) else undated
+    return label or model  # never hand back an empty label
+
+
+def _id_sort_key(run_id: str):
+    digits = re.search(r"(\d+)", run_id)  # first run of digits orders ids numerically
+    return (int(digits.group(1)), run_id) if digits else (0, run_id)
+
+
+def _fail_reason(r: EvalResult) -> str:
+    reason = (r.note or "").strip()
+    if not reason:
+        if not r.trigger_correct:
+            reason = "false-positive" if r.did_trigger else "false-negative"
+        elif r.budget_breaches:
+            reason = "; ".join(r.budget_breaches)
+        else:
+            reason = "blocking check failed"
+    reason = reason.replace("|", "/").replace("\n", " ").strip()
+    return reason[:69] + "…" if len(reason) > 70 else reason
+
+
+def _rubric_tag(r: EvalResult) -> str:
+    """Qualitative rubric badge woven into the cell: ` r85✓` / ` r55✗`.
+    Empty when the rubric didn't run for this prompt."""
+    if r.rubric is None:
+        return ""
+    if r.rubric.error:
+        return " r?"
+    return f" r{r.rubric.score}{'✓' if r.rubric.overall_pass else '✗'}"
+
+
+def _pivot_cell(r: EvalResult | None) -> str:
+    """One matrix cell: deterministic verdict emoji + a terse reason on non-pass,
+    with the qualitative rubric score (rNN✓/✗) appended when it ran."""
+    if r is None:
+        return "·"   # this harness/model didn't run this test
+    rub = _rubric_tag(r)
+    v = r.verdict.value
+    if v == "pass":
+        return f"{_PIVOT_ICON['pass']}{rub}"
+    if v == "pass-slow":
+        why = "; ".join(r.budget_breaches) or "slow"
+        return f"{_PIVOT_ICON['pass-slow']} — {why}"[:74] + rub
+    return f"{_PIVOT_ICON['fail']} — {_fail_reason(r)}{rub}"
+
+
+def render_digest(cells, baselines=None, lift=None) -> str:
+    """One aggregated pivot table for the whole matrix.
+
+    Rows are tests (skill/id); columns are platform-model combos; each cell is a
+    verdict emoji followed by a short reason on failures. Replaces the previous
+    per-cell tables so the Actions run summary holds a single table.
+    """
+    out = ["<!-- skill-eval-comment -->", "## Skill Eval Results\n"]
+    if not cells:
+        out.append("_No results._\n")
+        return "\n".join(out) + "\n"
+
+    cols = sorted({(c.platform, c.model) for c in cells},
+                  key=lambda pm: (_PLATFORM_ORDER.get(pm[0], 99), pm[1]))
+    col_label = {pm: f"{pm[0]}-{_short_model(pm[1])}" for pm in cols}
+
+    lookup: dict[tuple, EvalResult] = {}
+    row_keys: dict[tuple, bool] = {}
+    for c in cells:
+        for r in c.results:
+            lookup[(c.platform, c.model, c.skill, r.run_id)] = r
+            row_keys[(c.skill, r.run_id)] = True
+    skill_rank = {"hawkscan": 0, "api": 1}
+    rows = sorted(row_keys, key=lambda sr: (skill_rank.get(sr[0], 9), *_id_sort_key(sr[1])))
+
+    out.append("| test | " + " | ".join(col_label[pm] for pm in cols) + " |")
+    out.append("|---" * (len(cols) + 1) + "|")
+    for skill, rid in rows:
+        line = " | ".join(_pivot_cell(lookup.get((pm[0], pm[1], skill, rid)))
+                          for pm in cols)
+        out.append(f"| {skill}/{rid} | {line} |")
+    out.append("")
+    out.append("_Legend: ✅ pass · ◆ pass-slow · ❌ fail (reason follows) · `·` = not run. "
+               "`rNN✓/✗` = qualitative rubric score/verdict (when --rubric ran)._\n")
+
+    # Optional, compact extras (kept off the main table to avoid the old sprawl).
+    if baselines is None:
+        out.append("_No baseline available — showing absolute results only._\n")
+    else:
+        from evals.lib.baseline import diff as _diff, score_delta
+        notes = []
+        for c in cells:
+            base = baselines.get((c.platform, c.skill, c.model))
+            if base is None:
+                continue
+            tag = f"{c.platform}-{_short_model(c.model)}/{c.skill}"
+            for k, v in sorted(_diff(c, base).items()):
+                if v in ("regressed", "fixed", "changed"):
+                    notes.append(f"{badge(v, v)} {tag}:{k}")
+            g = [r for r in c.results if r.did_trigger and r.should_trigger]
+            bg = [r for r in base.results if r.did_trigger and r.should_trigger]
+            avg = sum(r.score for r in g) // len(g) if g else 0
+            bavg = sum(r.score for r in bg) // len(bg) if bg else 0
+            delta = score_delta(avg, bavg)
+            if delta in ("better", "worse"):
+                notes.append(f"{badge(delta, delta)} {tag}")
+        out.append(("**vs baseline:** " + ", ".join(notes) + "\n") if notes
+                   else "_vs baseline: no changes._\n")
+
+    if lift:
+        out.append("\n### Skill lift (with vs without)\n")
+        for key, rws in lift.items():
+            lifted = sum(1 for r in rws if r["effect"] == "lift")
+            out.append(f"**{key[0]} · {key[1]} · {key[2]}** — "
+                       f"{lifted}/{len(rws)} prompts lifted FAIL→PASS\n")
+    return "\n".join(out) + "\n"
+
+
+def render_job_summary(cell: CellReport) -> str:
+    """Markdown summary for one cell: a heading with verdict counts and trigger
+    accuracy, then a table of tests (failures first) with the reason for each."""
+    c = Counter(r.verdict.value for r in cell.results)
+    trig_ok = sum(1 for r in cell.results if r.trigger_correct)
+    head = (f"### {cell.platform} · {cell.skill} · {cell.model}  "
+            f"— ✅ {c.get('pass',0)} / ◆ {c.get('pass-slow',0)} / "
+            f"❌ {c.get('fail',0)}  ·  {c.get('fail',0)} failed  ·  "
+            f"trigger {trig_ok}/{len(cell.results)}\n\n")
+    rows = ["| test | result | why |", "|---|---|---|"]
+    for r in sorted(cell.results, key=lambda r: (_row_rank(r), r.run_id)):
+        parts = ["; ".join(r.budget_breaches) or ("" if r.trigger_correct else
+                 ("false-positive" if r.did_trigger else "false-negative")), r.note]
+        # `|` and newlines in a note would break the markdown table row — flatten them.
+        why = " — ".join(p for p in parts if p).replace("|", "/").replace("\n", " ")
+        rows.append(f"| {r.run_id} | {_VERDICT_ICON[r.verdict.value]} | {why} |")
+    return head + "\n".join(rows) + "\n"
diff --git a/evals/lib/rubric.py b/evals/lib/rubric.py
new file mode 100644
index 0000000..464569a
--- /dev/null
+++ b/evals/lib/rubric.py
@@ -0,0 +1,126 @@
+"""Qualitative, model-assisted rubric grader.
+
+Ported from origin/main's `--rubric` pass (evals/harnesses/*/run-evals.py).
+A grader model (claude) reviews an agent run's transcript against the skill's
+rubric-items.json and returns a structured 0-100 quality score + per-item
+pass/fail. This is the QUALITATIVE axis that complements the deterministic
+process-checks, and it's woven into the pass/fail table by the reporter.
+
+The grader judges text only, so it is platform-independent: every harness's
+transcript is graded by the same claude grader. Requires ANTHROPIC_API_KEY.
+"""
+from __future__ import annotations
+import json
+import re
+import subprocess
+from pathlib import Path
+
+from evals.lib.models import ParsedRun, RubricResult, RubricCheckResult
+
+EVALS_DIR = Path(__file__).resolve().parent.parent  # repo/evals
+
+
+def _extract_json_object(text: str) -> dict:
+    """Parse a JSON object out of a grader reply that may be pure JSON, wrapped in
+    a ```json fence, or embedded in prose (e.g. "No skills needed.\\n\\n```json
+    {...}```"). Tries direct parse, then a fenced block, then every `{` in turn
+    until one starts a decodable object. Raises ValueError if none is found."""
+    text = text.strip()
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+    fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.S)
+    if fence:
+        return json.loads(fence.group(1))
+    # raw_decode is string-aware, so a `}` inside a JSON string no longer ends the
+    # object early, and a non-JSON brace span in the prose (e.g. "{placeholder}")
+    # is skipped instead of aborting the search.
+    decoder = json.JSONDecoder()
+    start = text.find("{")
+    while start != -1:
+        try:
+            return decoder.raw_decode(text, start)[0]
+        except json.JSONDecodeError:
+            start = text.find("{", start + 1)
+    raise ValueError(f"no JSON object in grader result: {text[:120]}")
+
+
+def _build_prompt(rubric_data: dict, run: ParsedRun, skill: str, run_id: str) -> str:
+    return f"""{rubric_data['grader_prompt']}
+
+## Bash Commands Executed:
+{json.dumps(run.bash_commands, indent=2)}
+
+## Files Written/Edited:
+{json.dumps(run.files_written + run.files_edited, indent=2)}
+
+## Agent Output (first 4000 chars):
+{run.output_text[:4000]}
+
+## Rubric Checks to Grade:
+{json.dumps(rubric_data['checks'], indent=2)}
+
+Populate the JSON result with:
+  skill = "{skill}"
+  run_id = "{run_id}"
+  overall_pass = true if all checks pass and score >= 70
+  score = 0-100 (each failed check deducts: blocking 15, warning 5)
+  checks = one entry per check id listed above"""
+
+
+# Cheap, capable grader by default — judging a transcript against a rubric is a
+# structured classification task. Budget must cover the full prompt (transcript +
+# rubric + schema); 0.10 hit error_max_budget_usd, so use a roomier cap.
+DEFAULT_GRADER_MODEL = "claude-haiku-4-5-20251001"
+GRADER_BUDGET_USD = "0.25"
+
+
+def grade_rubric(run: ParsedRun, skill: str, run_id: str, *,
+                 grader_model: str | None = None, timeout: int = 120,
+                 base_dir: Path | None = None) -> RubricResult | None:
+    """Run the qualitative grader. Returns a RubricResult, or None if the rubric
+    config is absent. On grader failure returns a RubricResult with error set so
+    the run still records a (failed) rubric cell rather than silently dropping it."""
+    base = base_dir or EVALS_DIR
+    rubric_path = base / skill / "rubric-items.json"
+    schema_path = base / "rubric-schema.json"
+    if not rubric_path.exists() or not schema_path.exists():
+        return None
+    rubric_data = json.loads(rubric_path.read_text(encoding="utf-8"))
+    schema = json.loads(schema_path.read_text(encoding="utf-8"))
+
+    # NOTE: no --bare here. --bare ("minimal mode") suppresses the structured
+    # --json-schema output (returns an empty result), so the grader must run in
+    # full mode. It's a one-shot text judge; no plugin-dir needed.
+    cmd = ["claude", "-p", _build_prompt(rubric_data, run, skill, run_id),
+           "--output-format", "json", "--no-session-persistence",
+           "--json-schema", json.dumps(schema),
+           "--max-budget-usd", GRADER_BUDGET_USD,
+           "--model", grader_model or DEFAULT_GRADER_MODEL]
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+        if not proc.stdout.strip():
+            # claude produced nothing on stdout — surface the real cause (exit
+            # code + stderr) instead of a misleading JSONDecodeError downstream.
+            tail = (proc.stderr or "").strip()[-200:]
+            raise ValueError(f"grader produced no output (exit {proc.returncode}): {tail}")
+        envelope = json.loads(proc.stdout)
+        # --output-format json wraps as {"result": "<json|obj>", ...}; some modes
+        # return the schema object directly. Handle both.
+        raw = envelope.get("result", envelope) if isinstance(envelope, dict) else envelope
+        # `raw` may be a dict already, or a string that is pure JSON, or — even with
+        # --json-schema — a model reply that wraps the JSON in prose / a ```json
+        # fence. Extract the object tolerantly.
+        result = raw if isinstance(raw, dict) else _extract_json_object(raw)
+        if "score" not in result and "overall_pass" not in result:
+            raise ValueError(f"grader returned no rubric fields: {str(result)[:120]}")
+        # Shape the result inside the try: a malformed field is a grader failure too.
+        checks = [RubricCheckResult(id=c.get("id", "?"), passed=bool(c.get("pass")),
+                                    notes=c.get("notes", ""))
+                  for c in result.get("checks", [])]
+        return RubricResult(overall_pass=bool(result.get("overall_pass")),
+                            score=int(result.get("score", 0)), checks=checks)
+    except Exception as exc:  # noqa: BLE001 — grader is best-effort
+        return RubricResult(overall_pass=False, score=0, checks=[],
+                            error=f"grader failed: {type(exc).__name__}: {exc}")
diff --git a/evals/lib/triggers.py b/evals/lib/triggers.py
new file mode 100644
index 0000000..af71077
--- /dev/null
+++ b/evals/lib/triggers.py
@@ -0,0 +1,73 @@
+"""Shared trigger-decision helpers used by every harness adapter.
+
+The agents declare a decision line under the observe suffix, e.g.
+`hawkscan:hawkscan: YES` or `none: NO`. That explicit declaration is the agent's
+considered verdict and must be authoritative — it should not be overridden by the
+looser behavioral phrases in INVOCATION_SIGNALS (e.g. "security scan after"), which
+frequently appear because the agent is *quoting the user's negative instruction*
+("Don't run a security scan after this change"). Treating the explicit decline as
+authoritative removes that class of false positive.
+"""
+from __future__ import annotations
+import re
+
+# How the agent names each skill in its decision line. Full `plugin:skill` form
+# first (most specific), then the bare skill name. Hyphens are literal here, so we
+# never normalize them away (would corrupt `stackhawk-api`).
+_DECL_NAMES = {
+    "hawkscan": ["hawkscan:hawkscan", "hawkscan"],
+    "api": ["stackhawk-api:api", "stackhawk-api"],
+    "stackhawk-data-seed": ["stackhawk-data-seed:stackhawk-data-seed",
+                            "stackhawk-data-seed"],
+}
+
+# Decision separator between the skill name and YES/NO: colon, hyphen, en/em dash.
+_SEP = r"\s*[:\-–—]\s*"
+
+
+# Phrases an agent uses to decline a skill without the literal `: NO`, e.g.
+# "`hawkscan:hawkscan` does not apply".
+_DECLINE = r"(?:does ?n.?t apply|not applicable|not needed|n/a)"
+
+
+def explicit_decision(text: str, skill: str) -> str | None:
+    """Classify the agent's explicit decision about `skill` as 'yes', 'no', or None.
+    'yes': a `<name><sep>YES` declaration for this skill. 'no': a `none<sep>NO`
+    line, a `<name><sep>NO` declaration, a decline phrase right after the name, or
+    a YES declared for a different known skill. None: no explicit decision found.
+    The text is lower-cased and stripped of `*`, backtick and `_` before matching."""
+    cleaned = re.sub(r"[*`_]+", "", text.lower())
+    own_names = _DECL_NAMES.get(skill, [skill])
+
+    def says(name: str, verdict: str) -> bool:
+        return bool(re.search(re.escape(name) + _SEP + verdict + r"\b", cleaned))
+
+    if any(says(n, "yes") for n in own_names):
+        return "yes"
+    declined_globally = re.search(r"\bnone" + _SEP + r"no\b", cleaned) is not None
+    declined_by_name = any(says(n, "no") for n in own_names)
+    declined_by_phrase = any(
+        re.search(re.escape(n) + r"\W+" + _DECLINE, cleaned) for n in own_names)
+    if declined_globally or declined_by_name or declined_by_phrase:
+        return "no"
+    # A YES for some other known skill means this one was passed over.
+    rival_names = [n for other, names in _DECL_NAMES.items() if other != skill
+                   for n in names]
+    if any(says(n, "yes") for n in rival_names):
+        return "no"
+    return None
+
+
+def decide_trigger(*, executed_cli: bool, declared: str | None, loose_hit: bool) -> bool:
+    """Resolve the trigger verdict from three signals, strongest first:
+      1. `executed_cli` — the CLI actually ran, so the skill fired.
+      2. `declared` — an explicit 'yes'/'no' decision line settles it.
+      3. `loose_hit` — the behavioral phrase match, used only when neither applies.
+    """
+    if executed_cli:
+        return True
+    if declared in ("yes", "no"):
+        # an explicit declaration outranks the loose phrase match
+        return declared == "yes"
+    # no execution and no declaration: the loose match is all that is left
+    return loose_hit
diff --git a/evals/stackhawk-data-seed/prompts.csv b/evals/stackhawk-data-seed/prompts.csv
deleted file mode 100644
index afa2673..0000000
--- a/evals/stackhawk-data-seed/prompts.csv
+++ /dev/null
@@ -1,17 +0,0 @@
-id,should_trigger,invocation_type,prompt,notes
-ds-01,true,explicit,"Use the $stackhawk-data-seed skill to set up seed data for my repo","Direct skill reference by name"
-ds-02,true,implicit,"Set up data for HawkScan in this repo","Top-line trigger phrase from skill description"
-ds-03,true,implicit,"My HawkScan run has no data to hit — can you seed it?","'no data to hit' keyword from description"
-ds-04,true,implicit,"Seed this repo so HawkScan can authenticate and scan real paths","'seed this repo' phrasing"
-ds-05,true,implicit,"I'm setting up HawkScan for the first time on this service — generate the seed data","First-time-setup phrasing"
-ds-06,true,contextual,"HawkScan is logging in but every endpoint returns an empty list. Fix it.","Symptom-only prompt — the fix is seed data, not auth. Hardest trigger case: requires reasoning from symptom to root cause (empty data) instead of defaulting to scan/auth diagnosis. Failure here is diagnostic — likely means the skill description needs to surface the empty-results symptom explicitly."
-ds-07,true,contextual,"We added a new upstream auth-service repo. Refresh the test fixtures so HawkScan still works.","Augment existing seed when data shape changed"
-ds-08,true,contextual,"Generate idempotent SQL inserts so my scan has a logged-in user, an org, and at least one app to hit","Describes the artifact, not the skill"
-ds-09,false,negative,"Scan my API for vulnerabilities","Scan request → hawkscan skill"
-ds-10,false,negative,"Show me my untriaged findings","Reporting query → stackhawk-api skill"
-ds-11,false,negative,"Configure HawkScan in my GitHub Actions pipeline","CI config → hawkscan-ci skill"
-ds-12,false,negative,"I just fixed a bug in the payment endpoint","Routine code change — skill is NOT autonomous; must not fire on post-change hook"
-ds-13,false,negative,"Refactor the user service to use repositories instead of direct queries","Refactor — not autonomous; must not fire"
-ds-14,false,negative,"Bootstrap a new Spring Boot project for me","Generic 'bootstrap' meaning project scaffolding — not our skill's domain"
-ds-15,false,negative,"Write a database migration to add a 'role' column to users","DB migration is not seed data; not our skill's domain"
-ds-16,false,negative,"Start the docker-compose stack and run the integration tests","Environment startup — skill explicitly forbids running services"
diff --git a/evals/stackhawk-data-seed/prompts.yaml b/evals/stackhawk-data-seed/prompts.yaml
new file mode 100644
index 0000000..5717341
--- /dev/null
+++ b/evals/stackhawk-data-seed/prompts.yaml
@@ -0,0 +1,80 @@
+- id: ds-01
+  should_trigger: true
+  invocation_type: explicit
+  prompt: Use the $stackhawk-data-seed skill to set up seed data for my repo
+  notes: Direct skill reference by name
+- id: ds-02
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Set up data for HawkScan in this repo
+  notes: Top-line trigger phrase from skill description
+- id: ds-03
+  should_trigger: true
+  invocation_type: implicit
+  prompt: My HawkScan run has no data to hit — can you seed it?
+  notes: '''no data to hit'' keyword from description'
+- id: ds-04
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Seed this repo so HawkScan can authenticate and scan real paths
+  notes: '''seed this repo'' phrasing'
+- id: ds-05
+  should_trigger: true
+  invocation_type: implicit
+  prompt: I'm setting up HawkScan for the first time on this service — generate the seed data
+  notes: First-time-setup phrasing
+- id: ds-06
+  should_trigger: true
+  invocation_type: contextual
+  prompt: HawkScan is logging in but every endpoint returns an empty list. Fix it.
+  notes: 'Symptom-only prompt — the fix is seed data, not auth. Hardest trigger case: requires reasoning from symptom to root cause (empty data) instead of defaulting to scan/auth diagnosis. Failure here is diagnostic — likely means the skill description needs to surface the empty-results symptom explicitly.'
+- id: ds-07
+  should_trigger: true
+  invocation_type: contextual
+  prompt: We added a new upstream auth-service repo. Refresh the test fixtures so HawkScan still works.
+  notes: Augment existing seed when data shape changed
+- id: ds-08
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Generate idempotent SQL inserts so my scan has a logged-in user, an org, and at least one app to hit
+  notes: Describes the artifact, not the skill
+- id: ds-09
+  should_trigger: false
+  invocation_type: negative
+  prompt: Scan my API for vulnerabilities
+  notes: Scan request → hawkscan skill
+- id: ds-10
+  should_trigger: false
+  invocation_type: negative
+  prompt: Show me my untriaged findings
+  notes: Reporting query → stackhawk-api skill
+- id: ds-11
+  should_trigger: false
+  invocation_type: negative
+  prompt: Configure HawkScan in my GitHub Actions pipeline
+  notes: CI config → hawkscan-ci skill
+- id: ds-12
+  should_trigger: false
+  invocation_type: negative
+  prompt: I just fixed a bug in the payment endpoint
+  notes: Routine code change — skill is NOT autonomous; must not fire on post-change hook
+- id: ds-13
+  should_trigger: false
+  invocation_type: negative
+  prompt: Refactor the user service to use repositories instead of direct queries
+  notes: Refactor — not autonomous; must not fire
+- id: ds-14
+  should_trigger: false
+  invocation_type: negative
+  prompt: Bootstrap a new Spring Boot project for me
+  notes: Generic 'bootstrap' meaning project scaffolding — not our skill's domain
+- id: ds-15
+  should_trigger: false
+  invocation_type: negative
+  prompt: Write a database migration to add a 'role' column to users
+  notes: DB migration is not seed data; not our skill's domain
+- id: ds-16
+  should_trigger: false
+  invocation_type: negative
+  prompt: Start the docker-compose stack and run the integration tests
+  notes: Environment startup — skill explicitly forbids running services
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b87b331
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,30 @@
+[project]
+name = "agent-skills-evals"
+version = "0.1.0"
+description = "Eval harness + shared grading lib for StackHawk agent skills"
+requires-python = ">=3.11"
+dependencies = [
+    "pydantic>=2.6",
+    "pyyaml>=6.0",
+    "rich>=13.0",
+]
+
+[dependency-groups]
+dev = ["pytest>=8.0"]
+
+[project.scripts]
+evals = "evals.cli:main"
+compare = "evals.cli:compare"
+regrade = "evals.cli:regrade"
+validate = "evals.cli:validate"
+report = "evals.cli:report"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["evals"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/scripts/migrate_prompts.py b/scripts/migrate_prompts.py
new file mode 100644
index 0000000..3498fe3
--- /dev/null
+++ b/scripts/migrate_prompts.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+"""One-time migration of evals/<skill>/prompts.csv -> prompts.yaml. Idempotent for an
+unchanged CSV, but every run overwrites prompts.yaml. Preserves id, should_trigger (bool),
+invocation_type, prompt, notes. Adds no budgets or expected[] — author those by hand afterward."""
+from __future__ import annotations
+import csv
+import sys
+from pathlib import Path
+
+import yaml
+
+EVALS_DIR = Path(__file__).resolve().parent.parent / "evals"
+
+
+def migrate(skill: str) -> None:
+    csv_path = EVALS_DIR / skill / "prompts.csv"
+    yaml_path = EVALS_DIR / skill / "prompts.yaml"
+    rows = []
+    with open(csv_path) as f:
+        for r in csv.DictReader(f):
+            rows.append({
+                "id": r["id"],
+                "should_trigger": r["should_trigger"].strip().lower() == "true",
+                "invocation_type": r["invocation_type"],
+                "prompt": r["prompt"],
+                "notes": r.get("notes", ""),
+            })
+    yaml_path.write_text(yaml.safe_dump(rows, sort_keys=False, width=100,
+                                        allow_unicode=True))
+    print(f"wrote {yaml_path} ({len(rows)} prompts)")
+
+
+if __name__ == "__main__":
+    for skill in (sys.argv[1:] or ["hawkscan", "api"]):  # no CLI args -> migrate these two skills
+        migrate(skill)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/fixtures/hw-07.trace.jsonl b/tests/fixtures/hw-07.trace.jsonl
new file mode 100644
index 0000000..6869787
--- /dev/null
+++ b/tests/fixtures/hw-07.trace.jsonl
@@ -0,0 +1,4 @@
+{"type":"assistant","message":{"content":[{"type":"text","text":"`hawkscan:hawkscan`: YES — running the security scan after the code change. First I'll verify the CLI and that the app is running and reachable on localhost port 8080."},{"type":"tool_use","name":"Bash","input":{"command":"hawk version"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk config --help"}}]}}
+{"type":"assistant","message":{"content":[{"type":"text","text":"Confirming the app and environment exist and credentials are set."},{"type":"tool_use","name":"Bash","input":{"command":"hawkop app list"}},{"type":"tool_use","name":"Bash","input":{"command":"hawkop env list"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk init"}}]}}
+{"type":"assistant","message":{"content":[{"type":"text","text":"Validating config, then running the scan synchronously."},{"type":"tool_use","name":"Bash","input":{"command":"hawk validate config stackhawk.yml"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk scan --env Development"}}]}}
+{"type":"result","result":"Scan complete. The application was reachable on localhost:8080 throughout.","cost_usd":0.07,"subtype":"success"}
diff --git a/tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json b/tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json
new file mode 100644
index 0000000..100a650
--- /dev/null
+++ b/tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json
@@ -0,0 +1,5 @@
+{"platform":"claude-code","skill":"hawkscan","model":"haiku","commit":"abc1234",
+ "results":[
+   {"platform":"claude-code","skill":"hawkscan","run_id":"hw-01","should_trigger":true,"did_trigger":true,"trigger_correct":true,"verdict":"pass","budget_breaches":[],"process_checks":[],"score":100,"cost_usd":0.05},
+   {"platform":"claude-code","skill":"hawkscan","run_id":"hw-14","should_trigger":false,"did_trigger":true,"trigger_correct":false,"verdict":"fail","budget_breaches":[],"process_checks":[],"score":0,"cost_usd":0.02}
+ ]}
diff --git a/tests/fixtures/results/eval-codex-api-haiku/cell.json b/tests/fixtures/results/eval-codex-api-haiku/cell.json
new file mode 100644
index 0000000..1343366
--- /dev/null
+++ b/tests/fixtures/results/eval-codex-api-haiku/cell.json
@@ -0,0 +1,4 @@
+{"platform":"codex","skill":"api","model":"haiku","commit":"abc1234",
+ "results":[
+   {"platform":"codex","skill":"api","run_id":"api-01","should_trigger":true,"did_trigger":true,"trigger_correct":true,"verdict":"pass","budget_breaches":[],"process_checks":[],"score":100,"cost_usd":0.04}
+ ]}
diff --git a/tests/fixtures/streams/agy.txt b/tests/fixtures/streams/agy.txt
new file mode 100644
index 0000000..2726a9e
--- /dev/null
+++ b/tests/fixtures/streams/agy.txt
@@ -0,0 +1,2 @@
+`hawkscan:hawkscan`: YES — running the security scan.
+I ran `hawk scan --env Development`; the app was reachable on localhost:8080.
diff --git a/tests/fixtures/streams/codex.txt b/tests/fixtures/streams/codex.txt
new file mode 100644
index 0000000..048da79
--- /dev/null
+++ b/tests/fixtures/streams/codex.txt
@@ -0,0 +1,4 @@
+{"type":"item.started","item":{"type":"command_execution","command":"hawk validate config stackhawk.yml"}}
+{"type":"item.started","item":{"type":"command_execution","command":"hawk scan --env Development"}}
+{"type":"item.completed","item":{"type":"agent_message","text":"Running the security scan; app reachable on localhost:8080."}}
+{"type":"turn.completed","usage":{"input_tokens":1200,"output_tokens":340}}
diff --git a/tests/fixtures/streams/cursor.txt b/tests/fixtures/streams/cursor.txt
new file mode 100644
index 0000000..2dfe9ee
--- /dev/null
+++ b/tests/fixtures/streams/cursor.txt
@@ -0,0 +1,3 @@
+{"type":"tool_call","subtype":"started","tool_call":{"shellToolCall":{"args":{"command":"hawk scan --env Development"}}}}
+{"type":"assistant","message":{"content":[{"type":"text","text":"Running HawkScan against the app on localhost:8080."}]}}
+{"type":"result","usage":{"inputTokens":950,"outputTokens":210},"is_error":false}
diff --git a/tests/lib/__init__.py b/tests/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/lib/test_adapters.py b/tests/lib/test_adapters.py
new file mode 100644
index 0000000..9b68462
--- /dev/null
+++ b/tests/lib/test_adapters.py
@@ -0,0 +1,85 @@
+import importlib.util
+from pathlib import Path
+from evals.lib.harness import get_adapter
+from evals.lib.models import ParsedRun
+
+FIX = Path(__file__).parent.parent / "fixtures" / "streams"
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+
+
+def _load_adapter_module(platform: str):
+    path = REPO_ROOT / "evals" / "harnesses" / platform / "adapter.py"
+    spec = importlib.util.spec_from_file_location(f"_t_adapter_{platform}", path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+
+
+def test_codex_parse_stream():
+    cx = get_adapter("codex")
+    run = cx.parse_stream((FIX / "codex.txt").read_text())
+    assert isinstance(run, ParsedRun)
+    assert "hawk validate config stackhawk.yml" in run.bash_commands
+    assert "hawk scan --env Development" in run.bash_commands
+    assert "localhost:8080" in run.output_text
+    assert run.output_tokens == 340
+
+
+def test_codex_detect_trigger():
+    cx = get_adapter("codex")
+    run = ParsedRun(bash_commands=["hawk scan --env Development"])
+    assert cx.detect_trigger(run, "hawkscan") is True
+    assert cx.detect_trigger(ParsedRun(bash_commands=["echo hi"]), "hawkscan") is False
+
+
+def test_cursor_parse_stream():
+    cu = get_adapter("cursor")
+    run = cu.parse_stream((FIX / "cursor.txt").read_text())
+    assert "hawk scan --env Development" in run.bash_commands
+    assert "localhost:8080" in run.output_text
+
+
+def test_cursor_detect_trigger():
+    cu = get_adapter("cursor")
+    assert cu.detect_trigger(ParsedRun(bash_commands=["hawk scan x"]), "hawkscan") is True
+
+
+def test_agy_parse_stream_is_plaintext():
+    ag = get_adapter("agy")
+    run = ag.parse_stream((FIX / "agy.txt").read_text())
+    assert run.bash_commands == []
+    assert "hawk scan --env Development" in run.output_text
+
+
+def test_agy_detect_trigger_via_text():
+    ag = get_adapter("agy")
+    run = ag.parse_stream((FIX / "agy.txt").read_text())
+    assert ag.detect_trigger(run, "hawkscan") is True
+
+
+def test_claude_code_parses_total_cost_usd():
+    import json
+    cc = get_adapter("claude-code")
+    lines = [
+        json.dumps({"type":"assistant","message":{"content":[{"type":"text","text":"hi"}]}}),
+        json.dumps({"type":"result","result":"done","total_cost_usd":0.123,"subtype":"success"}),
+    ]
+    run = cc.parse_stream("\n".join(lines))
+    assert abs(run.cost_usd - 0.123) < 1e-9
+
+
+def test_agy_observe_suffix_and_skill_signal():
+    ag = get_adapter("agy")
+    # The legacy `SKILL: hawkscan` declaration format must still be detected (it's
+    # retained as a loose INVOCATION_SIGNAL fallback).
+    run = ag.parse_stream("I would use SKILL: hawkscan for this task.")
+    assert ag.detect_trigger(run, "hawkscan") is True
+    # agy now uses the shared per-skill observe suffix, which requests the
+    # `plugin:skill: YES`/`none: NO` decision line and a full workflow walkthrough.
+    from evals.lib.observe import observe_suffix
+    suffix = observe_suffix("hawkscan")
+    assert suffix.strip()
+    assert "hawkscan:hawkscan: YES" in suffix
+    # The new decision line is recognized as an explicit trigger.
+    run2 = ag.parse_stream("**hawkscan:hawkscan: YES** — running the scan workflow")
+    assert ag.detect_trigger(run2, "hawkscan") is True
diff --git a/tests/lib/test_baseline.py b/tests/lib/test_baseline.py
new file mode 100644
index 0000000..727f270
--- /dev/null
+++ b/tests/lib/test_baseline.py
@@ -0,0 +1,32 @@
+from evals.lib.models import CellReport, EvalResult, Verdict
+from evals.lib.baseline import diff, score_delta
+
+
+def _cell(verdicts: dict):
+    results = [EvalResult(platform="p", skill="s", run_id=k, should_trigger=True,
+                          did_trigger=True, trigger_correct=True, verdict=v, score=100)
+               for k, v in verdicts.items()]
+    return CellReport(platform="p", skill="s", model="m", commit="c", results=results)
+
+
+def test_diff_statuses():
+    base = _cell({"a": Verdict.PASS, "b": Verdict.FAIL, "c": Verdict.PASS, "d": Verdict.PASS})
+    cur = _cell({"a": Verdict.FAIL, "b": Verdict.PASS, "c": Verdict.PASS, "e": Verdict.PASS})
+    d = diff(cur, base)
+    assert d["a"] == "regressed"
+    assert d["b"] == "fixed"
+    assert d["c"] == "same"
+    assert d["e"] == "new"
+    assert d["d"] == "dropped"
+
+
+def test_diff_changed_non_fail():
+    base = _cell({"a": Verdict.PASS})
+    cur = _cell({"a": Verdict.PASS_SLOW})
+    assert diff(cur, base)["a"] == "changed"
+
+
+def test_score_delta_bands():
+    assert score_delta(90, 88) == "no-change"
+    assert score_delta(95, 88) == "better"
+    assert score_delta(80, 88) == "worse"
diff --git a/tests/lib/test_cli_resilience.py b/tests/lib/test_cli_resilience.py
new file mode 100644
index 0000000..9668e4f
--- /dev/null
+++ b/tests/lib/test_cli_resilience.py
@@ -0,0 +1,41 @@
+import json
+from pathlib import Path
+import pytest
+import evals.cli as cli_mod
+
+
+class BoomAdapter:
+    platform = "boom"
+
+    def cli_signals(self, s):
+        return []
+
+    def invocation_signals(self, s):
+        return []
+
+    def parse_stream(self, raw):
+        from evals.lib.models import ParsedRun
+        return ParsedRun()
+
+    def detect_trigger(self, run, s):
+        return False
+
+    def launch(self, *a, **k):
+        raise FileNotFoundError("agent: command not found")
+
+
+def test_main_survives_launch_crash(monkeypatch, tmp_path):
+    # Point results at a temp dir and force the boom adapter + a tiny prompt set.
+    monkeypatch.setattr(cli_mod, "get_adapter", lambda p: BoomAdapter())
+    monkeypatch.setattr(cli_mod, "RESULTS_ROOT", tmp_path)
+    monkeypatch.setattr("sys.argv", ["evals", "--harness", "claude-code", "--skill", "hawkscan"])
+    with pytest.raises(SystemExit):   # FP/FN cause sys.exit(1) — that's fine
+        cli_mod.main()
+    # The cell + summary were still written despite every launch crashing:
+    out = tmp_path / "claude-code" / "results" / "hawkscan"
+    assert (out / "cell.json").exists()
+    assert (out / "summary.json").exists()
+    cell = json.loads((out / "cell.json").read_text())
+    assert len(cell["results"]) == 20            # all hawkscan prompts graded
+    # positive prompts failed with a harness note; at least one note mentions the crash
+    assert any("command not found" in r.get("note", "") for r in cell["results"])
diff --git a/tests/lib/test_compare.py b/tests/lib/test_compare.py
new file mode 100644
index 0000000..4adb5cf
--- /dev/null
+++ b/tests/lib/test_compare.py
@@ -0,0 +1,65 @@
+# tests/lib/test_compare.py
+from evals.lib.models import ParsedRun, Verdict
+from evals.lib import compare as compare_mod
+
+
+# A realistic skill-loaded hawkscan run: preflight + step1 discovery + config
+# validation + synchronous scan, with output mentioning the app is reachable.
+# This satisfies hawkscan's blocking process-checks, the way a real run would.
+_WITH_SKILL = ParsedRun(
+    bash_commands=[
+        "hawk version",
+        "hawk config --help",
+        "hawkop app list",
+        "hawkop env list",
+        "hawk init",
+        "hawk validate config stackhawk.yml",
+        "hawk scan --env Development",
+    ],
+    output_text="The application was running and reachable on localhost:8080.",
+    cost_usd=0.05,  # asserted as with_cost in test_compare_shows_lift
+)
+_WITHOUT_SKILL = ParsedRun(bash_commands=["echo idk"], cost_usd=0.02)  # skill-less run: one non-hawk command
+
+
+class StubAdapter:
+    platform = "stub"
+    def cli_signals(self, skill): return ["hawk scan"]
+    def invocation_signals(self, skill): return []
+    def parse_stream(self, raw): return ParsedRun()
+    def detect_trigger(self, run, skill):
+        return any("hawk scan" in c for c in run.bash_commands)
+    def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+               max_budget, bare, full_auto):
+        return _WITH_SKILL if load_skill else _WITHOUT_SKILL
+
+
+def test_compare_shows_lift(monkeypatch):
+    monkeypatch.setattr(compare_mod, "get_adapter", lambda p: StubAdapter())
+    rows = compare_mod.compare_skill("hawkscan", "stub", only_id="hw-01")
+    row = rows[0]
+    assert row["without_verdict"] == Verdict.FAIL          # no skill -> blocking checks fail
+    assert row["with_verdict"] in (Verdict.PASS, Verdict.PASS_SLOW)  # skill -> workflow satisfied
+    assert row["with_cost"] == 0.05 and row["without_cost"] == 0.02
+
+
+def test_compare_skill_returns_lift_effect(monkeypatch):
+    from evals.lib.models import ParsedRun, Verdict
+    from evals.lib import compare as compare_mod
+
+    class Stub:
+        platform = "stub"
+        def cli_signals(self, s): return ["hawk scan"]
+        def invocation_signals(self, s): return []
+        def parse_stream(self, raw): return ParsedRun()
+        def detect_trigger(self, run, s): return any("hawk scan" in c for c in run.bash_commands)
+        def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+                   max_budget, bare, full_auto):
+            return (ParsedRun(bash_commands=["hawk version","hawk config --help",
+                    "hawkop app list","hawkop env list","hawk init",
+                    "hawk validate config stackhawk.yml","hawk scan"],
+                    output_text="reachable on localhost:8080") if load_skill
+                    else ParsedRun(bash_commands=["echo idk"]))
+    monkeypatch.setattr(compare_mod, "get_adapter", lambda p: Stub())
+    rows = compare_mod.compare_skill("hawkscan", "stub", only_id="hw-01")
+    assert rows[0]["effect"] == "lift"
diff --git a/tests/lib/test_config.py b/tests/lib/test_config.py
new file mode 100644
index 0000000..8f64c2e
--- /dev/null
+++ b/tests/lib/test_config.py
@@ -0,0 +1,82 @@
+# tests/lib/test_config.py
+import json
+import textwrap
+import pytest
+from pydantic import ValidationError
+from evals.lib.config import load_skill, SkillConfig
+
+
+def _write_skill(tmp_path, prompts_yaml: str, checks: dict):
+    skill_dir = tmp_path / "demo"
+    skill_dir.mkdir()
+    (skill_dir / "prompts.yaml").write_text(prompts_yaml)
+    (skill_dir / "process-checks.json").write_text(json.dumps(checks))
+    return skill_dir
+
+
+def test_load_skill_parses_prompts_and_checks(tmp_path):
+    yaml_text = textwrap.dedent("""
+      - id: d-01
+        should_trigger: true
+        invocation_type: explicit
+        prompt: do the thing
+        budget:
+          bash_commands: 5
+        expected:
+          - signal: "hawk scan"
+    """)
+    checks = {"skill": "demo", "checks": [
+        {"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+         "severity": "blocking"}]}
+    skill_dir = _write_skill(tmp_path, yaml_text, checks)
+
+    cfg = load_skill("demo", base_dir=skill_dir.parent)
+    assert isinstance(cfg, SkillConfig)
+    assert cfg.skill == "demo"
+    assert len(cfg.prompts) == 1
+    assert cfg.prompts[0].budget.bash_commands == 5
+    assert cfg.checks[0]["id"] == "c1"
+
+
+def test_load_skill_rejects_bad_prompt_field(tmp_path):
+    yaml_text = textwrap.dedent("""
+      - id: d-01
+        should_trigger: true
+        invocation_type: explicit
+        prompt: x
+        budget_usd: 0.1
+    """)
+    skill_dir = _write_skill(tmp_path, yaml_text, {"skill": "demo", "checks": []})
+    with pytest.raises(ValidationError):
+        load_skill("demo", base_dir=skill_dir.parent)
+
+
+def test_load_skill_rejects_duplicate_ids(tmp_path):
+    yaml_text = textwrap.dedent("""
+      - id: dup
+        should_trigger: true
+        invocation_type: explicit
+        prompt: a
+      - id: dup
+        should_trigger: false
+        invocation_type: negative
+        prompt: b
+    """)
+    skill_dir = _write_skill(tmp_path, yaml_text, {"skill": "demo", "checks": []})
+    with pytest.raises(ValueError, match="duplicate prompt id"):
+        load_skill("demo", base_dir=skill_dir.parent)
+
+
+def test_load_skill_rejects_applies_to_unknown_prompt(tmp_path):
+    yaml_text = textwrap.dedent("""
+      - id: d-01
+        should_trigger: true
+        invocation_type: explicit
+        prompt: x
+    """)
+    checks = {"skill": "demo", "checks": [
+        {"id": "c1", "type": "command_executed", "signals": ["x"],
+         "severity": "warning", "applies_to": ["nope"]}]}
+    skill_dir = _write_skill(tmp_path, yaml_text, checks)
+    with pytest.raises(ValueError, match="applies_to references unknown prompt"):
+        load_skill("demo", base_dir=skill_dir.parent)
diff --git a/tests/lib/test_grading.py b/tests/lib/test_grading.py
new file mode 100644
index 0000000..ce34aaf
--- /dev/null
+++ b/tests/lib/test_grading.py
@@ -0,0 +1,243 @@
+# tests/lib/test_grading.py
+from evals.lib.models import ParsedRun, PromptConfig, BudgetSpec, ExpectedCheck, Verdict
+from evals.lib.grading import (
+    applicable_checks, run_process_checks, run_adhoc_expected, check_budget, grade,
+)
+
+
+def _prompt(**kw):
+    base = dict(id="d-01", should_trigger=True, invocation_type="explicit", prompt="x")
+    base.update(kw)
+    return PromptConfig(**base)
+
+
+def test_applicable_checks_global_and_scoped():
+    checks = [
+        {"id": "global", "type": "command_executed", "signals": ["a"], "severity": "warning"},
+        {"id": "scoped", "type": "command_executed", "signals": ["b"], "severity": "warning",
+         "applies_to": ["d-02"]},
+    ]
+    assert {c["id"] for c in applicable_checks(checks, "d-01")} == {"global"}
+    assert {c["id"] for c in applicable_checks(checks, "d-02")} == {"global", "scoped"}
+
+
+def test_process_check_signal_hit():
+    run = ParsedRun(bash_commands=["hawk scan --env test"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    res = run_process_checks(run, checks)
+    assert res[0].passed is True
+    assert res[0].signal_found == "hawk scan"
+
+
+def test_process_check_anti_pattern_negative_type():
+    run = ParsedRun(bash_commands=["curl https://api/v1/scan"])
+    checks = [{"id": "c1", "type": "command_negative", "anti_patterns": ["curl"],
+               "severity": "warning"}]
+    res = run_process_checks(run, checks)
+    assert res[0].passed is False
+    assert res[0].anti_found == "curl"
+
+
+def test_adhoc_expected_signal_and_anti():
+    run = ParsedRun(bash_commands=["hawk validate"], output_text="done")
+    expected = [ExpectedCheck(signal="hawk validate"),
+                ExpectedCheck(anti_pattern="rm -rf")]
+    res = run_adhoc_expected(run, expected)
+    assert all(r.passed for r in res)
+
+
+def test_adhoc_expected_missing_signal_is_blocking_fail():
+    run = ParsedRun(bash_commands=["hawk scan"])
+    res = run_adhoc_expected(run, [ExpectedCheck(signal="hawk validate")])
+    assert res[0].passed is False
+    assert res[0].severity == "blocking"
+
+
+def test_check_budget_detects_breaches():
+    run = ParsedRun(bash_commands=["a", "b", "c"], cost_usd=0.30, output_tokens=9000)
+    budget = BudgetSpec(cost_usd=0.15, bash_commands=2, output_tokens=5000)
+    breaches = check_budget(run, budget)
+    assert any("cost_usd" in b for b in breaches)
+    assert any("bash_commands" in b for b in breaches)
+    assert any("output_tokens" in b for b in breaches)
+
+
+def test_check_budget_ignores_unset_axes():
+    run = ParsedRun(bash_commands=["a", "b", "c"])
+    assert check_budget(run, BudgetSpec(cost_usd=1.0)) == []
+
+
+def test_grade_pass():
+    run = ParsedRun(bash_commands=["hawk scan"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    result = grade(_prompt(), run, checks, platform="claude-code", skill="demo",
+                   did_trigger=True)
+    assert result.verdict == Verdict.PASS
+    assert result.score == 100
+
+
+def test_grade_fail_on_blocking():
+    run = ParsedRun(bash_commands=["echo nope"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    result = grade(_prompt(), run, checks, platform="claude-code", skill="demo",
+                   did_trigger=True)
+    assert result.verdict == Verdict.FAIL
+
+
+def test_grade_pass_slow_on_budget_breach():
+    run = ParsedRun(bash_commands=["hawk scan", "a", "b", "c"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    p = _prompt(budget=BudgetSpec(bash_commands=2))
+    result = grade(p, run, checks, platform="claude-code", skill="demo",
+                   did_trigger=True)
+    assert result.verdict == Verdict.PASS_SLOW
+    assert any("bash_commands" in b for b in result.budget_breaches)
+
+
+def test_process_check_conditional_command_enforced_when_keyword_present():
+    run = ParsedRun(bash_commands=["cat stackhawk.yml: authentication: enabled"],
+                    output_text="hawk validate ran")
+    checks = [{"id": "c1", "type": "conditional_command",
+               "condition": "stackhawk.yml contains 'authentication:'",
+               "signals": ["hawk validate"], "severity": "warning"}]
+    assert run_process_checks(run, checks)[0].passed is True
+
+
+def test_process_check_conditional_command_skipped_when_keyword_absent():
+    run = ParsedRun(bash_commands=["echo nothing relevant"])
+    checks = [{"id": "c1", "type": "conditional_command",
+               "condition": "stackhawk.yml contains 'authentication:'",
+               "signals": ["hawk validate"], "severity": "warning"}]
+    # keyword not in haystack -> check is not applicable -> passes
+    assert run_process_checks(run, checks)[0].passed is True
+
+
+def test_process_check_conditional_command_raises_without_quoted_keyword():
+    import pytest
+    run = ParsedRun(bash_commands=["x"])
+    checks = [{"id": "c1", "type": "conditional_command",
+               "condition": "no quotes here", "signals": ["x"], "severity": "warning"}]
+    with pytest.raises(ValueError, match="single-quoted keyword"):
+        run_process_checks(run, checks)
+
+
+def test_process_check_command_preference_normal():
+    run = ParsedRun(bash_commands=["hawkop scan get 123"])
+    checks = [{"id": "c1", "type": "command_preference",
+               "preferred": ["hawkop scan get"], "anti_patterns": ["curl"],
+               "severity": "warning"}]
+    assert run_process_checks(run, checks)[0].passed is True
+
+
+def test_process_check_command_preference_empty_is_unconstrained():
+    run = ParsedRun(bash_commands=["anything"])
+    checks = [{"id": "c1", "type": "command_preference", "preferred": [],
+               "anti_patterns": ["curl"], "severity": "warning"}]
+    assert run_process_checks(run, checks)[0].passed is True
+
+
+def test_process_check_file_absent():
+    run = ParsedRun(files_written=["stackhawk.yml"])
+    present = [{"id": "c1", "type": "file_absent", "target_file": "stackhawk.yml",
+                "severity": "warning"}]
+    absent = [{"id": "c2", "type": "file_absent", "target_file": "secrets.env",
+               "severity": "warning"}]
+    assert run_process_checks(run, present)[0].passed is False
+    assert run_process_checks(run, absent)[0].passed is True
+
+
+def test_adhoc_expected_check_id_is_skipped():
+    expectations = [ExpectedCheck(check_id="step1")]  # a check_id entry produces no ad-hoc result
+    assert run_adhoc_expected(ParsedRun(bash_commands=["x"]), expectations) == []
+
+
+def test_score_deductions():
+    from evals.lib.grading import _score
+    from evals.lib.models import ProcessCheckResult
+    def failed(sev): return ProcessCheckResult(id="x", passed=False, severity=sev)
+    table = [([ProcessCheckResult(id="x", passed=True, severity="blocking")], 100),
+             ([failed("blocking")], 85), ([failed("warning")], 95),
+             ([failed("blocking"), failed("warning")], 80), ([failed("blocking")] * 8, 0)]  # last row is floored
+    for results, want in table:  # each row pairs check results with the expected score
+        assert _score(results) == want
+
+
+def test_grade_correct_negative_passes_without_process_checks():
+    """should_trigger=False and did_trigger=False is correct: PASS, with no process checks run."""
+    blocking = {"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+                "severity": "blocking"}
+    graded = grade(_prompt(should_trigger=False), ParsedRun(bash_commands=["echo not relevant"]),
+                   [blocking], platform="claude-code", skill="demo", did_trigger=False)
+    assert graded.verdict == Verdict.PASS
+    assert graded.trigger_correct is True
+    # no process-check results are reported for a correct negative
+    assert graded.process_checks == []
+    assert graded.score == 100
+
+
+def test_grade_false_negative_fails():
+    """should_trigger=True but did_trigger=False is incorrect: FAIL, with no process checks."""
+    blocking = {"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+                "severity": "blocking"}
+    missed = ParsedRun(bash_commands=["echo nothing"])
+    graded = grade(_prompt(should_trigger=True), missed, [blocking],
+                   platform="claude-code", skill="demo", did_trigger=False)
+    assert graded.verdict == Verdict.FAIL
+    assert graded.trigger_correct is False
+    assert graded.process_checks == []
+
+
+def test_grade_false_positive_fails_without_process_checks():
+    """should_trigger=False but did_trigger=True is incorrect: FAIL, with no process checks."""
+    blocking = {"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+                "severity": "blocking"}
+    spurious = ParsedRun(bash_commands=["hawk scan"])
+    graded = grade(_prompt(should_trigger=False), spurious, [blocking],
+                   platform="claude-code", skill="demo", did_trigger=True)
+    assert graded.verdict == Verdict.FAIL
+    assert graded.trigger_correct is False
+    assert graded.process_checks == []
+
+
+def test_grade_propagates_harness_error_to_note():
+    """A harness failure recorded on the ParsedRun must surface in the graded result's note.
+
+    Relies on the module-level ParsedRun, Verdict, grade and _prompt, like the sibling tests."""
+    run = ParsedRun(returncode=1, stderr_tail="agent: command not found", error="exit 1: agent: command not found")
+    res = grade(_prompt(should_trigger=True), run, [], platform="cursor", skill="hawkscan", did_trigger=False)
+    assert res.verdict == Verdict.FAIL          # didn't trigger
+    assert "command not found" in res.note      # harness error surfaced
+
+
+def test_file_absent_or_unchanged_passes_when_not_written():
+    """file_absent_or_unchanged passes when the target was not written and fails when it was."""
+    guard = [{"id": "no_yml", "type": "file_absent_or_unchanged",
+              "target_file": "stackhawk.yml", "severity": "blocking"}]
+    untouched, wrote_it = ParsedRun(output_text="done"), ParsedRun(output_text="done", files_written=["stackhawk.yml"])
+    assert run_process_checks(untouched, guard)[0].passed is True
+    assert run_process_checks(wrote_it, guard)[0].passed is False
+
+
+def test_file_absent_with_anti_pattern_paths():
+    """file_absent with the forbidden path listed under `anti_patterns` (no `target_file` key)."""
+    legacy_guard = {"id": "no_legacy", "type": "file_absent",
+                    "anti_patterns": ["bootstrap/manifest.yaml"], "severity": "blocking"}
+    for run, expected in [(ParsedRun(output_text="x"), True), (ParsedRun(files_written=["bootstrap/manifest.yaml"]), False)]:
+        assert run_process_checks(run, [legacy_guard])[0].passed is expected
+
+
+def test_file_present_via_write_or_narration():
+    """file_present is satisfied by a real write or by the path appearing in the output text."""
+    emit = [{"id": "emit", "type": "file_present",
+             "signals": ["data-seed/manifest.yaml"], "severity": "blocking"}]
+    scenarios = [
+        (ParsedRun(files_written=["data-seed/manifest.yaml"]), True),          # written for real (execution mode)
+        (ParsedRun(output_text="I'll write data-seed/manifest.yaml"), True),   # only narrated (observe mode)
+        (ParsedRun(output_text="nope"), False),                                 # neither -> fail
+    ]
+    for run, expected in scenarios:
+        assert run_process_checks(run, emit)[0].passed is expected
diff --git a/tests/lib/test_harness.py b/tests/lib/test_harness.py
new file mode 100644
index 0000000..4689abb
--- /dev/null
+++ b/tests/lib/test_harness.py
@@ -0,0 +1,31 @@
+# tests/lib/test_harness.py
+import json
+from evals.lib.harness import get_adapter
+from evals.lib.models import ParsedRun
+
+CC = get_adapter("claude-code")
+
+
+def test_parse_stream_extracts_bash_and_text():
+    """parse_stream turns newline-joined JSON events into a ParsedRun with command, text and cost."""
+    tool_call = {"type": "tool_use", "name": "Bash", "input": {"command": "hawk scan"}}
+    narration = {"type": "text", "text": "scanning now"}
+    events = [
+        {"type": "assistant", "message": {"content": [tool_call, narration]}},
+        {"type": "result", "result": "done", "cost_usd": 0.04},
+    ]
+    parsed = CC.parse_stream("\n".join(json.dumps(event) for event in events))
+    assert isinstance(parsed, ParsedRun)
+    assert parsed.bash_commands == ["hawk scan"]
+    assert "scanning now" in parsed.output_text
+    assert parsed.cost_usd == 0.04
+
+
+def test_detect_trigger_via_cli_signal():
+    """A run whose bash commands include `hawk scan` is detected as a hawkscan trigger."""
+    assert CC.detect_trigger(ParsedRun(bash_commands=["hawk scan --env test"]), "hawkscan") is True
+
+
+def test_detect_trigger_negative():
+    """An unrelated command plus unrelated output text is not a hawkscan trigger."""
+    assert CC.detect_trigger(ParsedRun(bash_commands=["echo hello"], output_text="nothing relevant"), "hawkscan") is False
diff --git a/tests/lib/test_models.py b/tests/lib/test_models.py
new file mode 100644
index 0000000..ff84e20
--- /dev/null
+++ b/tests/lib/test_models.py
@@ -0,0 +1,94 @@
+# tests/lib/test_models.py
+import pytest
+from pydantic import ValidationError
+from evals.lib.models import (
+    BudgetSpec, ExpectedCheck, PromptConfig, ParsedRun, Verdict,
+)
+
+
+def test_prompt_config_minimal():
+    """Only id, should_trigger, invocation_type and prompt are given; the rest take defaults."""
+    given = dict(id="hw-01", should_trigger=True, invocation_type="explicit", prompt="scan it")
+    minimal = PromptConfig(**given)
+    assert minimal.budget is None
+    assert (minimal.expected, minimal.notes) == ([], "")
+
+
+def test_prompt_config_rejects_unknown_field():
+    fields = dict(id="hw-01", should_trigger=True, invocation_type="explicit", prompt="x")
+    with pytest.raises(ValidationError):  # budget_usd is the unknown field under test
+        PromptConfig(**fields, budget_usd=0.1)
+
+
+def test_budget_spec_rejects_unknown_axis():
+    with pytest.raises(ValidationError):  # cost_dollars is the unknown axis under test
+        BudgetSpec(**{"cost_dollars": 0.1})
+
+
+def test_expected_check_requires_exactly_one():
+    """ExpectedCheck accepts one of signal / check_id / anti_pattern, and rejects none or two."""
+    accepted = [{"signal": "hawk scan"}, {"check_id": "step1"}, {"anti_pattern": "curl"}]
+    for kwargs in accepted:
+        ExpectedCheck(**kwargs)                  # one field set -> ok
+    for kwargs in ({}, {"signal": "a", "anti_pattern": "b"}):  # none set, then two set
+        with pytest.raises(ValidationError):
+            ExpectedCheck(**kwargs)
+
+
+def test_invocation_type_is_constrained():
+    bogus = dict(id="x", should_trigger=True, invocation_type="bogus", prompt="x")
+    with pytest.raises(ValidationError):  # "bogus" must be refused as an invocation_type
+        PromptConfig(**bogus)
+
+
+def test_verdict_values():
+    expected = [(Verdict.PASS, "pass"), (Verdict.PASS_SLOW, "pass-slow"), (Verdict.FAIL, "fail")]
+    for member, text in expected:  # each member must compare equal to its plain string value
+        assert member == text
+
+
+def test_parsed_run_defaults():
+    """A ParsedRun built with no arguments has no commands, zero cost and output_tokens=None."""
+    blank = ParsedRun()
+    assert blank.bash_commands == [] and blank.cost_usd == 0.0
+    assert blank.output_tokens is None
+
+
+def test_cellreport_roundtrips():
+    """A CellReport dumped to JSON and validated back keeps its model and its result's run_id."""
+    from evals.lib.models import CellReport, EvalResult, Verdict
+    result = EvalResult(platform="codex", skill="hawkscan", run_id="hw-01", should_trigger=True,
+                        did_trigger=True, trigger_correct=True, verdict=Verdict.PASS, score=100)
+    original = CellReport(platform="codex", skill="hawkscan", model="haiku",
+                          commit="abc1234", results=[result])
+    payload = original.model_dump_json()
+    restored = CellReport.model_validate_json(payload)
+    assert (restored.results[0].run_id, restored.model) == ("hw-01", "haiku")
+
+
+def test_cellreport_rejects_unknown_field():
+    """CellReport must raise ValidationError when given the undeclared keyword `extra`."""
+    # pytest and ValidationError are the module-level imports; only CellReport is local.
+    from evals.lib.models import CellReport
+    with pytest.raises(ValidationError):
+        CellReport(platform="x", skill="y", model="m", commit="c", results=[], extra=1)
+
+
+def test_parsedrun_has_diagnostic_fields():
+    """ParsedRun exposes returncode / stderr_tail, defaulting to None and "" (module-level import)."""
+    blank = ParsedRun()
+    assert blank.returncode is None
+    assert blank.stderr_tail == ""
+    failed = ParsedRun(returncode=1, stderr_tail="boom")
+    assert failed.returncode == 1 and failed.stderr_tail == "boom"
+
+
+def test_evalresult_has_note_field():
+    """EvalResult.note defaults to "" and retains the text passed in."""
+    from evals.lib.models import EvalResult, Verdict
+    common = dict(platform="p", skill="s", run_id="r", should_trigger=True)
+    clean = EvalResult(did_trigger=True, trigger_correct=True, verdict=Verdict.PASS, score=100, **common)
+    assert clean.note == ""
+    broken = EvalResult(did_trigger=False, trigger_correct=False, verdict=Verdict.FAIL, score=0,
+                        note="harness error: agent: command not found", **common)
+    assert "command not found" in broken.note
diff --git a/tests/lib/test_replay.py b/tests/lib/test_replay.py
new file mode 100644
index 0000000..a69f3a7
--- /dev/null
+++ b/tests/lib/test_replay.py
@@ -0,0 +1,20 @@
+# tests/lib/test_replay.py
+from pathlib import Path
+from evals.lib.replay import regrade
+from evals.lib.models import Verdict
+
+FIXTURE = Path(__file__).parent.parent / "fixtures" / "hw-07.trace.jsonl"
+
+
+def test_regrade_from_trace_passes():
+    """Re-grading the hw-07 trace fixture gives a triggered result with a passing verdict."""
+    regraded = regrade(FIXTURE, skill="hawkscan", platform="claude-code")
+    assert regraded.did_trigger is True and regraded.run_id == "hw-07"
+    assert regraded.verdict in (Verdict.PASS, Verdict.PASS_SLOW)
+
+
+def test_regrade_is_deterministic():
+    """Grading the same trace twice must give the same verdict and the same score."""
+    first, second = (regrade(FIXTURE, skill="hawkscan", platform="claude-code")
+                     for _ in range(2))
+    assert (first.verdict, first.score) == (second.verdict, second.score)
diff --git a/tests/lib/test_reporting.py b/tests/lib/test_reporting.py
new file mode 100644
index 0000000..54707d2
--- /dev/null
+++ b/tests/lib/test_reporting.py
@@ -0,0 +1,20 @@
+# tests/lib/test_reporting.py
+from evals.lib.models import EvalResult, Verdict
+from evals.lib.reporting import build_summary
+
+
+def _r(run_id, verdict, trigger_ok=True, should=True, did=True):
+    score = 40 if verdict == Verdict.FAIL else 100  # FAIL results get 40, everything else 100
+    return EvalResult(platform="claude-code", skill="hawkscan", run_id=run_id, verdict=verdict,
+                      score=score, should_trigger=should, did_trigger=did, trigger_correct=trigger_ok)
+
+
+def test_build_summary_counts():
+    """build_summary reports trigger accuracy, false positives and per-verdict counts."""
+    batch = [_r("hw-01", Verdict.PASS), _r("hw-02", Verdict.PASS_SLOW), _r("hw-03", Verdict.FAIL),
+             _r("hw-13", Verdict.PASS, trigger_ok=False, should=False, did=True)]  # the false positive
+    summary = build_summary("hawkscan", "claude-code", batch)
+    accuracy = summary["trigger_accuracy"]
+    assert (accuracy["correct"], accuracy["total"]) == (3, 4)
+    assert summary["false_positives"] == ["hw-13"]
+    assert summary["verdict_counts"] == {"pass": 2, "pass-slow": 1, "fail": 1}
diff --git a/tests/lib/test_reporting_render.py b/tests/lib/test_reporting_render.py
new file mode 100644
index 0000000..f2e27c6
--- /dev/null
+++ b/tests/lib/test_reporting_render.py
@@ -0,0 +1,119 @@
+from evals.lib.models import CellReport, EvalResult, Verdict
+from evals.lib.reporting import badge, render_job_summary
+
+
+def _cell(*results):
+    """Wrap the given results in a claude-code / hawkscan / haiku CellReport."""
+    return CellReport(results=[*results], platform="claude-code", skill="hawkscan", model="haiku", commit="abc1234")
+
+
+def _r(rid, verdict, trig=True, should=True, did=True, why=""):
+    breaches = [why] if why and verdict == Verdict.PASS_SLOW else []  # only a PASS_SLOW result carries `why`
+    return EvalResult(platform="claude-code", skill="hawkscan", run_id=rid, verdict=verdict,
+                      should_trigger=should, did_trigger=did, trigger_correct=trig,
+                      score=100 if verdict != Verdict.FAIL else 40, budget_breaches=breaches)
+
+
+def test_badge_is_shields_image():
+    rendered = badge("fail", "FAIL")  # expected: Markdown image syntax with a shields.io badge URL
+    assert rendered.startswith("![") and "img.shields.io/badge/" in rendered
+
+
+def test_job_summary_has_counts_and_all_rows_failures_first():
+    """The job summary names the cell, counts the failure, lists every row, failures first."""
+    failing = _r("hw-14", Verdict.FAIL, trig=False, should=False, did=True)
+    summary = render_job_summary(_cell(_r("hw-01", Verdict.PASS), _r("hw-02", Verdict.PASS), failing))
+    assert all(label in summary for label in ("claude-code", "hawkscan", "haiku"))
+    assert "1 failed" in summary.lower() or "❌ 1" in summary
+    assert all(rid in summary for rid in ("hw-01", "hw-02", "hw-14"))
+    # failing row appears before the first passing row
+    fail_at, pass_at = summary.index("hw-14"), summary.index("hw-01")
+    assert fail_at < pass_at
+
+
+def test_write_github_summary_appends(tmp_path, monkeypatch):
+    from evals.lib.reporting import write_github_summary
+    target = tmp_path / "summary.md"
+    monkeypatch.setenv("GITHUB_STEP_SUMMARY", f"{target}")  # point the summary at a temp file
+    write_github_summary("## hello\n")
+    assert "## hello" in target.read_text()
+
+
+def test_write_github_summary_noop_when_unset(monkeypatch):
+    from evals.lib.reporting import write_github_summary as write_summary
+    monkeypatch.delenv("GITHUB_STEP_SUMMARY", raising=False)  # make sure the variable is absent
+    write_summary("nothing")   # must not raise
+
+
+def test_digest_shows_regression_vs_baseline():
+    """A FAIL cell with a PASS baseline shows "regressed"; models names are module-level imports."""
+    from evals.lib.reporting import render_digest
+
+    def cell(v):
+        r = EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01",
+                       should_trigger=True, did_trigger=True, trigger_correct=True,
+                       verdict=v, score=100 if v != Verdict.FAIL else 0)
+        return CellReport(platform="claude-code", skill="hawkscan", model="haiku",
+                          commit="c", results=[r])
+    cur = cell(Verdict.FAIL)
+    base = {("claude-code", "hawkscan", "haiku"): cell(Verdict.PASS)}
+    md = render_digest([cur], baselines=base)
+    assert "regressed" in md.lower()
+    assert "no baseline" not in md.lower()
+
+
+def test_render_digest_overview_and_per_cell():
+    """Render a digest from the fixture cell.json files with no baselines supplied."""
+    from pathlib import Path
+    from evals.lib.reporting import render_digest
+    root = Path(__file__).parent.parent / "fixtures" / "results"
+    # JSON is UTF-8 by specification, so decode explicitly rather than with the locale default.
+    cells = [CellReport.model_validate_json((p / "cell.json").read_text(encoding="utf-8"))
+             for p in sorted(root.iterdir()) if (p / "cell.json").exists()]
+    md = render_digest(cells)
+    assert "Skill Eval" in md and "claude-code" in md and "codex" in md
+    assert "hw-14" in md            # failing test surfaced
+    assert "no baseline" in md.lower()   # no baseline supplied
+
+
+def test_digest_overview_shows_score_delta_vs_baseline():
+    """Score 70 against a baseline of 90 reads "worse"; models names are module-level imports."""
+    from evals.lib.reporting import render_digest
+
+    def cell(score):
+        r = EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01",
+                       should_trigger=True, did_trigger=True, trigger_correct=True,
+                       verdict=Verdict.PASS, score=score)
+        return CellReport(platform="claude-code", skill="hawkscan", model="haiku",
+                          commit="c", results=[r])
+    cur = cell(70)
+    base = {("claude-code", "hawkscan", "haiku"): cell(90)}
+    md = render_digest([cur], baselines=base)
+    assert "worse" in md.lower()   # 70 vs 90 -> worse
+
+
+def test_job_summary_shows_note():
+    """An EvalResult's note (here a harness error) must appear in the rendered job summary."""
+    # Every name used below is imported at module level, so no local imports are needed.
+    r = EvalResult(platform="cursor", skill="hawkscan", run_id="hw-01",
+                   should_trigger=True, did_trigger=False, trigger_correct=False,
+                   verdict=Verdict.FAIL, score=0, note="harness error: agent not found")
+    cell = CellReport(platform="cursor", skill="hawkscan", model="default",
+                      commit="c", results=[r])
+    md = render_job_summary(cell)
+    assert "agent not found" in md
+
+
+def test_digest_renders_lift_section():
+    """A `lift` mapping makes the digest mention lift, the run id and a 1-of-1 tally."""
+    from evals.lib.reporting import render_digest
+    r = EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01",
+                   should_trigger=True, did_trigger=True, trigger_correct=True,
+                   verdict=Verdict.PASS, score=100)
+    cell = CellReport(platform="claude-code", skill="hawkscan", model="haiku",
+                      commit="c", results=[r])
+    lift = {("claude-code", "hawkscan", "haiku"): [
+        {"id": "hw-01", "without_verdict": "fail", "with_verdict": "pass", "effect": "lift"}]}
+    md = render_digest([cell], lift=lift)
+    assert "lift" in md.lower() and "hw-01" in md
+    assert "1/1" in md or "1 of 1" in md.lower()
diff --git a/tests/lib/test_rubric.py b/tests/lib/test_rubric.py
new file mode 100644
index 0000000..b80d016
--- /dev/null
+++ b/tests/lib/test_rubric.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+from evals.lib.models import EvalResult, Verdict, RubricResult
+from evals.lib.reporting import _pivot_cell
+from evals.lib.rubric import grade_rubric
+from evals.lib.models import ParsedRun
+
+
+def _res(rubric=None, verdict=Verdict.PASS):
+    """Build a triggered hw-01 EvalResult scoring 100 with the given rubric and verdict."""
+    trigger = dict(should_trigger=True, did_trigger=True, trigger_correct=True)
+    return EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01", verdict=verdict, score=100, rubric=rubric, **trigger)
+
+
+def test_rubric_tag_pass():
+    passing_rubric = RubricResult(overall_pass=True, score=85)
+    assert _pivot_cell(_res(passing_rubric)) == "✅ r85✓"
+
+
+def test_rubric_tag_fail_shows_score():
+    rendered = _pivot_cell(_res(rubric=RubricResult(overall_pass=False, score=55)))
+    assert rendered.startswith("✅") and "r55✗" in rendered  # deterministic pass, rubric flags quality
+
+
+def test_no_rubric_tag_when_absent():
+    assert _pivot_cell(_res(None)) == "✅"  # no rubric -> the cell is only the verdict mark
+
+
+def test_rubric_error_renders_question_mark():
+    errored = RubricResult(overall_pass=False, score=0, error="grader failed")
+    assert "r?" in _pivot_cell(_res(errored))
+
+
+def test_grade_rubric_none_when_config_missing(tmp_path: Path):
+    """No rubric-items.json / rubric-schema.json under base_dir -> None (not an error)."""
+    outcome = grade_rubric(ParsedRun(output_text="x"), "hawkscan", "hw-01", base_dir=tmp_path)
+    assert outcome is None
diff --git a/tests/lib/test_rubric_extract.py b/tests/lib/test_rubric_extract.py
new file mode 100644
index 0000000..d9a5002
--- /dev/null
+++ b/tests/lib/test_rubric_extract.py
@@ -0,0 +1,30 @@
+"""Tests for the tolerant JSON extractor in the rubric grader — the grader reply
+often wraps the object in prose or a ```json fence even under --json-schema."""
+import pytest
+from evals.lib.rubric import _extract_json_object
+
+
+def test_pure_json():
+    assert _extract_json_object('{"score": 85, "overall_pass": true}')["score"] == 85  # input is exactly one JSON object
+
+
+def test_fenced_json():
+    """A short prose line followed by a json-fenced block still yields the object."""
+    assert _extract_json_object("Here is the result:\n\n```json\n{\"score\": 70, \"overall_pass\": false}\n```")["score"] == 70
+
+
+def test_prose_prefixed_json():
+    """The exact failure mode reproduced locally: prose, then a multi-line fenced object."""
+    reply = 'No skills needed.\n\n```json\n{\n  "score": 85,\n  "overall_pass": true,\n  "checks": []\n}\n```'
+    parsed = _extract_json_object(reply)
+    assert (parsed["score"] == 85) and (parsed["overall_pass"] is True)
+
+
+def test_bare_object_in_prose_no_fence():
+    sentence = 'The verdict is {"score": 60, "overall_pass": false, "checks": []} per the rubric.'  # object mid-sentence
+    assert _extract_json_object(sentence)["score"] == 60
+
+
+def test_no_json_raises():
+    with pytest.raises(ValueError):  # the input contains no JSON object at all
+        _extract_json_object("there is no json here")
diff --git a/tests/lib/test_triggers.py b/tests/lib/test_triggers.py
new file mode 100644
index 0000000..4e90bf8
--- /dev/null
+++ b/tests/lib/test_triggers.py
@@ -0,0 +1,84 @@
+"""Tests for the shared trigger-decision helpers (evals/lib/triggers.py).
+
+The motivating bug: hw-17 ("Don't run a security scan after this change — just
+commit it") is a negative. The agent correctly declared `hawkscan:hawkscan: NO`,
+but detect_trigger substring-matched the loose phrase "security scan after" from
+the agent quoting the user's instruction, producing a false positive.
+"""
+from evals.lib.triggers import explicit_decision, decide_trigger
+
+
+def test_explicit_yes_recognized():
+    """An explicit `...: YES` declaration is read as "yes", plain, bold or in backticks."""
+    samples = [("hawkscan:hawkscan: YES — run a scan", "hawkscan"), ("**stackhawk-api:api: YES**", "api"),
+               ("`stackhawk-data-seed:stackhawk-data-seed: YES`", "stackhawk-data-seed")]
+    assert [explicit_decision(text, skill) for text, skill in samples] == ["yes"] * 3
+
+
+def test_explicit_no_recognized():
+    samples = [("hawkscan:hawkscan: NO — user declined", "hawkscan"),
+               ("Decision: none: NO", "hawkscan"), ("stackhawk-api:api — NO", "api")]
+    assert all(explicit_decision(text, skill) == "no" for text, skill in samples)
+
+
+def test_dash_and_emphasis_separators():
+    cases = {"**hawkscan:hawkscan — YES**": "yes", "hawkscan:hawkscan - NO": "no"}  # dash separators, one in bold
+    assert all(explicit_decision(text, "hawkscan") == want for text, want in cases.items())
+
+
+def test_no_decision_line_returns_none():
+    assert explicit_decision("I'll run a security scan for you.", "hawkscan") is None  # prose only, no YES/NO declaration
+
+
+def test_hyphenated_skill_name_not_corrupted():
+    """`stackhawk-api` must not be mis-split on its internal hyphen."""
+    hyphenated = [("stackhawk-api:api: NO", "api"),
+                  ("stackhawk-data-seed:stackhawk-data-seed: NO", "stackhawk-data-seed")]
+    assert all(explicit_decision(text, skill) == "no" for text, skill in hyphenated)
+
+
+def test_hw17_false_positive_suppressed():
+    """The exact failure mode: an explicit decline plus a loose phrase the agent quoted."""
+    reply = ("**hawkscan:hawkscan: NO** — User explicitly requested "
+             '"Don\'t run a security scan after this change"\n\n**Decision: none: NO**')
+    verdict = explicit_decision(reply, "hawkscan")
+    assert verdict == "no"
+    triggered = decide_trigger(executed_cli=False, declared=verdict, loose_hit=True)
+    assert triggered is False  # the explicit decline wins over the loose behavioral hit
+
+
+def test_real_execution_overrides_declared_no():
+    """If the agent actually ran the CLI, it triggered regardless of what it said."""
+    assert decide_trigger(declared="no", loose_hit=False, executed_cli=True) is True
+
+
+def test_loose_fallback_when_no_decision():
+    for loose in (True, False):  # no CLI run and no declaration: the result follows loose_hit
+        assert decide_trigger(executed_cli=False, declared=None, loose_hit=loose) is loose
+
+
+def test_explicit_yes_triggers_without_loose():
+    assert decide_trigger(executed_cli=False, declared="yes", loose_hit=False) is True  # a declared "yes" alone is enough
+
+
+def test_does_not_apply_is_decline():
+    phrasings = [("`hawkscan:hawkscan` does not apply here", "hawkscan"), ("the api skill is not needed: stackhawk-api:api not applicable", "api")]
+    assert all(explicit_decision(text, skill) == "no" for text, skill in phrasings)
+
+
+def test_choosing_a_different_skill_declines_this_one():
+    """hw-13: agent picks api, says hawkscan doesn't apply — must not be a hawkscan trigger."""
+    reply = "`stackhawk-api:api: YES`\n(`hawkscan:hawkscan` does not apply — you asked for findings.)"
+    per_skill = {skill: explicit_decision(reply, skill) for skill in ("hawkscan", "api")}
+    assert per_skill == {"hawkscan": "no", "api": "yes"}
+
+
+def test_other_skill_yes_alone_declines():
+    for other_skill in ("api", "stackhawk-data-seed"):  # a hawkscan YES reads as "no" for these skills
+        assert explicit_decision("hawkscan:hawkscan: YES", other_skill) == "no"
+
+
+def test_own_yes_not_suppressed_by_other():
+    """Both declared yes — this skill is still yes."""
+    both_yes = "stackhawk-api:api: YES and hawkscan:hawkscan: YES"
+    assert explicit_decision(both_yes, "hawkscan") == "yes"
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 0000000..851950e
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,325 @@
+version = 1
+revision = 2
+requires-python = ">=3.11"
+
+[[package]]
+name = "agent-skills-evals"
+version = "0.1.0"
+source = { editable = "." }
+dependencies = [
+    { name = "pydantic" },
+    { name = "pyyaml" },
+    { name = "rich" },
+]
+
+[package.dev-dependencies]
+dev = [
+    { name = "pytest" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "pydantic", specifier = ">=2.6" },
+    { name = "pyyaml", specifier = ">=6.0" },
+    { name = "rich", specifier = ">=13.0" },
+]
+
+[package.metadata.requires-dev]
+dev = [{ name = "pytest", specifier = ">=8.0" }]
+
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
+]
+
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
+]
+
+[[package]]
+name = "markdown-it-py"
+version = "4.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "mdurl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/ff/7841249c247aa650a76b9ee4bbaeae59370dc8bfd2f6c01f3630c35eb134/markdown_it_py-4.2.0.tar.gz", hash = "sha256:04a21681d6fbb623de53f6f364d352309d4094dd4194040a10fd51833e418d49", size = 82454, upload-time = "2026-05-07T12:08:28.36Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/81/4da04ced5a082363ecfa159c010d200ecbd959ae410c10c0264a38cac0f5/markdown_it_py-4.2.0-py3-none-any.whl", hash = "sha256:9f7ebbcd14fe59494226453aed97c1070d83f8d24b6fc3a3bcf9a38092641c4a", size = 91687, upload-time = "2026-05-07T12:08:27.182Z" },
+]
+
+[[package]]
+name = "mdurl"
+version = "0.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
+]
+
+[[package]]
+name = "packaging"
+version = "26.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/f1/e7a6dd94a8d4a5626c03e4e99c87f241ba9e350cd9e6d75123f992427270/packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661", size = 228134, upload-time = "2026-04-24T20:15:23.917Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
+]
+
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
+]
+
+[[package]]
+name = "pydantic"
+version = "2.13.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "annotated-types" },
+    { name = "pydantic-core" },
+    { name = "typing-extensions" },
+    { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/18/a5/b60d21ac674192f8ab0ba4e9fd860690f9b4a6e51ca5df118733b487d8d6/pydantic-2.13.4.tar.gz", hash = "sha256:c40756b57adaa8b1efeeced5c196f3f3b7c435f90e84ea7f443901bec8099ef6", size = 844775, upload-time = "2026-05-06T13:43:05.343Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fd/7b/122376b1fd3c62c1ed9dc80c931ace4844b3c55407b6fb2d199377c9736f/pydantic-2.13.4-py3-none-any.whl", hash = "sha256:45a282cde31d808236fd7ea9d919b128653c8b38b393d1c4ab335c62924d9aba", size = 472262, upload-time = "2026-05-06T13:43:02.641Z" },
+]
+
+[[package]]
+name = "pydantic-core"
+version = "2.46.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9d/56/921726b776ace8d8f5db44c4ef961006580d91dc52b803c489fafd1aa249/pydantic_core-2.46.4.tar.gz", hash = "sha256:62f875393d7f270851f20523dd2e29f082bcc82292d66db2b64ea71f64b6e1c1", size = 471464, upload-time = "2026-05-06T13:37:06.98Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5c/fa/6d7708d2cfc1a832acb6aeb0cd16e801902df8a0f583bb3b4b527fde022e/pydantic_core-2.46.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:0e96592440881c74a213e5ad528e2b24d3d4f940de2766bed9010ab1d9e51594", size = 2111872, upload-time = "2026-05-06T13:40:27.596Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/6f/aa064a3e74b5745afbdf250594f38e7ead05e2d651bcb35994b9417a0d4d/pydantic_core-2.46.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0d65b8c354be7fb5f720c3caa8bc940bc2d20ce749c8e06135f07f8ed95dd7c", size = 1948255, upload-time = "2026-05-06T13:39:12.574Z" },
+    { url = "https://files.pythonhosted.org/packages/43/3a/41114a9f7569b84b4d84e7a018c57c56347dac30c0d4a872946ec4e36c46/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bfb192b3f4b9e8a89b6277b6ce787564f62cfd272055f6e685726b111dc7826", size = 1972827, upload-time = "2026-05-06T13:38:19.841Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/25/1ab42e8048fe551934d9884e8d64daa7e990ad386f310a15981aeb6a5b08/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9037063db01f09b09e237c282b6792bd4da634b5402c4e7f0c61effed7701a04", size = 2041051, upload-time = "2026-05-06T13:38:10.447Z" },
+    { url = "https://files.pythonhosted.org/packages/94/c2/1a934597ddf08da410385b3b7aae91956a5a76c635effef456074fad7e88/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fc010ab034c8c7452522748bf937df58020d256ccae0874463d1f4d01758af8e", size = 2221314, upload-time = "2026-05-06T13:40:13.089Z" },
+    { url = "https://files.pythonhosted.org/packages/02/6d/9e8ad178c9c4df27ad3c8f25d1fe2a7ab0d2ba0559fad4aee5d3d1f16771/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c5dac79fa1614d1e06ca695109c6105923bd9c7d1d6c918d4e637b7e6b32fd3", size = 2285146, upload-time = "2026-05-06T13:38:59.224Z" },
+    { url = "https://files.pythonhosted.org/packages/80/50/540cd3aeefc041beb111125c4bff779831a2111fc6b15a9138cda277d32c/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9fa868638bf362d3d138ea55829cefb3d5f4b0d7f142234382a15e2485dbec4", size = 2089685, upload-time = "2026-05-06T13:38:17.762Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/a4/b440ad35f05f6a38f89fa0f149accb3f0e02be94ca5e15f3c449a61b4bc9/pydantic_core-2.46.4-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:17299feefe090f2caa5b8e37222bb5f663e4935a8bfa6931d4102e5df1a9f398", size = 2115420, upload-time = "2026-05-06T13:37:58.195Z" },
+    { url = "https://files.pythonhosted.org/packages/99/61/de4f55db8dfd57bfdfa9a12ec90fe1b57c4f41062f7ca86f08586b3e0ac0/pydantic_core-2.46.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4c63ebc82684aa89d9a3bcbd13d515b3be44250dc68dd3bd81526c1cb31286c3", size = 2165122, upload-time = "2026-05-06T13:37:01.167Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/52/7c529d7bdb2d1068bd52f51fe32572c8301f9a4febf1948f10639f1436f5/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:aaa2a54443eff1950ba5ddc6b6ccda0d9c84a364276a62f969bdf2a390650848", size = 2182573, upload-time = "2026-05-06T13:38:45.04Z" },
+    { url = "https://files.pythonhosted.org/packages/37/b3/7c40325848ba78247f2812dcf9c7274e38cd801820ca6dd9fe63bcfb0eb4/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:18e5ceec2ab67e6d5f1a9085e5a24c9c4e2ac4545730bfe668680bca05e555f3", size = 2317139, upload-time = "2026-05-06T13:37:15.539Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/37/f913f81a657c865b75da6c0dbed79876073c2a43b5bd9edbe8da785e4d49/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a0f62d0a58f4e7da165457e995725421e0064f2255d8eccebc49f41bbc23b109", size = 2360433, upload-time = "2026-05-06T13:37:30.099Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/67/6acaa1be2567f9256b056d8477158cac7240813956ce86e49deae8e173b4/pydantic_core-2.46.4-cp311-cp311-win32.whl", hash = "sha256:041bde0a48fd37cf71cab1c9d56d3e8625a3793fef1f7dd232b3ff37e978ecda", size = 1985513, upload-time = "2026-05-06T13:38:15.669Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/e6/c505f83dfeda9a2e5c995cfd872949e4d05e12f7feb3dca72f633daefa94/pydantic_core-2.46.4-cp311-cp311-win_amd64.whl", hash = "sha256:6f2eeda33a839975441c86a4119e1383c50b47faf0cbb5176985565c6bb02c33", size = 2071114, upload-time = "2026-05-06T13:40:35.416Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/da/7a263a96d965d9d0df5e8de8a475f33495451117035b09acb110288c381f/pydantic_core-2.46.4-cp311-cp311-win_arm64.whl", hash = "sha256:14f4c5d6db102bd796a627bbb3a17b4cf4574b9ae861d8b7c9a9661c6dd3362d", size = 2044298, upload-time = "2026-05-06T13:38:29.754Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/8c/af022f0af448d7747c5154288d46b5f2bc5f17366eaa0e23e9aa04d59f3b/pydantic_core-2.46.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3245406455a5d98187ec35530fd772b1d799b26667980872c8d4614991e2c4a2", size = 2106158, upload-time = "2026-05-06T13:38:57.215Z" },
+    { url = "https://files.pythonhosted.org/packages/19/95/6195171e385007300f0f5574592e467c568becce2d937a0b6804f218bc49/pydantic_core-2.46.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:962ccbab7b642487b1d8b7df90ef677e03134cf1fd8880bf698649b22a69371f", size = 1951724, upload-time = "2026-05-06T13:37:02.697Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/bc/f47d1ff9cbb1620e1b5b697eef06010035735f07820180e74178226b27b3/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8233f2947cf85404441fd7e0085f53b10c93e0ee78611099b5c7237e36aacbf7", size = 1975742, upload-time = "2026-05-06T13:37:09.448Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/11/9b9a5b0306345664a2da6410877af6e8082481b5884b3ddd78d47c6013ce/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a233125ac121aa3ffba9a2b59edfc4a985a76092dc8279586ab4b71390875e7", size = 2052418, upload-time = "2026-05-06T13:37:38.234Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/b7/a65fec226f5d78fc39f4a13c4cc0c768c22b113438f60c14adc9d2865038/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b712b53160b79a5850310b912a5ef8e57e56947c8ad690c227f5c9d7e561712", size = 2232274, upload-time = "2026-05-06T13:38:27.753Z" },
+    { url = "https://files.pythonhosted.org/packages/68/f0/92039db98b907ef49269a8271f67db9cb78ae2fc68062ef7e4e77adb5f61/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9401557acd873c3a7f3eb9383edef8ac4968f9510e340f4808d427e75667e7b4", size = 2309940, upload-time = "2026-05-06T13:38:05.353Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/97/2aab507d3d00ca626e8e57c1eac6a79e4e5fbcc63eb99733ff55d1717f65/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:926c9541b14b12b1681dca8a0b75feb510b06c6341b70a8e500c2fdcff837cce", size = 2094516, upload-time = "2026-05-06T13:39:10.577Z" },
+    { url = "https://files.pythonhosted.org/packages/22/37/a8aca44d40d737dde2bc05b3c6c07dff0de07ce6f82e9f3167aeaf4d5dea/pydantic_core-2.46.4-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:56cb4851bcaf3d117eddcef4fe66afd750a50274b0da8e22be256d10e5611987", size = 2136854, upload-time = "2026-05-06T13:40:22.59Z" },
+    { url = "https://files.pythonhosted.org/packages/24/99/fcef1b79238c06a8cbec70819ac722ba76e02bc8ada9b0fd66eba40da01b/pydantic_core-2.46.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c68fcd102d71ea85c5b2dfac3f4f8476eff42a9e078fd5faefff6d145063536b", size = 2180306, upload-time = "2026-05-06T13:40:10.666Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/6c/fc44000918855b42779d007ae63b0532794739027b2f417321cddbc44f6a/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b2f69dec1725e79a012d920df1707de5caf7ed5e08f3be4435e25803efc47458", size = 2190044, upload-time = "2026-05-06T13:40:43.231Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/65/d9cadc9f1920d7a127ad2edba16c1db7916e59719285cd6c94600b0080ba/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:8d0820e8192167f80d88d64038e609c31452eeca865b4e1d9950a27a4609b00b", size = 2329133, upload-time = "2026-05-06T13:39:57.365Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/cf/c873d91679f3a30bcf5e7ac280ce5573483e72295307685120d0d5ad3416/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fbdb89b3e1c94a30cc5edfce477c6e6a5dc4d8f84665b455c27582f211a1c72c", size = 2374464, upload-time = "2026-05-06T13:38:06.976Z" },
+    { url = "https://files.pythonhosted.org/packages/47/bd/6f2fc8188f31bf10590f1e98e7b306336161fac930a8c514cd7bd828c7dc/pydantic_core-2.46.4-cp312-cp312-win32.whl", hash = "sha256:9aa768456404a8bf48a4406685ac2bec8e72b62c69313734fa3b73cf33b3a894", size = 1974823, upload-time = "2026-05-06T13:40:47.985Z" },
+    { url = "https://files.pythonhosted.org/packages/40/8c/985c1d41ea1107c2534abd9870e4ed5c8e7669b5c308297835c001e7a1c4/pydantic_core-2.46.4-cp312-cp312-win_amd64.whl", hash = "sha256:e9c26f834c65f5752f3f06cb08cb86a913ceb7274d0db6e267808a708b46bc89", size = 2072919, upload-time = "2026-05-06T13:39:21.153Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/ba/f463d006e0c47373ca7ec5e1a261c59dc01ef4d62b2657af925fb0deee3a/pydantic_core-2.46.4-cp312-cp312-win_arm64.whl", hash = "sha256:4fc73cb559bdb54b1134a706a2802a4cddd27a0633f5abb7e53056268751ac6a", size = 2027604, upload-time = "2026-05-06T13:39:03.753Z" },
+    { url = "https://files.pythonhosted.org/packages/51/a2/5d30b469c5267a17b39dec53208222f76a8d351dfac4af661888c5aee77d/pydantic_core-2.46.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:5d5902252db0d3cedf8d4a1bc68f70eeb430f7e4c7104c8c476753519b423008", size = 2106306, upload-time = "2026-05-06T13:37:48.029Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/81/4fa520eaffa8bd7d1525e644cd6d39e7d60b1592bc5b516693c7340b50f1/pydantic_core-2.46.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c94f0688e7b8d0a67abf40e57a7eaaecd17cc9586706a31b76c031f63df052b4", size = 1951906, upload-time = "2026-05-06T13:37:17.012Z" },
+    { url = "https://files.pythonhosted.org/packages/03/d5/fd02da45b659668b05923b17ba3a0100a0a3d5541e3bd8fcc4ecb711309e/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f027324c56cd5406ca49c124b0db10e56c69064fec039acc571c29020cc87c76", size = 1976802, upload-time = "2026-05-06T13:37:35.113Z" },
+    { url = "https://files.pythonhosted.org/packages/21/f2/95727e1368be3d3ed485eaab7adbd7dda408f33f7a36e8b48e0144002b91/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e739fee756ba1010f8bcccb534252e85a35fe45ae92c295a06059ce58b74ccd3", size = 2052446, upload-time = "2026-05-06T13:37:12.313Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/86/5d99feea3f77c7234b8718075b23db11532773c1a0dbd9b9490215dc2eeb/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d56801be94b86a9da183e5f3766e6310752b99ff647e38b09a9500d88e46e76", size = 2232757, upload-time = "2026-05-06T13:39:01.149Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/3a/508ac615935ef7588cf6d9e9b91309fdc2da751af865e02a9098de88258c/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2412e734dcb48da14d4e4006b82b46b74f2518b8a26ee7e58c6844a6cd6d03c4", size = 2309275, upload-time = "2026-05-06T13:37:41.406Z" },
+    { url = "https://files.pythonhosted.org/packages/07/f8/41db9de19d7987d6b04715a02b3b40aea467000275d9d758ffaa31af7d50/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9551187363ffc0de2a00b2e47c25aeaeb1020b69b668762966df15fc5659dd5a", size = 2094467, upload-time = "2026-05-06T13:39:18.847Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/e2/f35033184cb11d0052daf4416e8e10a502ea2ac006fc4f459aee872727d1/pydantic_core-2.46.4-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0186750b482eefa11d7f435892b09c5c606193ef3375bcf94aa00ae6bfb66262", size = 2134417, upload-time = "2026-05-06T13:40:17.944Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/7b/6ceeb1cc90e193862f444ebe373d8fdf613f0a82572dde03fb10734c6c71/pydantic_core-2.46.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5855698a4856556d86e8e6cd8434bc3ac0314ee8e12089ae0e143f64c6256e4e", size = 2179782, upload-time = "2026-05-06T13:40:32.618Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/f2/c8d7773ede6af08036423a00ae0ceffce266c3c52a096c435d68c896083f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:cbaf13819775b7f769bf4a1f066cb6df7a28d4480081a589828ef190226881cd", size = 2188782, upload-time = "2026-05-06T13:36:51.018Z" },
+    { url = "https://files.pythonhosted.org/packages/59/31/0c864784e31f09f05cdd87606f08923b9c9e7f6e51dd27f20f62f975ce9f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:633147d34cf4550417f12e2b1a0383973bdf5cdfde212cb09e9a581cf10820be", size = 2328334, upload-time = "2026-05-06T13:40:37.764Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/eb/4f6c8a41efa30baa755590f4141abf3a8c370fab610915733e74134a7270/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:82cf5301172168103724d49a1444d3378cb20cdee30b116a1bd6031236298a5d", size = 2372986, upload-time = "2026-05-06T13:39:34.152Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/24/b375a480d53113860c299764bfe9f349a3dc9108b3adc0d7f0d786492ebf/pydantic_core-2.46.4-cp313-cp313-win32.whl", hash = "sha256:9fa8ae11da9e2b3126c6426f147e0fba88d96d65921799bb30c6abd1cb2c97fb", size = 1973693, upload-time = "2026-05-06T13:37:55.072Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/e8/cff247591966f2d22ec8c003cd7587e27b7ba7b81ab2fb888e3ab75dc285/pydantic_core-2.46.4-cp313-cp313-win_amd64.whl", hash = "sha256:6b3ace8194b0e5204818c92802dcdca7fc6d88aabbb799d7c795540d9cd6d292", size = 2071819, upload-time = "2026-05-06T13:38:49.139Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/1a/f4aee670d5670e9e148e0c82c7db98d780be566c6e6a97ee8035528ca0b3/pydantic_core-2.46.4-cp313-cp313-win_arm64.whl", hash = "sha256:184c081504d17f1c1066e430e117142b2c77d9448a97f7b65c6ac9fd9aee238d", size = 2027411, upload-time = "2026-05-06T13:40:45.796Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/74/228a26ddad29c6672b805d9fd78e8d251cd04004fa7eed0e622096cd0250/pydantic_core-2.46.4-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:428e04521a40150c85216fc8b85e8d39fece235a9cf5e383761238c7fa9b96fb", size = 2102079, upload-time = "2026-05-06T13:38:41.019Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/1f/8970b150a4b4365623ae00fc88603491f763c627311ae8031e3111356d6e/pydantic_core-2.46.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:23ace664830ee0bfe014a0c7bc248b1f7f25ed7ad103852c317624a1083af462", size = 1952179, upload-time = "2026-05-06T13:36:59.812Z" },
+    { url = "https://files.pythonhosted.org/packages/95/30/5211a831ae054928054b2f79731661087a2bc5c01e825c672b3a4a8f1b3e/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce5c1d2a8b27468f433ca974829c44060b8097eedc39933e3c206a90ee49c4a9", size = 1978926, upload-time = "2026-05-06T13:37:39.933Z" },
+    { url = "https://files.pythonhosted.org/packages/57/e9/689668733b1eb67adeef047db3c2e8788fcf65a7fd9c9e2b46b7744fe245/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7283d57845ecf5a163403eb0702dfc220cc4fbdd18919cb5ccea4f95ee1cdab4", size = 2046785, upload-time = "2026-05-06T13:38:01.995Z" },
+    { url = "https://files.pythonhosted.org/packages/60/d9/6715260422ff50a2109878fd24d948a6c3446bb2664f34ee78cd972b3acd/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8daafc69c93ee8a0204506a3b6b30f586ef54028f52aeeeb5c4cfc5184fd5914", size = 2228733, upload-time = "2026-05-06T13:40:50.371Z" },
+    { url = "https://files.pythonhosted.org/packages/18/ae/fdb2f64316afca925640f8e70bb1a564b0ec2721c1389e25b8eb4bf9a299/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd2213145bcc2ba85884d0ac63d222fece9209678f77b9b4d76f054c561adb28", size = 2307534, upload-time = "2026-05-06T13:37:21.531Z" },
+    { url = "https://files.pythonhosted.org/packages/89/1d/8eff589b45bb8190a9d12c49cfad0f176a5cbd1534908a6b5125e2886239/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a5f930472650a82629163023e630d160863fce524c616f4e5186e5de9d9a49b", size = 2099732, upload-time = "2026-05-06T13:39:31.942Z" },
+    { url = "https://files.pythonhosted.org/packages/06/d5/ee5a3366637fee41dee51a1fc91562dcf12ddbc68fda34e6b253da2324bb/pydantic_core-2.46.4-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:c1b3f518abeca3aa13c712fd202306e145abf59a18b094a6bafb2d2bbf59192c", size = 2129627, upload-time = "2026-05-06T13:37:25.033Z" },
+    { url = "https://files.pythonhosted.org/packages/94/33/2414be571d2c6a6c4d08be21f9292b6d3fdb08949a97b6dfe985017821db/pydantic_core-2.46.4-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1a7dd0b3ee80d90150e3495a3a13ac34dbcbfd4f012996a6a1d8900e91b5c0fb", size = 2179141, upload-time = "2026-05-06T13:37:14.046Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/79/7daa95be995be0eecc4cf75064cb33f9bbbfe3fe0158caf2f0d4a996a5c7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:3fb702cd90b0446a3a1c5e470bfa0dd23c0233b676a9099ddcc964fa6ca13898", size = 2184325, upload-time = "2026-05-06T13:36:53.615Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/cb/d0a382f5c0de8a222dc61c65348e0ce831b1f68e0a018450d31c2cace3a5/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:b8458003118a712e66286df6a707db01c52c0f52f7db8e4a38f0da1d3b94fc4e", size = 2323990, upload-time = "2026-05-06T13:40:29.971Z" },
+    { url = "https://files.pythonhosted.org/packages/05/db/d9ba624cc4a5aced1598e88c04fdbd8310c8a69b9d38b9a3d39ce3a61ed7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:372429a130e469c9cd698925ce5fc50940b7a1336b0d82038e63d5bbc4edc519", size = 2369978, upload-time = "2026-05-06T13:37:23.027Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/20/d15df15ba918c423461905802bfd2981c3af0bfa0e40d05e13edbfa48bc3/pydantic_core-2.46.4-cp314-cp314-win32.whl", hash = "sha256:85bb3611ff1802f3ee7fdd7dbff26b56f343fb432d57a4728fdd49b6ef35e2f4", size = 1966354, upload-time = "2026-05-06T13:38:03.499Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/b6/6b8de4c0a7d7ab3004c439c80c5c1e0a3e8d78bbae19379b01960383d9e5/pydantic_core-2.46.4-cp314-cp314-win_amd64.whl", hash = "sha256:811ff8e9c313ab425368bcbb36e5c4ebd7108c2bbf4e4089cfbb0b01eff63fac", size = 2072238, upload-time = "2026-05-06T13:39:40.807Z" },
+    { url = "https://files.pythonhosted.org/packages/32/36/51eb763beec1f4cf59b1db243a7dcc39cbb41230f050a09b9d69faaf0a48/pydantic_core-2.46.4-cp314-cp314-win_arm64.whl", hash = "sha256:bfec22eab3c8cc2ceec0248aec886624116dc079afa027ecc8ad4a7e62010f8a", size = 2018251, upload-time = "2026-05-06T13:37:26.72Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/91/855af51d625b23aa987116a19e231d2aaef9c4a415273ddc189b79a45fee/pydantic_core-2.46.4-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:af8244b2bef6aaad6d92cda81372de7f8c8d36c9f0c3ea36e827c60e7d9467a0", size = 2099593, upload-time = "2026-05-06T13:39:47.682Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/1b/8784a54c65edb5f49f0a14d6977cf1b209bba85a4c77445b255c2de58ab3/pydantic_core-2.46.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a4330cdbc57162e4b3aa303f588ba752257694c9c9be3e7ebb11b4aca659b5d", size = 1935226, upload-time = "2026-05-06T13:40:40.428Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/e7/1955d28d1afc56dd4b3ad7cc0cf39df1b9852964cf16e5d13912756d6d6b/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29c61fc04a3d840155ff08e475a04809278972fe6aef51e2720554e96367e34b", size = 1974605, upload-time = "2026-05-06T13:37:32.029Z" },
+    { url = "https://files.pythonhosted.org/packages/93/e2/3fedbf0ba7a22850e6e9fd78117f1c0f10f950182344d8a6c535d468fdd8/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c50f2528cf200c5eed56faf3f4e22fcd5f38c157a8b78576e6ba3168ec35f000", size = 2030777, upload-time = "2026-05-06T13:38:55.239Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/61/46be275fcaaba0b4f5b9669dd852267ce1ff616592dccf7a7845588df091/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0cbe8b01f948de4286c74cdd6c667aceb38f5c1e26f0693b3983d9d74887c65e", size = 2236641, upload-time = "2026-05-06T13:37:08.096Z" },
+    { url = "https://files.pythonhosted.org/packages/60/db/12e93e46a8bac9988be3c016860f83293daea8c716c029c9ace279036f2f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:617d7e2ca7dcb8c5cf6bcb8c59b8832c94b36196bbf1cbd1bfb56ed341905edd", size = 2286404, upload-time = "2026-05-06T13:40:20.221Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/4a/4d8b19008f38d31c53b8219cfedc2e3d5de5fe99d90076b7e767de29274f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7027560ee92211647d0d34e3f7cd6f50da56399d26a9c8ad0da286d3869a53f3", size = 2109219, upload-time = "2026-05-06T13:38:12.153Z" },
+    { url = "https://files.pythonhosted.org/packages/88/70/3cbc40978fefb7bb09c6708d40d4ad1a5d70fd7213c3d17f971de868ec1f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:f99626688942fb746e545232e7726926f3be91b5975f8b55327665fafda991c7", size = 2110594, upload-time = "2026-05-06T13:40:02.971Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/20/b8d36736216e29491125531685b2f9e61aa5b4b2599893f8268551da3338/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fc3e9034a63de20e15e8ade85358bc6efc614008cab72898b4b4952bea0509ff", size = 2159542, upload-time = "2026-05-06T13:39:27.506Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/a2/367df868eb584dacf6bf82a389272406d7178e301c4ac82545ab98bc2dd9/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:97e7cf2be5c77b7d1a9713a05605d49460d02c6078d38d8bef3cbe323c548424", size = 2168146, upload-time = "2026-05-06T13:38:31.93Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/b8/4460f77f7e201893f649a29ab355dddd3beee8a97bcb1a320db414f9a06e/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:3bf92c5d0e00fefaab325a4d27828fe6b6e2a21848686b5b60d2d9eeb09d76c6", size = 2306309, upload-time = "2026-05-06T13:37:44.717Z" },
+    { url = "https://files.pythonhosted.org/packages/64/c4/be2639293acd87dc8ddbcec41a73cee9b2ebf996fe6d892a1a74e88ad3f7/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:3ecbc122d18468d06ca279dc26a8c2e2d5acb10943bb35e36ae92096dc3b5565", size = 2369736, upload-time = "2026-05-06T13:37:05.645Z" },
+    { url = "https://files.pythonhosted.org/packages/30/a6/9f9f380dbb301f67023bf8f707aaa75daadf84f7152d95c410fd7e81d994/pydantic_core-2.46.4-cp314-cp314t-win32.whl", hash = "sha256:e846ae7835bf0703ae43f534ab79a867146dadd59dc9ca5c8b53d5c8f7c9ef02", size = 1955575, upload-time = "2026-05-06T13:38:51.116Z" },
+    { url = "https://files.pythonhosted.org/packages/40/1f/f1eb9eb350e795d1af8586289746f5c5677d16043040d63710e22abc43c9/pydantic_core-2.46.4-cp314-cp314t-win_amd64.whl", hash = "sha256:2108ba5c1c1eca18030634489dc544844144ee36357f2f9f780b93e7ddbb44b5", size = 2051624, upload-time = "2026-05-06T13:38:21.672Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/d2/42dd53d0a85c27606f316d3aa5d2869c4e8470a5ed6dec30e4a1abe19192/pydantic_core-2.46.4-cp314-cp314t-win_arm64.whl", hash = "sha256:4fcbe087dbc2068af7eda3aa87634eba216dbda64d1ae73c8684b621d33f6596", size = 2017325, upload-time = "2026-05-06T13:40:52.723Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/a4/73995fd4ebbb46ba0ee51e6fa049b8f02c40daebb762208feda8a6b7894d/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:14d4edf427bdcf950a8a02d7cb44a08614388dd6e1bdcbf4f67504fa7887da9c", size = 2111589, upload-time = "2026-05-06T13:37:10.817Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/7f/f37d3a5e8bfcc2e403f5c57a730f2d815693fb42119e8ea48b3789335af1/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:0ce40cd7b21210e99342afafbd4d0f76d784eb5b1d60f3bdc566be4983c6c73b", size = 1944552, upload-time = "2026-05-06T13:36:56.717Z" },
+    { url = "https://files.pythonhosted.org/packages/15/3c/d7eb777b3ff43e8433a4efb39a17aa8fd98a4ee8561a24a67ef5db07b2d6/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90884113d8b48f760e9587002789ddd741e76ab9f89518cd1e43b1f1a52ec44b", size = 1982984, upload-time = "2026-05-06T13:39:06.207Z" },
+    { url = "https://files.pythonhosted.org/packages/63/87/70b9f40170a81afd55ca26c9b2acb25c20d64bcfbf888fafecb3ba077d4c/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66ce7632c22d837c95301830e111ad0128a32b8207533b60896a96c4915192ea", size = 2138417, upload-time = "2026-05-06T13:39:45.476Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/1d/8987ad40f65ae1432753072f214fb5c74fe47ffbd0698bb9cbbb585664f8/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:1d8ba486450b14f3b1d63bc521d410ec7565e52f887b9fb671791886436a42f7", size = 2095527, upload-time = "2026-05-06T13:39:52.283Z" },
+    { url = "https://files.pythonhosted.org/packages/64/d3/84c282a7eee1d3ac4c0377546ef5a1ea436ce26840d9ac3b7ed54a377507/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:3009f12e4e90b7f88b4f9adb1b0c4a3d58fe7820f3238c190047209d148026df", size = 1936024, upload-time = "2026-05-06T13:40:15.671Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/ca/eac61596cdeb4d7e174d3dc0bd8a6238f14f75f97a24e7b7db4c7e7340a0/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad785e92e6dc634c21555edc8bd6b64957ab844541bcb96a1366c202951ae526", size = 1990696, upload-time = "2026-05-06T13:38:34.717Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/c3/7c8b240552251faf6b3a957db200fcfbbcec36763c050428b601e0c9b83b/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00c603d540afdd6b80eb39f078f33ebd46211f02f33e34a32d9f053bba711de0", size = 2147590, upload-time = "2026-05-06T13:39:29.883Z" },
+    { url = "https://files.pythonhosted.org/packages/11/cb/428de0385b6c8d44b716feba566abfacfbd23ee3c4439faa789a1456242f/pydantic_core-2.46.4-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:0c563b08bca408dc7f65f700633d8442fffb2421fc47b8101377e9fd65051ff0", size = 2112782, upload-time = "2026-05-06T13:37:04.016Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/b5/6a17bdadd0fc1f170adfd05a20d37c832f52b117b4d9131da1f41bb097ce/pydantic_core-2.46.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:db06ffe51636ffe9ca531fe9023dd64bdd794be8754cb5df57c5498ae5b518a7", size = 1952146, upload-time = "2026-05-06T13:39:43.092Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/dc/03734d80e362cd43ef65428e9de77c730ce7f2f11c60d2b1e1b39f0fbf99/pydantic_core-2.46.4-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:133878133d271ade3d41d1bfb2a45ec38dbdbda40bc065921c6b04e4630127e2", size = 2134492, upload-time = "2026-05-06T13:36:58.124Z" },
+    { url = "https://files.pythonhosted.org/packages/de/df/5e5ffc085ed07cc22d298134d3d911c63e91f6a0eb91fe646750a3209910/pydantic_core-2.46.4-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9bc519fbf2b7578398853d815009ae5e4d4603d12f4e3f91da8c06852d3da3e9", size = 2156604, upload-time = "2026-05-06T13:37:49.88Z" },
+    { url = "https://files.pythonhosted.org/packages/81/44/6e112a4253e56f5705467cbab7ab5e91ee7398ba3d56d358635958893d3e/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c7a7bd4e39e8e4c12c39cd480356842b6a8a06e41b23a55a5e3e191718838ddf", size = 2183828, upload-time = "2026-05-06T13:37:43.053Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/ad/5565071e937d8e752842ac241463944c9eb14c87e2d269f2658a5bd05e98/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:d396ec2b979760aaf3218e76c24e65bd0aca24983298653b3a9d7a45f9e47b30", size = 2310000, upload-time = "2026-05-06T13:37:56.694Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/c3/66883a5cec183e7fba4d024b4cbbe61851a63750ef606b0afecc46d1f2bf/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:86e1a4418c6cd97d60c95c71164158eaf7324fae7b0923264016baa993eba6fc", size = 2361286, upload-time = "2026-05-06T13:40:05.667Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/2d/69abac8f838090bbecd5df894befb2c2619e7996a98ddb949db9f3b93225/pydantic_core-2.46.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:d51026d73fcfd93610abc7b27789c26b313920fcfb20e27462d74a7f8b06e983", size = 2193071, upload-time = "2026-05-06T13:38:08.682Z" },
+]
+
+[[package]]
+name = "pygments"
+version = "2.20.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
+]
+
+[[package]]
+name = "pytest"
+version = "9.0.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
+]
+
+[[package]]
+name = "pyyaml"
+version = "6.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" },
+    { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" },
+    { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" },
+    { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" },
+    { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" },
+    { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" },
+    { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" },
+    { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" },
+    { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" },
+    { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" },
+    { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" },
+    { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" },
+    { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" },
+    { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" },
+    { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" },
+    { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" },
+    { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" },
+    { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" },
+    { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" },
+    { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" },
+    { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" },
+    { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" },
+    { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
+]
+
+[[package]]
+name = "rich"
+version = "15.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markdown-it-py" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680, upload-time = "2026-04-12T08:24:00.75Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" },
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
+]
+
+[[package]]
+name = "typing-inspection"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
+]