fix: batch-eval docs + preserve config-bundle placeholders on AB-test promote #1193
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Claude Security Review | |
| # This workflow inlines the security-review prompt rather than calling the | |
| # bundled /security-review slash command. The bundled skill silently bombs | |
| # whenever the runner's clone gets shallowed mid-run (claude-code-action's | |
| # restoreConfigFromBase does this on every PR by design — see | |
| # https://github.com/anthropics/claude-code-action/blob/v1/src/github/operations/restore-config.ts), | |
| # because its first action is `git diff origin/HEAD...` and a shallow clone | |
| # has no merge base. Computing the diff ourselves before the action starts | |
| # eliminates that whole class of failure. | |
| # Triggers: automatic runs on PR activity against the release branches, plus a | |
| # manual entry point that takes a PR number. | |
| on: | |
| pull_request_target: | |
| # 'labeled' is listed so that applying the 'safe-to-review' label can start a | |
| # review; the authorize job filters out every other label. | |
| types: [opened, reopened, synchronize, labeled] | |
| # Only review PRs targeting our two long-lived release branches. PRs | |
| # into short-lived feature branches don't need a security gate — they | |
| # get reviewed when those features are merged into main or | |
| # feat/summit_release. | |
| branches: | |
| - main | |
| - feat/summit_release | |
| # Manual run against an existing PR; the number is read by the "Resolve PR" step. | |
| workflow_dispatch: | |
| inputs: | |
| pr_number: | |
| description: | |
| PR number to review (workflow_dispatch will NOT post inline comments — use only for prompt smoke tests) | |
| required: true | |
| type: string | |
| # Scopes granted to the workflow's default GITHUB_TOKEN. | |
| permissions: | |
| # presumably needed for the role-to-assume exchange in "Configure AWS credentials" — confirm | |
| id-token: write | |
| # NOTE(review): the label/comment steps below authenticate with the GitHub App | |
| # token rather than GITHUB_TOKEN — confirm these two write scopes are still required. | |
| pull-requests: write | |
| issues: write | |
| contents: read | |
| concurrency: | |
| # Don't cancel-in-progress: a cancelled run that has already started its labels/checkout | |
| # but not the actual review still triggers always() steps and ends up posting a misleading | |
| # "no findings" summary (since the inline-comment buffer is empty when the analysis step | |
| # was skipped due to cancellation). Letting both runs complete is the safer default. | |
| # | |
| # One group per PR: the number comes from the event payload on pull_request_target | |
| # and from the pr_number input on workflow_dispatch. | |
| # NOTE(review): GitHub keeps at most one *pending* run per concurrency group, so with | |
| # three or more rapid pushes an intermediate queued run may still be cancelled — confirm | |
| # that is acceptable. | |
| group: pr-security-review-${{ github.event.pull_request.number || inputs.pr_number }} | |
| cancel-in-progress: false | |
| jobs: | |
| # Gate job: decides whether the review job is allowed to run for this event. | |
| authorize: | |
| runs-on: ubuntu-latest | |
| # On 'labeled' events, only proceed when the label is exactly 'safe-to-review'. | |
| # Other labels (e.g. size/m) are filtered out so we don't spawn API calls. | |
| if: | | |
| github.event_name != 'pull_request_target' || | |
| github.event.action != 'labeled' || | |
| github.event.label.name == 'safe-to-review' | |
| outputs: | |
| # The two steps below are mutually exclusive (one per event type), so at most | |
| # one of these outputs is ever set. | |
| authorized: ${{ steps.auth.outputs.authorized || steps.dispatch-auth.outputs.authorized }} | |
| steps: | |
| - name: Check authorization | |
| id: auth | |
| if: github.event_name == 'pull_request_target' | |
| uses: actions/github-script@v9 | |
| with: | |
| script: | | |
| // Whose permissions gate the run: the person who applied the label on | |
| // 'labeled' events, otherwise the PR author. | |
| const isLabel = context.payload.action === 'labeled'; | |
| const user = isLabel | |
| ? context.payload.sender.login | |
| : context.payload.pull_request.user.login; | |
| const reason = isLabel ? `labeler ${user}` : `PR author ${user}`; | |
| const { owner, repo } = context.repo; | |
| // True only for an ACTIVE member of the dev team. The membership endpoint also | |
| // answers 200 for invitations that are still pending, so a successful call on | |
| // its own is not proof of membership — the state has to be checked. | |
| // NOTE(review): this step uses the default token; confirm it can see the team, | |
| // otherwise this lookup always fails and the fallback below decides. | |
| const isActiveTeamMember = async () => { | |
| try { | |
| const { data } = await github.rest.teams.getMembershipForUserInOrg({ | |
| org: owner, | |
| team_slug: 'agentcore-cli-devs', | |
| username: user, | |
| }); | |
| return data.state === 'active'; | |
| } catch (error) { | |
| core.info(`Team membership lookup failed for ${reason} (${error.status || error.message}); falling back to repository permission.`); | |
| return false; | |
| } | |
| }; | |
| // Fallback: the user holds write or admin permission on this repository. | |
| const hasWriteAccess = async () => { | |
| try { | |
| const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ | |
| owner, | |
| repo, | |
| username: user, | |
| }); | |
| return ['write', 'admin'].includes(data.permission); | |
| } catch (error) { | |
| core.info(`Repository permission lookup failed for ${reason} (${error.status || error.message}).`); | |
| return false; | |
| } | |
| }; | |
| // Any lookup failure counts as "not authorized" (fail closed). | |
| const authorized = (await isActiveTeamMember()) || (await hasWriteAccess()); | |
| core.info(`Authorization for ${reason}: ${authorized ? 'granted' : 'denied'}`); | |
| core.setOutput('authorized', authorized ? 'true' : 'false'); | |
| # Manual runs are accepted as-is; no per-user check is performed here. | |
| - name: Auto-authorize workflow_dispatch | |
| id: dispatch-auth | |
| if: github.event_name == 'workflow_dispatch' | |
| run: echo "authorized=true" >> "$GITHUB_OUTPUT" | |
| # Main job: computes the PR diff, runs the model review, and posts the summary. | |
| review: | |
| needs: authorize | |
| # Runs only when the authorize job reported authorized == 'true'. | |
| if: needs.authorize.outputs.authorized == 'true' | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 30 | |
| env: | |
| # Exported to every step of this job. | |
| AWS_REGION: us-west-2 | |
| steps: | |
| # Creates a GitHub App token; the later github-script steps and the | |
| # claude-code-action step receive it as their token input. | |
| - name: Generate GitHub App token | |
| id: app-token | |
| uses: actions/create-github-app-token@v1 | |
| with: | |
| app-id: ${{ vars.APP_ID }} | |
| private-key: ${{ secrets.APP_PRIVATE_KEY }} | |
| # Resolves the PR under review and exposes its number, head SHA and base ref | |
| # as step outputs for the rest of the job. | |
| - name: Resolve PR | |
| id: pr | |
| uses: actions/github-script@v9 | |
| env: | |
| # Passed through env rather than interpolated into the script text. | |
| PR_NUMBER_INPUT: ${{ inputs.pr_number }} | |
| with: | |
| github-token: ${{ steps.app-token.outputs.token }} | |
| script: | | |
| // workflow_dispatch supplies the number as free-form text; PR events carry it | |
| // in the payload. | |
| const raw = context.eventName === 'workflow_dispatch' | |
| ? String(process.env.PR_NUMBER_INPUT || '').trim() | |
| : String(context.payload.pull_request.number); | |
| // Reject anything that is not a plain positive integer up front. A bare | |
| // parseInt would accept "12abc" as 12 and turn "" into NaN, which only | |
| // surfaces later as an opaque API error. | |
| if (!/^[1-9]\d*$/.test(raw)) { | |
| core.setFailed(`Invalid PR number: "${raw}"`); | |
| return; | |
| } | |
| const num = Number.parseInt(raw, 10); | |
| // Always re-read the PR so head SHA and base ref come from the API. | |
| const { data: pr } = await github.rest.pulls.get({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| pull_number: num, | |
| }); | |
| core.setOutput('number', num); | |
| core.setOutput('head_sha', pr.head.sha); | |
| core.setOutput('base_ref', pr.base.ref); | |
| # Marks the PR as "review in progress"; the label is removed again by the | |
| # final step of this job. | |
| - name: Add reviewing label | |
| uses: actions/github-script@v9 | |
| env: | |
| PR_NUMBER: ${{ steps.pr.outputs.number }} | |
| with: | |
| github-token: ${{ steps.app-token.outputs.token }} | |
| script: | | |
| const prNumber = parseInt(process.env.PR_NUMBER, 10); | |
| const { owner, repo } = context.repo; | |
| const name = 'claude-security-reviewing'; | |
| // Make sure the label exists with our colour/description before applying it. | |
| try { | |
| await github.rest.issues.getLabel({ owner, repo, name }); | |
| } catch (e) { | |
| if (e.status === 404) { | |
| try { | |
| await github.rest.issues.createLabel({ | |
| owner, | |
| repo, | |
| name, | |
| color: 'D73A4A', | |
| description: 'Claude Code security review in progress', | |
| }); | |
| } catch (createError) { | |
| // 422 here means a run for another PR created the label between our | |
| // lookup and our create; that is fine. Anything else is a real error. | |
| if (createError.status !== 422) { | |
| throw createError; | |
| } | |
| } | |
| } else { | |
| // Best effort: keep going, but leave a trace instead of swallowing it. | |
| core.warning(`Could not look up label '${name}': ${e.message}`); | |
| } | |
| } | |
| await github.rest.issues.addLabels({ | |
| owner, | |
| repo, | |
| issue_number: prNumber, | |
| labels: [name], | |
| }); | |
| # Checks out the exact head commit resolved by the "Resolve PR" step. | |
| # NOTE(review): this is PR-author-controlled content checked out in a | |
| # pull_request_target run; consider `persist-credentials: false` if the | |
| # later `git fetch` in "Compute diff" works without stored credentials — confirm. | |
| - name: Checkout PR head | |
| uses: actions/checkout@v6 | |
| with: | |
| ref: ${{ steps.pr.outputs.head_sha }} | |
| # Full history: the three-dot diff in "Compute diff" needs a merge base. | |
| fetch-depth: 0 | |
| # Freezes the base...head diff into /tmp/pr.diff and reports its size | |
| # (bytes) and number of changed files as step outputs. | |
| - name: Compute diff | |
| id: diff | |
| env: | |
| BASE_REF: ${{ steps.pr.outputs.base_ref }} | |
| run: | | |
| set -euo pipefail | |
| # This must happen *before* claude-code-action starts: on startup its | |
| # restoreConfigFromBase() runs `git fetch --depth=1` against the base | |
| # branch, which strips history and would break any later base-vs-head | |
| # diff. Producing the file here gives the model a frozen artifact that | |
| # nothing the action does afterwards can invalidate. | |
| RANGE="origin/$BASE_REF...HEAD" | |
| DIFF_FILE=/tmp/pr.diff | |
| git fetch --no-tags origin "+refs/heads/$BASE_REF:refs/remotes/origin/$BASE_REF" | |
| git diff "$RANGE" > "$DIFF_FILE" | |
| DIFF_SIZE=$(wc -c < "$DIFF_FILE") | |
| CHANGED=$(git diff --name-only "$RANGE" | wc -l | tr -d ' ') | |
| { | |
| echo "bytes=$DIFF_SIZE" | |
| echo "files=$CHANGED" | |
| } >> "$GITHUB_OUTPUT" | |
| echo "Diff: $DIFF_SIZE bytes across $CHANGED files" | |
| # Writes the review prompt to $RUNNER_TEMP/prompt/prompt.md. Skipped when the | |
| # computed diff is empty. | |
| - name: Build prompt | |
| if: steps.diff.outputs.bytes != '0' | |
| run: | | |
| set -euo pipefail | |
| mkdir -p "$RUNNER_TEMP/prompt" | |
| # The heredoc delimiter is quoted, so the text below is written verbatim: | |
| # backticks and $ in the prompt are not expanded by the shell. | |
| cat > "$RUNNER_TEMP/prompt/prompt.md" <<'PROMPT_EOF' | |
| You are performing a HIGH-CONFIDENCE security code review of a pull | |
| request. The complete diff is at `/tmp/pr.diff` — read it first | |
| using the Read tool. That file is the ground truth for what the | |
| PR changes; do not run `git diff` or any other git commands. To | |
| understand context — callers of a changed function, existing | |
| sanitization patterns, the project's threat model — use Grep, | |
| Glob, and Read against the repository working tree. Do not use | |
| Bash. | |
| # OBJECTIVE | |
| Identify HIGH-CONFIDENCE security vulnerabilities newly introduced | |
| by this PR that have real exploitation potential. This is NOT a | |
| general code review. Focus ONLY on security implications added by | |
| the PR. Do not comment on pre-existing issues. | |
| # CRITICAL INSTRUCTIONS | |
| 1. MINIMIZE FALSE POSITIVES: Only flag issues where you are >80% | |
| confident of actual exploitability. | |
| 2. AVOID NOISE: Skip theoretical issues, style concerns, or | |
| low-impact findings. | |
| 3. FOCUS ON IMPACT: Prioritize vulnerabilities that lead to | |
| unauthorized access, data breach, or system compromise. | |
| 4. DO NOT report any of: | |
| - Denial of service / resource exhaustion / rate limiting | |
| - Secrets at rest on disk (handled by other tooling) | |
| - Memory consumption or CPU exhaustion | |
| # CATEGORIES TO EXAMINE | |
| **Input validation**: SQL injection, command injection, XXE, | |
| template injection, NoSQL injection, path traversal. | |
| **AuthN/AuthZ**: authentication bypass, privilege escalation, | |
| session/JWT flaws, authorization-logic bypasses. | |
| **Crypto & secrets**: hardcoded keys/passwords/tokens, weak | |
| algorithms, improper key storage, weak randomness, certificate | |
| validation bypass. | |
| **Code execution**: deserialization RCE (pickle, YAML, etc.), | |
| eval injection, XSS (reflected/stored/DOM) — only in unsafe paths | |
| (see precedents). | |
| **Data exposure**: sensitive logging, PII handling violations, | |
| API leakage, debug-info exposure. | |
| A finding can still be HIGH severity if only exploitable from the | |
| local network. | |
| # METHODOLOGY | |
| Phase 1 — Repository context: identify existing security | |
| libraries/frameworks, sanitization patterns, the project's threat | |
| model. Use search tools. | |
| Phase 2 — Comparative analysis: compare new changes against | |
| established patterns; flag deviations and net-new attack surface. | |
| Phase 3 — Vulnerability assessment: for each modified file, | |
| trace user input → sensitive operations, look for unsafe privilege | |
| boundary crossings, identify injection points. | |
| # FALSE-POSITIVE FILTER (apply hard) | |
| Read the code (Read/Grep/Glob); do not run commands to reproduce | |
| or write files. | |
| HARD EXCLUSIONS — drop any finding matching: | |
| 1. DoS / resource exhaustion. | |
| 2. Secrets/credentials on disk if otherwise secured. | |
| 3. Rate limiting or service overload. | |
| 4. Memory/CPU exhaustion. | |
| 5. Missing input validation on non-security-critical fields. | |
| 6. Input sanitization in GitHub Actions workflows unless clearly | |
| triggerable via untrusted input. | |
| 7. Lack of hardening; only flag concrete vulns. | |
| 8. Theoretical race conditions or timing attacks. | |
| 9. Outdated third-party libraries (handled separately). | |
| 10. Memory-safety issues in memory-safe languages (Rust, Go, | |
| JS/TS, Python). | |
| 11. Files that are unit tests or test-only. | |
| 12. Log spoofing — un-sanitized user input to logs is not a vuln. | |
| 13. SSRF that only controls the path (host/protocol control is | |
| required). | |
| 14. User-controlled content in AI system prompts is not a vuln. | |
| 15. Regex injection. | |
| 16. Regex DoS. | |
| 17. Insecure documentation (.md and similar). | |
| 18. Lack of audit logs. | |
| PRECEDENTS: | |
| 1. Plaintext-logging high-value secrets IS a vuln; logging URLs | |
| is assumed safe. | |
| 2. UUIDs are unguessable and need no validation. | |
| 3. Env vars and CLI flags are trusted inputs. | |
| 4. Resource leaks (memory, fd) are not vulns. | |
| 5. Tabnabbing, XS-Leaks, prototype pollution, open redirects: | |
| only with extremely high confidence. | |
| 6. React / Angular: do not report XSS in components or .tsx files | |
| unless using `dangerouslySetInnerHTML`, | |
| `bypassSecurityTrustHtml`, or equivalents. | |
| 7. GitHub Actions workflow vulns: only when a concrete attack | |
| path through untrusted input exists. | |
| 8. Missing AuthN/AuthZ in client-side code is not a vuln — | |
| validation is the server's job. | |
| 9. MEDIUM findings only when obvious and concrete. | |
| 10. .ipynb notebook vulns: only with a concrete attack path | |
| through untrusted input. | |
| 11. Logging non-PII data is not a vuln. Only flag when the data | |
| is secrets, passwords, or PII. | |
| 12. Command injection in shell scripts: only when there is a | |
| concrete attack path through untrusted input. | |
| For each surviving finding, score confidence 1–10: | |
| - 1–3: low / likely noise — drop | |
| - 4–6: medium — drop unless obvious and concrete | |
| - 7–10: high — keep | |
| # PROCESS | |
| Run this in three steps, exactly: | |
| 1. Spawn a Task sub-agent to identify candidate vulnerabilities. | |
| Pass the full instructions above (objective, categories, | |
| methodology, hard exclusions, precedents). Have it return a | |
| structured list of candidates with file/line/category/ | |
| description/exploit/fix. | |
| 2. For EACH candidate from step 1, spawn an independent Task | |
| sub-agent IN PARALLEL to adversarially verify it. Each | |
| verifier gets the full FALSE-POSITIVE FILTER above and is | |
| told to default to "drop" if uncertain. Each returns a | |
| confidence score 1–10. | |
| 3. Drop any finding with confidence < 8. For every finding that | |
| survives, call: | |
| mcp__github_inline_comment__create_inline_comment | |
| with `{ path, line, body }` pointing at the exact file and | |
| line in the diff. The body should follow: | |
| **<Severity>: <Category>** | |
| <One-paragraph description of the issue and concrete | |
| exploit scenario.> | |
| **Recommendation:** <One-sentence fix.> | |
| Do NOT post a single summary comment listing all findings — | |
| the workflow handles a top-level summary after this run | |
| completes. If zero findings survive step 3, exit without | |
| calling any tool. | |
| Begin. | |
| PROMPT_EOF | |
| # Assumes the Bedrock review role so the model call in "Run Claude Code" can | |
| # authenticate. Skipped when the computed diff is empty. | |
| - name: Configure AWS credentials | |
| if: steps.diff.outputs.bytes != '0' | |
| uses: aws-actions/configure-aws-credentials@v6 | |
| with: | |
| role-to-assume: ${{ secrets.BEDROCK_SECURITY_REVIEW_ROLE_ARN }} | |
| # Reuse the job-level AWS_REGION so the region is defined in one place only. | |
| aws-region: ${{ env.AWS_REGION }} | |
| - name: Load prompt into env | |
| id: load-prompt | |
| if: steps.diff.outputs.bytes != '0' | |
| # claude-code-action takes the prompt as an inline string (`prompt:`), not | |
| # as a file path: passing prompt_file silently no-ops, the action finds no | |
| # trigger and skips the run with "No trigger found". So the prompt file is | |
| # copied into the PROMPT_BODY environment variable here and handed to the | |
| # action inline in the next step. Multi-line values go through GITHUB_ENV | |
| # using the heredoc form with a randomized sentinel. | |
| env: | |
| PROMPT_FILE: ${{ runner.temp }}/prompt/prompt.md | |
| run: | | |
| set -euo pipefail | |
| # Random sentinel so that no line of the prompt can end the value early. | |
| DELIM="EOF_$(uuidgen)" | |
| printf 'PROMPT_BODY<<%s\n' "$DELIM" >> "$GITHUB_ENV" | |
| cat "$PROMPT_FILE" >> "$GITHUB_ENV" | |
| printf '%s\n' "$DELIM" >> "$GITHUB_ENV" | |
| # Runs the review through claude-code-action on Bedrock, using the prompt that | |
| # the previous step placed in PROMPT_BODY. Skipped when the computed diff is empty. | |
| - name: Run Claude Code | |
| id: review | |
| if: steps.diff.outputs.bytes != '0' | |
| uses: anthropics/claude-code-action@v1 | |
| with: | |
| github_token: ${{ steps.app-token.outputs.token }} | |
| use_bedrock: 'true' | |
| prompt: ${{ env.PROMPT_BODY }} | |
| show_full_output: 'true' | |
| # Read/Grep/Glob let the model explore the repo for context | |
| # (existing sanitization patterns, threat model, callers of a | |
| # changed function). Task is needed for the candidate-finder and | |
| # the parallel verifier sub-agents in steps 1 and 2 of the | |
| # prompt's PROCESS section. The github_inline_comment MCP tool is | |
| # the output channel; allow-listing it is also what tells the | |
| # action to attach the inline-comment MCP server. Bash is | |
| # intentionally NOT allowed: the prompt forbids running | |
| # commands, and keeping Bash off the list makes the diff at | |
| # /tmp/pr.diff the only ground truth (no `git diff` re-runs | |
| # against a possibly-shallow clone). | |
| claude_args: >- | |
| --model us.anthropic.claude-opus-4-7 --max-turns 60 --allowedTools "Read Grep Glob Task | |
| mcp__github_inline_comment__create_inline_comment" | |
| # Inspects the action's execution transcript and sets: | |
| # ran = true | false | unknown | |
| # num_turns = turns reported by the final transcript entry | |
| # The step fails when the model errored, took no turns, or produced no output tokens. | |
| - name: Verify model ran productively | |
| id: model-ran | |
| # `!cancelled()` is required: an `if:` without a status-check function gets an | |
| # implicit `success()`, which would skip this step whenever the review step | |
| # failed and make the `== 'failure'` arm below unreachable. | |
| if: | |
| ${{ !cancelled() && steps.diff.outputs.bytes != '0' && (steps.review.conclusion == 'success' || | |
| steps.review.conclusion == 'failure') }} | |
| env: | |
| # Prefer the path the action reports; otherwise fall back to a fixed file name under runner.temp. | |
| OUTPUT_JSON: | |
| ${{ steps.review.outputs.execution_file || format('{0}/claude-execution-output.json', runner.temp) }} | |
| run: | | |
| set -euo pipefail | |
| # No (or empty) transcript: we cannot tell either way, so report "unknown" | |
| # and do not fail the step. | |
| if [ ! -s "$OUTPUT_JSON" ]; then | |
| echo "::warning::No execution transcript at $OUTPUT_JSON — cannot verify" | |
| echo "ran=unknown" >> "$GITHUB_OUTPUT" | |
| echo "num_turns=0" >> "$GITHUB_OUTPUT" | |
| exit 0 | |
| fi | |
| # The summary fields are read from the last element of the transcript array. | |
| NUM_TURNS=$(jq -r '.[-1].num_turns // 0' "$OUTPUT_JSON") | |
| IS_ERROR=$(jq -r '.[-1].is_error // false' "$OUTPUT_JSON") | |
| OUTPUT_TOKENS=$(jq -r '.[-1].usage.output_tokens // 0' "$OUTPUT_JSON") | |
| echo "num_turns=$NUM_TURNS, is_error=$IS_ERROR, output_tokens=$OUTPUT_TOKENS" | |
| echo "num_turns=$NUM_TURNS" >> "$GITHUB_OUTPUT" | |
| if [ "$IS_ERROR" = "true" ] || [ "$NUM_TURNS" = "0" ] || [ "$OUTPUT_TOKENS" = "0" ]; then | |
| # Dump the tail of the user/system messages to help diagnose the failure. | |
| echo "::group::Last messages from SDK transcript" | |
| jq -r '.[] | select(.type == "user" or .type == "system") | .message.content // .subtype' "$OUTPUT_JSON" | tail -40 | |
| echo "::endgroup::" | |
| echo "::error::Model did not run productively (turns=$NUM_TURNS, output_tokens=$OUTPUT_TOKENS, is_error=$IS_ERROR)" | |
| echo "ran=false" >> "$GITHUB_OUTPUT" | |
| exit 1 | |
| fi | |
| echo "ran=true" >> "$GITHUB_OUTPUT" | |
| # Counts the lines of the inline-comment buffer file and exposes the result | |
| # as the `count` output (0 when the file is missing or empty). | |
| - name: Count findings | |
| id: findings | |
| # `!cancelled()` is required: without a status-check function the condition | |
| # gets an implicit `success()`, so this step would be skipped after a failed | |
| # review (or a failed verification step) and the `== 'failure'` arm could | |
| # never apply. | |
| if: | |
| ${{ !cancelled() && steps.diff.outputs.bytes != '0' && (steps.review.conclusion == 'success' || | |
| steps.review.conclusion == 'failure') }} | |
| run: | | |
| set -euo pipefail | |
| BUFFER=/tmp/inline-comments-buffer.jsonl | |
| # One line per buffered comment; a missing or empty file means zero. | |
| if [ -s "$BUFFER" ]; then | |
| COUNT=$(wc -l < "$BUFFER" | tr -d ' ') | |
| else | |
| COUNT=0 | |
| fi | |
| echo "count=$COUNT" >> "$GITHUB_OUTPUT" | |
| echo "Buffered findings: $COUNT" | |
| # Always posts exactly one top-level PR comment describing how the run ended. | |
| - name: Post summary | |
| if: always() | |
| uses: actions/github-script@v9 | |
| env: | |
| PR_NUMBER: ${{ steps.pr.outputs.number }} | |
| FINDING_COUNT: ${{ steps.findings.outputs.count }} | |
| REVIEW_CONCLUSION: ${{ steps.review.conclusion }} | |
| MODEL_RAN: ${{ steps.model-ran.outputs.ran }} | |
| NUM_TURNS: ${{ steps.model-ran.outputs.num_turns }} | |
| DIFF_BYTES: ${{ steps.diff.outputs.bytes }} | |
| DIFF_FILES: ${{ steps.diff.outputs.files }} | |
| RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} | |
| with: | |
| github-token: ${{ steps.app-token.outputs.token }} | |
| script: | | |
| // PR_NUMBER is empty when "Resolve PR" never produced an output; fall back to | |
| // the event payload and give up quietly if no number is available at all. | |
| const parsedPr = Number.parseInt(process.env.PR_NUMBER || '', 10); | |
| const payloadPr = (context.payload.pull_request || {}).number; | |
| const prNumber = Number.isNaN(parsedPr) ? payloadPr : parsedPr; | |
| if (!prNumber) { | |
| core.warning('No PR number available; skipping the summary comment.'); | |
| return; | |
| } | |
| const count = parseInt(process.env.FINDING_COUNT || '0', 10); | |
| const conclusion = process.env.REVIEW_CONCLUSION || 'skipped'; | |
| const modelRan = process.env.MODEL_RAN || 'unknown'; | |
| const numTurns = process.env.NUM_TURNS || '0'; | |
| const runUrl = process.env.RUN_URL; | |
| // An empty string means the diff step never reported a size (it failed or was | |
| // never reached). That must not be confused with a genuine 0-byte diff. | |
| const rawDiffBytes = process.env.DIFF_BYTES || ''; | |
| const reviewFinished = conclusion === 'success' || conclusion === 'failure'; | |
| let body; | |
| if (rawDiffBytes === '') { | |
| body = `**Claude Security Review:** the review could not start — setup failed before the diff was computed. See the [run](${runUrl}) for details; a later push or re-run is needed.`; | |
| } else if (parseInt(rawDiffBytes, 10) === 0) { | |
| body = `**Claude Security Review:** PR has an empty diff against base — nothing to review. ([run](${runUrl}))`; | |
| } else if (!reviewFinished) { | |
| // Checked before modelRan: the verification step only runs after a finished | |
| // review, so for skipped/cancelled runs modelRan is always 'unknown'. | |
| body = `**Claude Security Review:** the review run was ${conclusion} before analysis could complete. See the [run](${runUrl}); a later run on this PR will replace this status.`; | |
| } else if (modelRan !== 'true') { | |
| body = `**Claude Security Review:** the review did not analyze this PR (model took ${numTurns} turn${numTurns === '1' ? '' : 's'}). See the [run](${runUrl}) for details; a later push or re-run is needed.`; | |
| } else if (conclusion === 'success') { | |
| body = count > 0 | |
| ? `**Claude Security Review:** posted ${count} inline finding${count === 1 ? '' : 's'} on this PR. ([run](${runUrl}))` | |
| : `**Claude Security Review:** no high-confidence findings. ([run](${runUrl}))`; | |
| } else { | |
| body = `**Claude Security Review:** the review run failed before completing. See the [run](${runUrl}) for details.`; | |
| } | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: prNumber, | |
| body, | |
| }); | |
| # Cleanup: always try to take the in-progress label off the PR again. | |
| - name: Remove reviewing label | |
| if: always() | |
| uses: actions/github-script@v9 | |
| env: | |
| PR_NUMBER: ${{ steps.pr.outputs.number }} | |
| with: | |
| github-token: ${{ steps.app-token.outputs.token }} | |
| script: | | |
| const issue_number = parseInt(process.env.PR_NUMBER, 10); | |
| // Best effort: a failure here is logged and never fails the job. | |
| try { | |
| const { owner, repo } = context.repo; | |
| const name = 'claude-security-reviewing'; | |
| await github.rest.issues.removeLabel({ owner, repo, issue_number, name }); | |
| } catch (error) { | |
| console.log('Label removal failed (may not exist):', error.message); | |
| } | |