Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 28 additions & 7 deletions .github/workflows/approve-override.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
name: Approve Override

# workflow_dispatch — only board members can trigger this.
# IAM trust condition on javabin-ci-override-approver verifies the actor.
# Workflow dispatch — only runs after approval from the 'override-approval'
# GitHub environment (requires board member review).

on:
workflow_dispatch:
inputs:
plan_key:
description: "S3 plan key (from the Slack alert)"
required: true
type: string
repo:
description: "Repository (e.g. javaBin/moresleep)"
required: true
type: string
sha:
description: "Commit SHA to override"
run_id:
description: "Failed apply run ID to retrigger"
required: true
type: string
reason:
Expand All @@ -22,6 +26,7 @@ on:
permissions:
id-token: write
contents: read
actions: write

env:
AWS_ACCOUNT_ID: "553637109631"
Expand All @@ -30,6 +35,7 @@ env:
jobs:
approve:
runs-on: ubuntu-latest
environment: override-approval
steps:
- uses: actions/checkout@v4
with:
Expand All @@ -41,10 +47,25 @@ jobs:
role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT_ID }}:role/javabin-ci-override-approver
aws-region: ${{ env.AWS_REGION }}

- name: Write override token
run: sh scripts/write-override-token.sh "${{ inputs.repo }}" "${{ inputs.sha }}" "${{ github.actor }}" "${{ inputs.reason }}"
# Ask the gate Lambda to sign an override for the given plan (`sign` action).
# Workflow inputs are handed to the script through `env` instead of being
# expanded into the script text, so user-supplied values (notably the
# free-form reason) are treated as data and can never be parsed as shell code.
- name: Sign override via gate Lambda
  env:
    PLAN_KEY: ${{ inputs.plan_key }}
    APPROVED_BY: ${{ github.actor }}
    REASON: ${{ inputs.reason }}
  run: |
    payload=$(jq -n \
      --arg action sign \
      --arg plan_key "$PLAN_KEY" \
      --arg approved_by "$APPROVED_BY" \
      --arg reason "$REASON" \
      '{action: $action, plan_key: $plan_key, approved_by: $approved_by, reason: $reason}')

    # `aws lambda invoke` exits 0 even when the function itself raised an
    # error, so capture the invocation metadata and fail the step explicitly;
    # otherwise the following steps would announce an approval that never
    # happened.
    response_file="${RUNNER_TEMP}/apply-gate-sign-response.json"
    meta=$(aws lambda invoke \
      --function-name javabin-apply-gate \
      --payload "$payload" \
      --cli-binary-format raw-in-base64-out \
      --output json \
      "$response_file")

    # Keep the Lambda response and invocation metadata in the job log.
    cat "$response_file"
    echo "$meta"

    if echo "$meta" | jq -e 'has("FunctionError")' >/dev/null; then
      echo "::error::Gate Lambda failed to sign the override"
      exit 1
    fi

- name: Notify Slack
env:
SSM_WEBHOOK_PARAM: /javabin/slack/platform-override-alerts-webhook
run: sh scripts/notify-slack.sh "Risk Override Approved" "*Repo:* ${{ inputs.repo }}\n*SHA:* \`${{ inputs.sha }}\`\n*By:* ${{ github.actor }}\n*Reason:* ${{ inputs.reason }}" "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" "View Approval Run"
run: sh scripts/notify-slack.sh "Risk Override Approved" "*Repo:* ${{ inputs.repo }}\n*Plan:* \`${{ inputs.plan_key }}\`\n*By:* ${{ github.actor }}\n*Reason:* ${{ inputs.reason }}"

# Re-run only the failed jobs of the blocked apply run so it can pick up the
# freshly signed override.
# Inputs are passed via `env` and quoted in the command so they cannot be
# interpreted as extra shell words or commands.
# NOTE(review): the default GITHUB_TOKEN is scoped to the repository this
# workflow runs in; re-running a run that lives in another repository
# (inputs.repo) presumably needs a GitHub App or PAT token — confirm.
- name: Retrigger failed apply
  env:
    GH_TOKEN: ${{ github.token }}
    RUN_ID: ${{ inputs.run_id }}
    TARGET_REPO: ${{ inputs.repo }}
  run: gh run rerun "$RUN_ID" --repo "$TARGET_REPO" --failed
1 change: 0 additions & 1 deletion .github/workflows/javabin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ jobs:
with:
plan_key: ${{ needs.tf-plan.outputs.plan_key }}
plan_sha256: ${{ needs.tf-plan.outputs.plan_sha256 }}
risk_level: ${{ needs.plan-review.outputs.risk_level || 'UNKNOWN' }}
secrets: inherit

# --------------------------------------------------------------------------
Expand Down
16 changes: 15 additions & 1 deletion .github/workflows/plan-review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ jobs:
REVIEW_RESULT_PATH: review-result.json
run: sh platform/scripts/extract-review-risk.sh platform/scripts/review-plan.py plan-output.txt

# Store the review verdict next to the plan artifact as risk.json.
# The JSON is built with jq from environment variables rather than by string
# concatenation, so a risk level or plan key containing quotes or shell
# metacharacters cannot corrupt the document or inject commands.
# PLAN_BUCKET is expected to be provided by the surrounding job environment.
- name: Upload risk assessment to S3
  env:
    PLAN_KEY: ${{ inputs.plan_key }}
    RISK_LEVEL: ${{ steps.review.outputs.risk_level }}
  run: |
    plan_dir=$(dirname "$PLAN_KEY")
    # -c keeps the compact {"level":"…","reviewed_at":"…"} shape.
    jq -cn \
      --arg level "$RISK_LEVEL" \
      --arg reviewed_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
      '{level: $level, reviewed_at: $reviewed_at}' | \
      aws s3 cp - "s3://${PLAN_BUCKET}/${plan_dir}/risk.json" --content-type application/json

- name: Post review to PR
if: github.event_name == 'pull_request'
env:
Expand All @@ -73,4 +79,12 @@ jobs:

- name: Alert Slack on HIGH risk
if: github.event_name == 'push' && github.ref == 'refs/heads/main' && steps.review.outputs.risk_level == 'HIGH'
run: sh platform/scripts/notify-high-risk.sh /javabin/slack/platform-override-alerts-webhook "https://github.com/javaBin/platform/actions/workflows/approve-override.yml"
env:
SSM_WEBHOOK_PARAM: /javabin/slack/platform-override-alerts-webhook
PLAN_KEY: ${{ inputs.plan_key }}
run: |
OVERRIDE_URL="https://github.com/javaBin/platform/actions/workflows/approve-override.yml"
RUN_URL="https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
sh platform/scripts/notify-slack.sh \
"Deploy Blocked — HIGH Risk Plan" \
"*Repo:* ${GITHUB_REPOSITORY}\n*Plan:* \`${PLAN_KEY}\`\n<${OVERRIDE_URL}|Approve Override> | <${RUN_URL}|View Run>"
21 changes: 11 additions & 10 deletions .github/workflows/tf-apply.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ on:
description: "SHA256 hash of the plan artifact"
type: string
required: true
risk_level:
description: "Risk level from LLM review"
type: string
required: true
aws_account_id:
description: "AWS account ID"
type: string
Expand All @@ -35,7 +31,6 @@ permissions:
jobs:
apply:
runs-on: ubuntu-latest
environment: production
env:
PLAN_BUCKET: javabin-ci-plan-artifacts-${{ inputs.aws_account_id }}
steps:
Expand All @@ -62,15 +57,14 @@ jobs:
path: .platform
sparse-checkout: scripts

- name: Configure AWS credentials via OIDC
# Step 1: Lightweight OIDC role — can only invoke gate Lambda + read S3
- name: Configure gate credentials via OIDC
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::${{ inputs.aws_account_id }}:role/javabin-ci-app-${{ github.event.repository.name }}
role-to-assume: arn:aws:iam::${{ inputs.aws_account_id }}:role/javabin-ci-apply-gate
aws-region: ${{ inputs.aws_region }}

- name: Check risk level
run: sh .platform/scripts/check-risk-gate.sh "${{ inputs.risk_level }}" "${{ github.repository }}" "${{ github.sha }}" /javabin/slack/platform-override-alerts-webhook

# Step 2: Download and verify plan
- name: Download plan from S3
working-directory: ${{ inputs.tf_root }}
run: aws s3 cp "s3://${PLAN_BUCKET}/${{ inputs.plan_key }}" tfplan
Expand All @@ -79,6 +73,13 @@ jobs:
working-directory: ${{ inputs.tf_root }}
run: sh "${{ github.workspace }}/.platform/scripts/verify-plan.sh" tfplan "${{ inputs.plan_sha256 }}"

# Step 3: Invoke gate Lambda — checks risk, verifies override if needed, returns temp credentials
- name: Request apply credentials from gate
env:
SSM_WEBHOOK_PARAM: /javabin/slack/platform-override-alerts-webhook
run: sh .platform/scripts/invoke-apply-gate.sh "${{ inputs.plan_key }}" "${{ github.event.repository.name }}"

# Step 4: Apply with credentials from the gate Lambda
- name: Terraform Init
working-directory: ${{ inputs.tf_root }}
run: terraform init -input=false
Expand Down
5 changes: 3 additions & 2 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ Migration happens later per-app at developer's pace — apps move from old ALB/E
| `docs/bootstrap-runbook.md` | State backend bootstrap procedure |
| `docs/org-runbook.md` | AWS Organizations setup procedure |
| `docs/cognito-google-setup.md` | Cognito + Google Workspace IdP setup |
| `docs/apply-gate.md` | Apply gate: credential broker, HMAC overrides, security model |

### Terraform — Platform (CI-applied)
```
Expand All @@ -115,7 +116,7 @@ terraform/platform/
iam/ GitHub OIDC, CI roles, permission boundary
compute/ ECS cluster, ECR base config
monitoring/ SNS, EventBridge, Config, GuardDuty, Security Hub
lambdas/ slack-alert, cost-report, daily-cost-check, compliance-reporter, override-cleanup, team-provisioner
lambdas/ slack-alert, cost-report, daily-cost-check, compliance-reporter, override-cleanup, team-provisioner, apply-gate
identity/ Cognito user pools (internal + external). Identity Center is in terraform/org/
```

Expand Down Expand Up @@ -192,7 +193,7 @@ terraform/state/
| `scripts/provision-teams.py` | CI: fetch team YAMLs from registry, invoke team-provisioner Lambda |
| `scripts/review-plan.py` | CI: LLM plan review via Bedrock |
| `scripts/notify-slack.py` | CI: generic Slack webhook notification |
| `scripts/check-risk-gate.sh` | CI: check risk level, consume override token or block |
| `scripts/invoke-apply-gate.sh` | CI: invoke gate Lambda for apply credentials |
| `scripts/run-plan.sh` | CI: terraform plan with exit code handling |
| `scripts/upload-plan.sh` | CI: upload plan artifact to S3 |

Expand Down
113 changes: 113 additions & 0 deletions docs/apply-gate.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Apply Gate — Credential-Brokered Terraform Apply

## Overview

Terraform apply does NOT have direct IAM permissions to modify infrastructure.
Instead, a Lambda (`javabin-apply-gate`) acts as a credential broker: it verifies
the risk assessment and override status before issuing short-lived STS credentials.

## Why

If tf-apply had direct IAM permissions, the risk gate would be nothing more
than a shell `if` statement guarding credentials that are already present in
the job. A compromised or modified workflow step could simply skip the check.
With the gate Lambda, apply credentials are only issued after verification.

## Flow

```
tf-plan → uploads plan + plan-output.txt to S3
plan-review → LLM reviews plan → uploads risk.json to S3
tf-apply → assumes javabin-ci-apply-gate (lightweight OIDC role)
→ downloads plan from S3, verifies SHA256
→ invokes gate Lambda: check(plan_key, repo_name)
→ Lambda reads risk.json from S3
→ LOW/MEDIUM: assumes app CI role via STS, returns temp credentials
→ HIGH: checks for override.json, verifies HMAC signature
→ valid override: issues credentials
→ no valid override: returns 403, apply fails
→ tf-apply uses temp credentials for terraform init + apply
```

## IAM Roles

| Role | Purpose | Permissions |
|------|---------|-------------|
| `javabin-ci-apply-gate` | OIDC role for tf-apply workflow | `lambda:InvokeFunction` on gate Lambda + `s3:GetObject` on plan bucket |
| `javabin-apply-gate` | Lambda execution role | `ssm:GetParameter` (signing key) + `s3:Get/PutObject` (plan bucket) + `sts:AssumeRole` (app CI roles) |
| `javabin-ci-app-{repo}` | App infrastructure role | Trusted by OIDC (plan/review) AND by gate Lambda role (apply) |

## Credential Lifetime

- Temp credentials issued by the gate Lambda have a **15-minute TTL** (`CREDENTIAL_DURATION=900`)
- This is enough for `terraform init + apply` on most plans
- Credentials are scoped to the specific app's CI role (`javabin-ci-app-{repo}`)

## Risk Levels

| Level | Action |
|-------|--------|
| LOW | Auto-approve, credentials issued |
| MEDIUM | Auto-approve, credentials issued |
| HIGH | Blocked until override token exists and HMAC is valid |
| FAILED | Blocked (review script error) |

## Override Tokens

An override token is an HMAC-signed JSON file stored in S3 alongside the plan:

```json
{
"signature": "HMAC-SHA256(signing_key, plan_key)",
"plan_key": "javaBin/moresleep/abc123/tfplan",
"approved_by": "alexanderamiri",
"reason": "Reviewed manually, safe to apply",
"approved_at": "2026-03-10T12:00:00Z"
}
```

- **Signing key**: Random 256-bit secret in SSM at `/javabin/platform/override-signing-key`
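- **HMAC is plan-scoped**: `HMAC(key, plan_key)` — a different plan yields a different signature, so a token cannot be reused for another plan. Only `plan_key` is covered by the signature; `approved_by`, `reason`, and `approved_at` are informational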
- **Only the Lambda** can read the signing key — no CI role has SSM access to it
- **Tokens expire** with the S3 lifecycle on the plan bucket

## Override Flow

1. Apply blocks on HIGH risk → Slack alert with plan key and override link
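2. Admin triggers `approve-override.yml` in the platform repo, supplying the `plan_key` from the Slack alert, the `repo`, the failed apply `run_id`, and a `reason`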
- Protected by GitHub environment `override-approval` (requires board member review)
3. Override workflow assumes `javabin-ci-override-approver` OIDC role
4. Invokes gate Lambda `sign` action → Lambda computes HMAC, writes `override.json` to S3
5. Workflow retriggers the failed apply run via `gh run rerun --failed`
6. Apply re-runs → gate Lambda finds `override.json` → verifies HMAC → issues credentials

## S3 Plan Artifact Layout

```
{repo}/{sha}/
tfplan # binary plan file
plan-output.txt # human-readable plan text
risk.json # {"level": "HIGH", "reviewed_at": "..."}
override.json # HMAC-signed override token (only if overridden)
```

## SSM Parameters

| Path | Type | Purpose |
|------|------|---------|
| `/javabin/platform/override-signing-key` | SecureString | HMAC signing key for override tokens |

## Setup

The signing key must be created once:

```bash
aws ssm put-parameter \
--name /javabin/platform/override-signing-key \
--type SecureString \
--value "$(openssl rand -hex 32)" \
--profile javabin --region eu-central-1
```

The `override-approval` GitHub environment must be created on the platform repo
with required reviewers set to board members.
81 changes: 81 additions & 0 deletions scripts/invoke-apply-gate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/bin/sh
# Invoke the apply-gate Lambda to get temporary credentials for terraform apply.
#
# Usage: invoke-apply-gate.sh <plan_key> <repo_name>
#
# On success: appends AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and
#             AWS_SESSION_TOKEN to $GITHUB_ENV so later workflow steps use them.
# On failure: exits 1 with reason
#
# Requires: aws CLI, jq, mktemp
# Optional environment:
#   LAMBDA_NAME        gate Lambda function name (default: javabin-apply-gate)
#   SSM_WEBHOOK_PARAM  when set, a Slack notification is sent if apply is blocked

set -e

PLAN_KEY="$1"
REPO_NAME="$2"
LAMBDA_NAME="${LAMBDA_NAME:-javabin-apply-gate}"

if [ -z "$PLAN_KEY" ] || [ -z "$REPO_NAME" ]; then
  echo "Usage: invoke-apply-gate.sh <plan_key> <repo_name>" >&2
  exit 1
fi

echo "Requesting apply credentials from gate Lambda..."

PAYLOAD=$(jq -n \
  --arg action "check" \
  --arg plan_key "$PLAN_KEY" \
  --arg repo_name "$REPO_NAME" \
  '{action: $action, plan_key: $plan_key, repo_name: $repo_name}')

# The Lambda payload goes to a private temp file rather than /dev/stdout:
# the CLI also prints its own invocation metadata JSON on stdout, and mixing
# the two documents makes every later jq query return one value per document.
# The file may hold credentials, so it is always removed on exit.
RESPONSE_FILE=$(mktemp)
ERROR_FILE=$(mktemp)
trap 'rm -f "$RESPONSE_FILE" "$ERROR_FILE"' EXIT

# Report CLI failures (auth, missing function, throttling) instead of letting
# `set -e` end the script with no output.
if ! aws lambda invoke \
  --function-name "$LAMBDA_NAME" \
  --payload "$PAYLOAD" \
  --cli-binary-format raw-in-base64-out \
  "$RESPONSE_FILE" >/dev/null 2>"$ERROR_FILE"; then
  echo "Failed to invoke gate Lambda '${LAMBDA_NAME}':" >&2
  cat "$ERROR_FILE" >&2
  exit 1
fi

RESPONSE=$(cat "$RESPONSE_FILE")

# Accept both a proxy-style response ({"statusCode": ..., "body": "<json>"})
# and a bare JSON object. printf is used instead of echo throughout because
# some /bin/sh implementations (e.g. dash) expand backslash escapes in echo,
# which would corrupt JSON string contents.
BODY=$(printf '%s\n' "$RESPONSE" | jq -r '.body // empty' 2>/dev/null) || BODY=""
if [ -z "$BODY" ]; then
  BODY="$RESPONSE"
fi

# The response is deliberately not printed here: it may contain credentials.
if ! printf '%s\n' "$BODY" | jq -e 'type == "object"' >/dev/null 2>&1; then
  echo "Gate Lambda returned an unexpected (non-JSON-object) response" >&2
  exit 1
fi

APPROVED=$(printf '%s\n' "$BODY" | jq -r '.approved // false')
RISK=$(printf '%s\n' "$BODY" | jq -r '.risk_level // "UNKNOWN"')
# Fall back to errorMessage so an unhandled function error is reported with
# its cause instead of a generic denial.
REASON=$(printf '%s\n' "$BODY" | jq -r '.reason // .errorMessage // empty')

echo "Risk level: $RISK"

if [ "$APPROVED" != "true" ]; then
  echo "Apply blocked: ${REASON:-risk gate denied}"

  # Notify Slack about the block (best effort — never masks the real failure)
  if [ -n "$SSM_WEBHOOK_PARAM" ]; then
    SCRIPT_DIR=$(dirname "$0")
    RUN_URL="https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
    sh "$SCRIPT_DIR/notify-slack.sh" \
      "Terraform Apply Blocked" \
      "*Repo:* ${GITHUB_REPOSITORY}\n*Risk:* ${RISK}\n*Reason:* ${REASON:-override required}\n<${RUN_URL}|View Run>" || true
  fi

  exit 1
fi

# Extract credentials for terraform; `// empty` turns a missing field into an
# empty string instead of the literal text "null".
ACCESS_KEY=$(printf '%s\n' "$BODY" | jq -r '.credentials.AccessKeyId // empty')
SECRET_KEY=$(printf '%s\n' "$BODY" | jq -r '.credentials.SecretAccessKey // empty')
SESSION_TOKEN=$(printf '%s\n' "$BODY" | jq -r '.credentials.SessionToken // empty')

if [ -z "$ACCESS_KEY" ] || [ -z "$SECRET_KEY" ] || [ -z "$SESSION_TOKEN" ]; then
  echo "Gate Lambda approved the apply but returned incomplete credentials" >&2
  exit 1
fi

# Mask credentials in logs before they are handed to the runner
printf '::add-mask::%s\n' "$ACCESS_KEY"
printf '::add-mask::%s\n' "$SECRET_KEY"
printf '::add-mask::%s\n' "$SESSION_TOKEN"

# Make the credentials available to subsequent workflow steps
{
  printf 'AWS_ACCESS_KEY_ID=%s\n' "$ACCESS_KEY"
  printf 'AWS_SECRET_ACCESS_KEY=%s\n' "$SECRET_KEY"
  printf 'AWS_SESSION_TOKEN=%s\n' "$SESSION_TOKEN"
} >> "$GITHUB_ENV"

OVERRIDE_BY=$(printf '%s\n' "$BODY" | jq -r '.override.approved_by // empty')
if [ -n "$OVERRIDE_BY" ]; then
  echo "Apply approved via override by ${OVERRIDE_BY}"
else
  echo "Apply approved (risk: ${RISK})"
fi
Loading