From 6ca616c2f0afdcacb73bff246c4376178145cd8e Mon Sep 17 00:00:00 2001 From: Alexander Amiri Date: Tue, 10 Mar 2026 00:31:16 +0100 Subject: [PATCH 1/2] Add apply-gate Lambda for credential-brokered Terraform apply MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New security model: tf-apply no longer has direct infrastructure IAM permissions. Instead it invokes a gate Lambda that: - Reads risk.json from S3 (uploaded by plan-review) - If LOW/MEDIUM: issues temp STS credentials via AssumeRole - If HIGH: verifies HMAC-signed override token before issuing credentials - Signing key stays in SSM, only the Lambda can read it New resources: - javabin-apply-gate Lambda (credential broker) - javabin-ci-apply-gate OIDC role (can only invoke Lambda + read S3) - Updated ci-app roles: trust gate Lambda for apply, OIDC for plan only - Updated override-approver: invokes Lambda sign action instead of SSM write Flow: plan-review uploads risk.json → tf-apply calls gate → gate checks risk + override → returns temp credentials → apply runs with those creds Override: admin triggers approve-override.yml (GitHub environment protected) → Lambda signs HMAC token → writes override.json to S3 → retriggers apply --- .github/workflows/approve-override.yml | 35 ++- .github/workflows/javabin.yml | 1 - .github/workflows/plan-review.yml | 16 +- .github/workflows/tf-apply.yml | 21 +- scripts/invoke-apply-gate.sh | 81 +++++++ terraform/lambda-src/apply_gate/handler.py | 247 +++++++++++++++++++++ terraform/platform/iam/main.tf | 88 +++++++- terraform/platform/lambdas/main.tf | 85 ++++++- 8 files changed, 542 insertions(+), 32 deletions(-) create mode 100644 scripts/invoke-apply-gate.sh create mode 100644 terraform/lambda-src/apply_gate/handler.py diff --git a/.github/workflows/approve-override.yml b/.github/workflows/approve-override.yml index bc216a2..f78e832 100644 --- a/.github/workflows/approve-override.yml +++ b/.github/workflows/approve-override.yml @@ 
-1,17 +1,21 @@ name: Approve Override -# workflow_dispatch — only board members can trigger this. -# IAM trust condition on javabin-ci-override-approver verifies the actor. +# Workflow dispatch — only runs after approval from the 'override-approval' +# GitHub environment (requires board member review). on: workflow_dispatch: inputs: + plan_key: + description: "S3 plan key (from the Slack alert)" + required: true + type: string repo: description: "Repository (e.g. javaBin/moresleep)" required: true type: string - sha: - description: "Commit SHA to override" + run_id: + description: "Failed apply run ID to retrigger" required: true type: string reason: @@ -22,6 +26,7 @@ on: permissions: id-token: write contents: read + actions: write env: AWS_ACCOUNT_ID: "553637109631" @@ -30,6 +35,7 @@ env: jobs: approve: runs-on: ubuntu-latest + environment: override-approval steps: - uses: actions/checkout@v4 with: @@ -41,10 +47,25 @@ jobs: role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT_ID }}:role/javabin-ci-override-approver aws-region: ${{ env.AWS_REGION }} - - name: Write override token - run: sh scripts/write-override-token.sh "${{ inputs.repo }}" "${{ inputs.sha }}" "${{ github.actor }}" "${{ inputs.reason }}" + - name: Sign override via gate Lambda + run: | + aws lambda invoke \ + --function-name javabin-apply-gate \ + --payload "$(jq -n \ + --arg action sign \ + --arg plan_key "${{ inputs.plan_key }}" \ + --arg approved_by "${{ github.actor }}" \ + --arg reason "${{ inputs.reason }}" \ + '{action: $action, plan_key: $plan_key, approved_by: $approved_by, reason: $reason}')" \ + --cli-binary-format raw-in-base64-out \ + /dev/stdout - name: Notify Slack env: SSM_WEBHOOK_PARAM: /javabin/slack/platform-override-alerts-webhook - run: sh scripts/notify-slack.sh "Risk Override Approved" "*Repo:* ${{ inputs.repo }}\n*SHA:* \`${{ inputs.sha }}\`\n*By:* ${{ github.actor }}\n*Reason:* ${{ inputs.reason }}" "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id 
}}" "View Approval Run" + run: sh scripts/notify-slack.sh "Risk Override Approved" "*Repo:* ${{ inputs.repo }}\n*Plan:* \`${{ inputs.plan_key }}\`\n*By:* ${{ github.actor }}\n*Reason:* ${{ inputs.reason }}" + + - name: Retrigger failed apply + env: + GH_TOKEN: ${{ github.token }} + run: gh run rerun ${{ inputs.run_id }} --repo ${{ inputs.repo }} --failed diff --git a/.github/workflows/javabin.yml b/.github/workflows/javabin.yml index 5d7d062..c6316a6 100644 --- a/.github/workflows/javabin.yml +++ b/.github/workflows/javabin.yml @@ -102,7 +102,6 @@ jobs: with: plan_key: ${{ needs.tf-plan.outputs.plan_key }} plan_sha256: ${{ needs.tf-plan.outputs.plan_sha256 }} - risk_level: ${{ needs.plan-review.outputs.risk_level || 'UNKNOWN' }} secrets: inherit # -------------------------------------------------------------------------- diff --git a/.github/workflows/plan-review.yml b/.github/workflows/plan-review.yml index f6ba776..29670bf 100644 --- a/.github/workflows/plan-review.yml +++ b/.github/workflows/plan-review.yml @@ -64,6 +64,12 @@ jobs: REVIEW_RESULT_PATH: review-result.json run: sh platform/scripts/extract-review-risk.sh platform/scripts/review-plan.py plan-output.txt + - name: Upload risk assessment to S3 + run: | + PLAN_DIR=$(dirname "${{ inputs.plan_key }}") + echo '{"level":"${{ steps.review.outputs.risk_level }}","reviewed_at":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' | \ + aws s3 cp - "s3://${PLAN_BUCKET}/${PLAN_DIR}/risk.json" --content-type application/json + - name: Post review to PR if: github.event_name == 'pull_request' env: @@ -73,4 +79,12 @@ jobs: - name: Alert Slack on HIGH risk if: github.event_name == 'push' && github.ref == 'refs/heads/main' && steps.review.outputs.risk_level == 'HIGH' - run: sh platform/scripts/notify-high-risk.sh /javabin/slack/platform-override-alerts-webhook "https://github.com/javaBin/platform/actions/workflows/approve-override.yml" + env: + SSM_WEBHOOK_PARAM: /javabin/slack/platform-override-alerts-webhook + PLAN_KEY: ${{ 
inputs.plan_key }} + run: | + OVERRIDE_URL="https://github.com/javaBin/platform/actions/workflows/approve-override.yml" + RUN_URL="https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" + sh platform/scripts/notify-slack.sh \ + "Deploy Blocked — HIGH Risk Plan" \ + "*Repo:* ${GITHUB_REPOSITORY}\n*Plan:* \`${PLAN_KEY}\`\n<${OVERRIDE_URL}|Approve Override> | <${RUN_URL}|View Run>" diff --git a/.github/workflows/tf-apply.yml b/.github/workflows/tf-apply.yml index 14b27f9..bced35c 100644 --- a/.github/workflows/tf-apply.yml +++ b/.github/workflows/tf-apply.yml @@ -11,10 +11,6 @@ on: description: "SHA256 hash of the plan artifact" type: string required: true - risk_level: - description: "Risk level from LLM review" - type: string - required: true aws_account_id: description: "AWS account ID" type: string @@ -35,7 +31,6 @@ permissions: jobs: apply: runs-on: ubuntu-latest - environment: production env: PLAN_BUCKET: javabin-ci-plan-artifacts-${{ inputs.aws_account_id }} steps: @@ -62,15 +57,14 @@ jobs: path: .platform sparse-checkout: scripts - - name: Configure AWS credentials via OIDC + # Step 1: Lightweight OIDC role — can only invoke gate Lambda + read S3 + - name: Configure gate credentials via OIDC uses: aws-actions/configure-aws-credentials@v4 with: - role-to-assume: arn:aws:iam::${{ inputs.aws_account_id }}:role/javabin-ci-app-${{ github.event.repository.name }} + role-to-assume: arn:aws:iam::${{ inputs.aws_account_id }}:role/javabin-ci-apply-gate aws-region: ${{ inputs.aws_region }} - - name: Check risk level - run: sh .platform/scripts/check-risk-gate.sh "${{ inputs.risk_level }}" "${{ github.repository }}" "${{ github.sha }}" /javabin/slack/platform-override-alerts-webhook - + # Step 2: Download and verify plan - name: Download plan from S3 working-directory: ${{ inputs.tf_root }} run: aws s3 cp "s3://${PLAN_BUCKET}/${{ inputs.plan_key }}" tfplan @@ -79,6 +73,13 @@ jobs: working-directory: ${{ inputs.tf_root }} run: sh "${{ github.workspace 
}}/.platform/scripts/verify-plan.sh" tfplan "${{ inputs.plan_sha256 }}" + # Step 3: Invoke gate Lambda — checks risk, verifies override if needed, returns temp credentials + - name: Request apply credentials from gate + env: + SSM_WEBHOOK_PARAM: /javabin/slack/platform-override-alerts-webhook + run: sh .platform/scripts/invoke-apply-gate.sh "${{ inputs.plan_key }}" "${{ github.event.repository.name }}" + + # Step 4: Apply with credentials from the gate Lambda - name: Terraform Init working-directory: ${{ inputs.tf_root }} run: terraform init -input=false
diff --git a/scripts/invoke-apply-gate.sh b/scripts/invoke-apply-gate.sh
new file mode 100644
index 0000000..6d2a2de
--- /dev/null
+++ b/scripts/invoke-apply-gate.sh
@@ -0,0 +1,113 @@
+#!/bin/sh
+# Invoke the apply-gate Lambda to get temporary credentials for terraform apply.
+#
+# Usage: invoke-apply-gate.sh <plan-key> <repo-name>
+#
+# On success: appends AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and
+#             AWS_SESSION_TOKEN to $GITHUB_ENV for the steps that follow
+# On failure: exits 1 with reason
+#
+# Requires: aws CLI, jq
+# Optional env: LAMBDA_NAME (default: javabin-apply-gate),
+#               SSM_WEBHOOK_PARAM (enables the Slack notification on a block)
+
+set -e
+
+PLAN_KEY="$1"
+REPO_NAME="$2"
+LAMBDA_NAME="${LAMBDA_NAME:-javabin-apply-gate}"
+
+if [ -z "$PLAN_KEY" ] || [ -z "$REPO_NAME" ]; then
+  echo "Usage: invoke-apply-gate.sh <plan-key> <repo-name>"
+  exit 1
+fi
+
+echo "Requesting apply credentials from gate Lambda..."
+
+PAYLOAD=$(jq -n \
+  --arg action "check" \
+  --arg plan_key "$PLAN_KEY" \
+  --arg repo_name "$REPO_NAME" \
+  '{action: $action, plan_key: $plan_key, repo_name: $repo_name}')
+
+# Write the function response to a private temp file instead of /dev/stdout.
+# That keeps the payload separate from the CLI's own invocation summary,
+# which `aws lambda invoke` prints to stdout.
+RESPONSE_FILE=$(mktemp)
+trap 'rm -f "$RESPONSE_FILE"' EXIT
+
+# stderr is left visible so an IAM or network failure can be diagnosed.
+if ! aws lambda invoke \
+  --function-name "$LAMBDA_NAME" \
+  --payload "$PAYLOAD" \
+  --cli-binary-format raw-in-base64-out \
+  "$RESPONSE_FILE" >/dev/null; then
+  echo "Apply blocked: could not invoke ${LAMBDA_NAME}"
+  exit 1
+fi
+
+# The handler wraps its result as {"statusCode": ..., "body": "<JSON string>"}.
+# Fall back to the raw payload when there is no body (e.g. an unhandled
+# function error, which Lambda reports as {"errorMessage": ...}).
+BODY=$(jq -r '.body // empty' "$RESPONSE_FILE" 2>/dev/null || true)
+if [ -z "$BODY" ]; then
+  BODY=$(cat "$RESPONSE_FILE")
+fi
+
+# Extract one field from BODY. printf rather than echo: some sh implementations
+# (e.g. dash) expand backslash escapes in echo, which would corrupt the JSON.
+# A parse failure yields an empty string, which the checks below treat as a block.
+json_field() {
+  printf '%s' "$BODY" | jq -r "$1" 2>/dev/null || true
+}
+
+APPROVED=$(json_field '.approved // false')
+RISK=$(json_field '.risk_level // "UNKNOWN"')
+# Handler errors arrive as .error, unhandled function errors as .errorMessage.
+REASON=$(json_field '.reason // .error // .errorMessage // empty')
+
+echo "Risk level: ${RISK:-UNKNOWN}"
+
+if [ "$APPROVED" != "true" ]; then
+  echo "Apply blocked: ${REASON:-risk gate denied}"
+
+  # Notify Slack about the block
+  if [ -n "$SSM_WEBHOOK_PARAM" ]; then
+    SCRIPT_DIR=$(dirname "$0")
+    RUN_URL="https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
+    sh "$SCRIPT_DIR/notify-slack.sh" \
+      "Terraform Apply Blocked" \
+      "*Repo:* ${GITHUB_REPOSITORY}\n*Risk:* ${RISK:-UNKNOWN}\n*Reason:* ${REASON:-override required}\n<${RUN_URL}|View Run>" || true
+  fi
+
+  exit 1
+fi
+
+ACCESS_KEY=$(json_field '.credentials.AccessKeyId // empty')
+SECRET_KEY=$(json_field '.credentials.SecretAccessKey // empty')
+SESSION_TOKEN=$(json_field '.credentials.SessionToken // empty')
+
+# A missing field comes back empty; refuse to export blank credentials.
+if [ -z "$ACCESS_KEY" ] || [ -z "$SECRET_KEY" ] || [ -z "$SESSION_TOKEN" ]; then
+  echo "Apply blocked: gate approved the plan but returned no credentials"
+  exit 1
+fi
+
+# Mask credentials in logs before they are used anywhere else
+echo "::add-mask::${ACCESS_KEY}"
+echo "::add-mask::${SECRET_KEY}"
+echo "::add-mask::${SESSION_TOKEN}"
+
+# Export credentials for terraform
+{
+  printf 'AWS_ACCESS_KEY_ID=%s\n' "$ACCESS_KEY"
+  printf 'AWS_SECRET_ACCESS_KEY=%s\n' "$SECRET_KEY"
+  printf 'AWS_SESSION_TOKEN=%s\n' "$SESSION_TOKEN"
+} >> "$GITHUB_ENV"
+
+OVERRIDE_BY=$(json_field '.override.approved_by // empty')
+if [ -n "$OVERRIDE_BY" ]; then
+  echo "Apply approved via override by ${OVERRIDE_BY}"
+else
+  echo "Apply approved (risk: ${RISK})"
+fi
diff --git a/terraform/lambda-src/apply_gate/handler.py b/terraform/lambda-src/apply_gate/handler.py new file
mode 100644
index 0000000..cb63054
--- /dev/null
+++ b/terraform/lambda-src/apply_gate/handler.py
@@ -0,0 +1,311 @@
+"""Apply gate — credential broker for Terraform apply.
+
+Actions:
+  check  — Verify risk level and override status, return temp credentials if approved
+  sign   — Create HMAC-signed override token for a HIGH-risk plan
+  status — Return current risk and override state for a plan
+
+The signing key lives in SSM. Only this Lambda can read it. CI roles invoke
+the Lambda but never see the key. Temp credentials are issued via STS
+AssumeRole on the app's CI role.
+
+NOTE(review): the action is taken from the caller's payload and nothing here
+checks who the caller is, so any principal allowed to invoke this function can
+request any action, including "sign". Confirm that every role granted
+lambda:InvokeFunction on it is meant to be able to sign overrides, or move
+signing behind a separate function or alias.
+"""
+
+import hashlib
+import hmac
+import json
+import logging
+import os
+import re
+import time
+
+import boto3
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+ssm = boto3.client("ssm")
+s3 = boto3.client("s3")
+sts = boto3.client("sts")
+
+SIGNING_KEY_PARAM = os.environ.get(
+    "SIGNING_KEY_PARAM", "/javabin/platform/override-signing-key"
+)
+PLAN_BUCKET = os.environ.get("PLAN_BUCKET", "")
+PROJECT = os.environ.get("PROJECT", "javabin")
+CREDENTIAL_DURATION = int(os.environ.get("CREDENTIAL_DURATION", "900"))  # 15 min
+
+# repo_name is interpolated into a role ARN and a role session name, so it is
+# limited to letters, digits, ".", "_" and "-".
+_REPO_NAME_RE = re.compile(r"[A-Za-z0-9._-]+")
+
+# Cache signing key across invocations
+_key_cache = {}
+
+
+def _get_signing_key():
+    """Read the HMAC signing key from SSM (cached)."""
+    if "key" not in _key_cache:
+        resp = ssm.get_parameter(Name=SIGNING_KEY_PARAM, WithDecryption=True)
+        _key_cache["key"] = resp["Parameter"]["Value"].encode("utf-8")
+    return _key_cache["key"]
+
+
+def _compute_hmac(plan_key):
+    """Compute HMAC-SHA256 over the plan key and return the hex digest."""
+    key = _get_signing_key()
+    return hmac.new(key, plan_key.encode("utf-8"), hashlib.sha256).hexdigest()
+
+
+def _signature_is_valid(plan_key, override):
+    """Return True if the override token carries the expected HMAC for plan_key.
+
+    hmac.compare_digest raises TypeError unless both arguments are ASCII-only
+    str (or both bytes-like), so a signature that is missing, null, numeric or
+    non-ASCII is rejected here instead of failing the whole request.
+    """
+    actual = override.get("signature")
+    if not isinstance(actual, str) or not actual.isascii():
+        return False
+    return hmac.compare_digest(_compute_hmac(plan_key), actual)
+
+
+def _required_str(event, name):
+    """Return event[name]; raise ValueError unless it is a non-empty string."""
+    value = event.get(name)
+    if not isinstance(value, str) or not value:
+        raise ValueError(f"'{name}' is required and must be a non-empty string")
+    return value
+
+
+def _read_s3_json(key):
+    """Read a JSON object from the plan bucket.
+
+    Returns the parsed dict, or None if the object is not found, cannot be
+    read or parsed, or is not a JSON object. Callers treat None as "absent",
+    which makes the gate deny rather than approve.
+    """
+    try:
+        resp = s3.get_object(Bucket=PLAN_BUCKET, Key=key)
+        data = json.loads(resp["Body"].read())
+    except s3.exceptions.NoSuchKey:
+        return None
+    except Exception as e:
+        logger.error("Failed to read s3://%s/%s: %s", PLAN_BUCKET, key, e)
+        return None
+
+    # Callers use .get() on the result, so anything but an object is unusable.
+    if not isinstance(data, dict):
+        logger.error("s3://%s/%s is not a JSON object", PLAN_BUCKET, key)
+        return None
+    return data
+
+
+def _write_s3_json(key, data):
+    """Write a JSON file to the plan bucket."""
+    s3.put_object(
+        Bucket=PLAN_BUCKET,
+        Key=key,
+        Body=json.dumps(data),
+        ContentType="application/json",
+    )
+
+
+def _plan_prefix(plan_key):
+    """Get the directory prefix from a plan key (e.g. 'repo/sha/' from 'repo/sha/tfplan')."""
+    parts = plan_key.rsplit("/", 1)
+    return parts[0] + "/" if len(parts) > 1 else ""
+
+
+def action_check(event):
+    """Check risk + override, return temp credentials if approved.
+
+    Input:  {plan_key, repo_name}
+    Output: {approved, risk_level, credentials?, reason?}
+    Raises: ValueError if plan_key or repo_name is missing or malformed.
+
+    NOTE(review): nothing here ties repo_name to plan_key, so a caller can pair
+    one repo's approved plan with another repo's role name. Consider checking
+    that the plan key belongs to repo_name once the key layout is confirmed.
+    """
+    plan_key = _required_str(event, "plan_key")
+    repo_name = _required_str(event, "repo_name")
+    if not _REPO_NAME_RE.fullmatch(repo_name):
+        raise ValueError("'repo_name' may only contain letters, digits, '.', '_' and '-'")
+    prefix = _plan_prefix(plan_key)
+
+    # Read risk assessment
+    risk = _read_s3_json(f"{prefix}risk.json")
+    if not risk:
+        return {
+            "approved": False,
+            "reason": "No risk assessment found for this plan",
+        }
+
+    risk_level = risk.get("level", "UNKNOWN")
+    logger.info("Plan %s risk: %s", plan_key, risk_level)
+
+    # LOW/MEDIUM — auto-approve
+    if risk_level in ("LOW", "MEDIUM"):
+        credentials = _issue_credentials(repo_name)
+        return {
+            "approved": True,
+            "risk_level": risk_level,
+            "credentials": credentials,
+        }
+
+    # Anything else (HIGH, FAILED, UNKNOWN, ...) — check for override
+    override = _read_s3_json(f"{prefix}override.json")
+    if not override:
+        return {
+            "approved": False,
+            "risk_level": risk_level,
+            "reason": f"Risk is {risk_level} — override required",
+        }
+
+    # Verify HMAC signature
+    if not _signature_is_valid(plan_key, override):
+        logger.warning("Invalid override signature for %s", plan_key)
+        return {
+            "approved": False,
+            "risk_level": risk_level,
+            "reason": "Override signature is invalid",
+        }
+
+    logger.info(
+        "Override verified for %s (approved by %s)",
+        plan_key, override.get("approved_by"),
+    )
+
+    credentials = _issue_credentials(repo_name)
+    return {
+        "approved": True,
+        "risk_level": risk_level,
+        "override": {
+            "approved_by": override.get("approved_by"),
+            "reason": override.get("reason"),
+        },
+        "credentials": credentials,
+    }
+
+
+def action_sign(event):
+    """Create a signed override token and write it to S3.
+
+    Input:  {plan_key, approved_by, reason}
+    Output: {signed: true, plan_key}
+    Raises: ValueError if plan_key or approved_by is missing.
+    """
+    plan_key = _required_str(event, "plan_key")
+    approved_by = _required_str(event, "approved_by")
+    reason = event.get("reason", "")
+    prefix = _plan_prefix(plan_key)
+
+    # The signature covers plan_key only; the other fields are not authenticated.
+    signature = _compute_hmac(plan_key)
+
+    override = {
+        "signature": signature,
+        "plan_key": plan_key,
+        "approved_by": approved_by,
+        "reason": reason,
+        "approved_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+    }
+
+    _write_s3_json(f"{prefix}override.json", override)
+    logger.info("Override signed for %s by %s", plan_key, approved_by)
+
+    return {"signed": True, "plan_key": plan_key}
+
+
+def action_status(event):
+    """Return current risk and override state for a plan.
+
+    Input:  {plan_key}
+    Output: {risk_level, has_override, override_valid}
+    Raises: ValueError if plan_key is missing.
+    """
+    plan_key = _required_str(event, "plan_key")
+    prefix = _plan_prefix(plan_key)
+
+    risk = _read_s3_json(f"{prefix}risk.json")
+    risk_level = risk.get("level", "UNKNOWN") if risk else "NOT_FOUND"
+
+    override = _read_s3_json(f"{prefix}override.json")
+    has_override = override is not None
+    override_valid = has_override and _signature_is_valid(plan_key, override)
+
+    return {
+        "risk_level": risk_level,
+        "has_override": has_override,
+        "override_valid": override_valid,
+    }
+
+
+def _issue_credentials(repo_name):
+    """Assume the app's CI role and return temporary credentials.
+
+    Returns a dict with AccessKeyId, SecretAccessKey and SessionToken, requested
+    for CREDENTIAL_DURATION seconds. STS errors propagate to the caller.
+    """
+    account_id = os.environ.get("ACCOUNT_ID", "")
+    role_arn = f"arn:aws:iam::{account_id}:role/{PROJECT}-ci-app-{repo_name}"
+
+    resp = sts.assume_role(
+        RoleArn=role_arn,
+        RoleSessionName=f"apply-gate-{repo_name}",
+        DurationSeconds=CREDENTIAL_DURATION,
+    )
+
+    creds = resp["Credentials"]
+    return {
+        "AccessKeyId": creds["AccessKeyId"],
+        "SecretAccessKey": creds["SecretAccessKey"],
+        "SessionToken": creds["SessionToken"],
+    }
+
+
+def handler(event, context):
+    """Lambda entry point: dispatch on event["action"] (default "check").
+
+    Always returns {"statusCode", "body"} with a JSON-encoded body: 200 with
+    the action's result, 400 with {"error"} for a malformed request, 500 with
+    {"error"} for anything unexpected.
+    """
+    logger.info("Apply gate invoked: %s", json.dumps(event, default=str)[:500])
+
+    try:
+        if not isinstance(event, dict):
+            raise ValueError("event must be a JSON object")
+
+        action = event.get("action", "check")
+
+        if action == "check":
+            result = action_check(event)
+        elif action == "sign":
+            result = action_sign(event)
+        elif action == "status":
+            result = action_status(event)
+        else:
+            result = {"error": f"Unknown action: {action}"}
+
+        return {"statusCode": 200, "body": json.dumps(result, default=str)}
+
+    except ValueError as e:
+        logger.warning("Apply gate rejected request: %s", e)
+        return {
+            "statusCode": 400,
+            "body": json.dumps({"error": str(e)}),
+        }
+
+    except Exception as e:
+        logger.error("Apply gate error: %s", e, exc_info=True)
+        return {
+            "statusCode": 500,
+            "body": json.dumps({"error": str(e)}),
+        }
diff --git a/terraform/platform/iam/main.tf b/terraform/platform/iam/main.tf index 96f377a..88f2337 100644 --- a/terraform/platform/iam/main.tf +++ b/terraform/platform/iam/main.tf @@ -158,6 +158,7
@@ resource "aws_iam_role" "ci_app" { Version = "2012-10-17" Statement = [ { + Sid = "AllowPlanAndReviewViaOIDC" Effect = "Allow" Principal = { Federated = data.aws_iam_openid_connect_provider.github.arn @@ -169,15 +170,22 @@ resource "aws_iam_role" "ci_app" { } StringLike = { "token.actions.githubusercontent.com:sub" = "repo:${var.github_org}/${each.key}:*" - # Terraform operations: plan (read), review (read), apply (write) + # Plan and review only — apply goes through the gate Lambda "token.actions.githubusercontent.com:job_workflow_ref" = [ "${var.github_org}/platform/.github/workflows/tf-plan.yml@refs/heads/main", "${var.github_org}/platform/.github/workflows/plan-review.yml@refs/heads/main", - "${var.github_org}/platform/.github/workflows/tf-apply.yml@refs/heads/main", ] } } - } + }, + { + Sid = "AllowApplyViaGateLambda" + Effect = "Allow" + Principal = { + AWS = "arn:aws:iam::${var.aws_account_id}:role/${var.project}-apply-gate" + } + Action = "sts:AssumeRole" + }, ] }) @@ -509,18 +517,16 @@ resource "aws_iam_role" "ci_override_approver" { } } -resource "aws_iam_role_policy" "ci_override_approver_ssm" { - name = "ssm-put-overrides" +resource "aws_iam_role_policy" "ci_override_approver" { + name = "invoke-apply-gate" role = aws_iam_role.ci_override_approver.id policy = jsonencode({ Version = "2012-10-17" Statement = [{ - Effect = "Allow" - Action = [ - "ssm:PutParameter", - ] - Resource = "arn:aws:ssm:${var.region}:${var.aws_account_id}:parameter/${var.project}/platform-overrides/*" + Effect = "Allow" + Action = "lambda:InvokeFunction" + Resource = "arn:aws:lambda:${var.region}:${var.aws_account_id}:function:${var.project}-apply-gate" }] }) } @@ -578,6 +584,68 @@ resource "aws_iam_role_policy" "ci_registry_lambda" { }) } +################################################################################ +# 6. javabin-ci-apply-gate — Lightweight role for tf-apply +# +# Trust: GitHub OIDC pinned to tf-apply.yml workflow on main. 
+# Permissions: ONLY invoke the apply-gate Lambda + read plan artifacts from S3. +# The actual infrastructure permissions come from the gate Lambda (via STS). +################################################################################ + +resource "aws_iam_role" "ci_apply_gate" { + name = "${var.project}-ci-apply-gate" + permissions_boundary = aws_iam_policy.developer_boundary.arn + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = data.aws_iam_openid_connect_provider.github.arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "token.actions.githubusercontent.com:aud" = "sts.amazonaws.com" + } + StringLike = { + "token.actions.githubusercontent.com:sub" = "repo:${var.github_org}/*:*" + "token.actions.githubusercontent.com:job_workflow_ref" = "${var.github_org}/platform/.github/workflows/tf-apply.yml@refs/heads/main" + } + } + } + ] + }) + + tags = { + Name = "${var.project}-ci-apply-gate" + } +} + +resource "aws_iam_role_policy" "ci_apply_gate" { + name = "invoke-gate-and-read-plans" + role = aws_iam_role.ci_apply_gate.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "InvokeGateLambda" + Effect = "Allow" + Action = "lambda:InvokeFunction" + Resource = "arn:aws:lambda:${var.region}:${var.aws_account_id}:function:${var.project}-apply-gate" + }, + { + Sid = "ReadPlanArtifacts" + Effect = "Allow" + Action = "s3:GetObject" + Resource = "arn:aws:s3:::${var.project}-ci-plan-artifacts-${var.aws_account_id}/*" + }, + ] + }) +} + ################################################################################ # ECS Execution Role — pulls images, writes logs, reads secrets # diff --git a/terraform/platform/lambdas/main.tf b/terraform/platform/lambdas/main.tf index c899d04..7be9abc 100644 --- a/terraform/platform/lambdas/main.tf +++ b/terraform/platform/lambdas/main.tf @@ -698,6 +698,85 @@ resource "aws_lambda_permission" 
"override_cleanup_schedule" { source_arn = aws_cloudwatch_event_rule.override_cleanup_schedule.arn } -# --- team-provisioner: no schedule — triggered via function URL or SNS (future) --- -# For now, the stub has no trigger. It will be connected to a GitHub webhook -# or repository dispatch event when the registry repo is set up. +# --- team-provisioner: no schedule — triggered via direct Lambda invocation from registry CI --- + +################################################################################ +# Apply Gate — credential broker for Terraform apply +################################################################################ + +data "archive_file" "apply_gate" { + type = "zip" + output_path = "${path.module}/builds/apply_gate.zip" + source_dir = "${local.lambda_src_path}/apply_gate" +} + +resource "aws_iam_role" "apply_gate" { + name = "${var.project}-apply-gate" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { Service = "lambda.amazonaws.com" } + Action = "sts:AssumeRole" + }] + }) +} + +resource "aws_iam_role_policy" "apply_gate" { + name = "${var.project}-apply-gate" + role = aws_iam_role.apply_gate.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "ReadSigningKey" + Effect = "Allow" + Action = "ssm:GetParameter" + Resource = "arn:aws:ssm:${var.region}:${var.aws_account_id}:parameter/${var.project}/platform/override-signing-key" + }, + { + Sid = "PlanBucketAccess" + Effect = "Allow" + Action = [ + "s3:GetObject", + "s3:PutObject", + ] + Resource = "arn:aws:s3:::${var.project}-ci-plan-artifacts-${var.aws_account_id}/*" + }, + { + Sid = "AssumeAppRoles" + Effect = "Allow" + Action = "sts:AssumeRole" + Resource = "arn:aws:iam::${var.aws_account_id}:role/${var.project}-ci-app-*" + }, + ] + }) +} + +resource "aws_iam_role_policy_attachment" "apply_gate_logs" { + role = aws_iam_role.apply_gate.name + policy_arn = 
"arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" +} + +resource "aws_lambda_function" "apply_gate" { + function_name = "${var.project}-apply-gate" + role = aws_iam_role.apply_gate.arn + handler = "handler.handler" + runtime = "python3.12" + timeout = 30 + memory_size = 128 + filename = data.archive_file.apply_gate.output_path + source_code_hash = data.archive_file.apply_gate.output_base64sha256 + + environment { + variables = { + SIGNING_KEY_PARAM = "/${var.project}/platform/override-signing-key" + PLAN_BUCKET = "${var.project}-ci-plan-artifacts-${var.aws_account_id}" + PROJECT = var.project + ACCOUNT_ID = var.aws_account_id + CREDENTIAL_DURATION = "900" + } + } +} From d47a173aa432d25e265511292a873ffebbc333f7 Mon Sep 17 00:00:00 2001 From: Alexander Amiri Date: Tue, 10 Mar 2026 00:34:01 +0100 Subject: [PATCH 2/2] Add apply-gate documentation and update CLAUDE.md --- CLAUDE.md | 5 +- docs/apply-gate.md | 113 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 docs/apply-gate.md diff --git a/CLAUDE.md b/CLAUDE.md index 202d620..911c3ae 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -101,6 +101,7 @@ Migration happens later per-app at developer's pace — apps move from old ALB/E | `docs/bootstrap-runbook.md` | State backend bootstrap procedure | | `docs/org-runbook.md` | AWS Organizations setup procedure | | `docs/cognito-google-setup.md` | Cognito + Google Workspace IdP setup | +| `docs/apply-gate.md` | Apply gate: credential broker, HMAC overrides, security model | ### Terraform — Platform (CI-applied) ``` @@ -115,7 +116,7 @@ terraform/platform/ iam/ GitHub OIDC, CI roles, permission boundary compute/ ECS cluster, ECR base config monitoring/ SNS, EventBridge, Config, GuardDuty, Security Hub - lambdas/ slack-alert, cost-report, daily-cost-check, compliance-reporter, override-cleanup, team-provisioner + lambdas/ slack-alert, cost-report, daily-cost-check, compliance-reporter, override-cleanup, 
team-provisioner, apply-gate identity/ Cognito user pools (internal + external). Identity Center is in terraform/org/ ``` @@ -192,7 +193,7 @@ terraform/state/ | `scripts/provision-teams.py` | CI: fetch team YAMLs from registry, invoke team-provisioner Lambda | | `scripts/review-plan.py` | CI: LLM plan review via Bedrock | | `scripts/notify-slack.py` | CI: generic Slack webhook notification | -| `scripts/check-risk-gate.sh` | CI: check risk level, consume override token or block | +| `scripts/invoke-apply-gate.sh` | CI: invoke gate Lambda for apply credentials | | `scripts/run-plan.sh` | CI: terraform plan with exit code handling | | `scripts/upload-plan.sh` | CI: upload plan artifact to S3 | diff --git a/docs/apply-gate.md b/docs/apply-gate.md new file mode 100644 index 0000000..5c0996f --- /dev/null +++ b/docs/apply-gate.md @@ -0,0 +1,113 @@ +# Apply Gate — Credential-Brokered Terraform Apply + +## Overview + +Terraform apply does NOT have direct IAM permissions to modify infrastructure. +Instead, a Lambda (`javabin-apply-gate`) acts as a credential broker: it verifies +the risk assessment and override status before issuing short-lived STS credentials. + +## Why + +If tf-apply had direct IAM permissions, the risk gate would be a shell `if` +statement — the credentials would already be available. A compromised or +modified workflow step could skip the check. The gate Lambda ensures credentials +are only issued after verification. 
+ +## Flow + +``` +tf-plan → uploads plan + plan-output.txt to S3 +plan-review → LLM reviews plan → uploads risk.json to S3 +tf-apply → assumes javabin-ci-apply-gate (lightweight OIDC role) + → downloads plan from S3, verifies SHA256 + → invokes gate Lambda: check(plan_key, repo_name) + → Lambda reads risk.json from S3 + → LOW/MEDIUM: assumes app CI role via STS, returns temp credentials + → HIGH: checks for override.json, verifies HMAC signature + → valid override: issues credentials + → no override: returns approved=false (no credentials), apply fails + → tf-apply uses temp credentials for terraform init + apply +``` + +## IAM Roles + +| Role | Purpose | Permissions | +|------|---------|-------------| +| `javabin-ci-apply-gate` | OIDC role for tf-apply workflow | `lambda:InvokeFunction` on gate Lambda + `s3:GetObject` on plan bucket | +| `javabin-apply-gate` | Lambda execution role | `ssm:GetParameter` (signing key) + `s3:Get/PutObject` (plan bucket) + `sts:AssumeRole` (app CI roles) | +| `javabin-ci-app-{repo}` | App infrastructure role | Trusted by OIDC (plan/review) AND by gate Lambda role (apply) | + +## Credential Lifetime + +- Temp credentials issued by the gate Lambda have a **15-minute TTL** (`CREDENTIAL_DURATION=900`) +- This is enough for `terraform init + apply` on most plans +- Credentials are scoped to the specific app's CI role (`javabin-ci-app-{repo}`) + +## Risk Levels + +| Level | Action | +|-------|--------| +| LOW | Auto-approve, credentials issued | +| MEDIUM | Auto-approve, credentials issued | +| HIGH | Blocked until override token exists and HMAC is valid | +| FAILED | Blocked (review script error) until override token exists and HMAC is valid | + +## Override Tokens + +An override token is an HMAC-signed JSON file stored in S3 alongside the plan: + +```json +{ + "signature": "HMAC-SHA256(signing_key, plan_key)", + "plan_key": "javaBin/moresleep/abc123/tfplan", + "approved_by": "alexanderamiri", + "reason": "Reviewed manually, safe to apply", + "approved_at": "2026-03-10T12:00:00Z" +} +``` + +- **Signing key**: 
Random 256-bit secret in SSM at `/javabin/platform/override-signing-key` +- **HMAC is plan-scoped**: `HMAC(key, plan_key)` — different plan = different signature, can't reuse +- **Only the Lambda** can read the signing key — no CI role has SSM access to it +- **Tokens expire** with the S3 lifecycle on the plan bucket + +## Override Flow + +1. Apply blocks on HIGH risk → Slack alert with plan key and override link +2. Admin triggers `approve-override.yml` in the platform repo + - Protected by GitHub environment `override-approval` (requires board member review) +3. Override workflow assumes `javabin-ci-override-approver` OIDC role +4. Invokes gate Lambda `sign` action → Lambda computes HMAC, writes `override.json` to S3 +5. Workflow retriggers the failed apply run via `gh run rerun --failed` +6. Apply re-runs → gate Lambda finds `override.json` → verifies HMAC → issues credentials + +## S3 Plan Artifact Layout + +``` +{repo}/{sha}/ + tfplan # binary plan file + plan-output.txt # human-readable plan text + risk.json # {"level": "HIGH", "reviewed_at": "..."} + override.json # HMAC-signed override token (only if overridden) +``` + +## SSM Parameters + +| Path | Type | Purpose | +|------|------|---------| +| `/javabin/platform/override-signing-key` | SecureString | HMAC signing key for override tokens | + +## Setup + +The signing key must be created once: + +```bash +aws ssm put-parameter \ + --name /javabin/platform/override-signing-key \ + --type SecureString \ + --value "$(openssl rand -hex 32)" \ + --profile javabin --region eu-central-1 +``` + +The `override-approval` GitHub environment must be created on the platform repo +with required reviewers set to board members.