From 3b68a5dd7f49145cd9c987e55f035e68a0d29ff5 Mon Sep 17 00:00:00 2001 From: Alexander Amiri Date: Tue, 17 Mar 2026 20:11:50 +0100 Subject: [PATCH 1/2] Tags as primitive, alert enrichment, budget enforcement, RDS module (#77) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foundation changes: - Tag schema: drop project tag, add repo tag. 5 static tags (team, service, repo, environment, managed-by) applied via provider default_tags. - Resource tagger Lambda: EventBridge-triggered (wildcard prefix match), auto-tags created-by + commit from CloudTrail session names. Tags added via AWS API outside Terraform — no drift or plan noise. - Cost allocation tags activated for all 7 tag keys. - ECS tag propagation: enable_ecs_managed_tags + propagate_tags = SERVICE so Fargate task-level compute costs are attributed to teams. Alert enrichment: - CI session names changed to {actor}-{sha8}-{run_id} in all 4 workflows. - slack_alert parse_identity() updated to extract actor/commit from new format. - Cost reports (daily + weekly) now include per-team tag breakdown. Budget enforcement: - New budget_enforcer Lambda: scales ECS services to desired_count=0 when team exceeds 200% of budget. Triggered via SNS from AWS Budgets. - team_provisioner sync_budget adds 200% notification alongside existing 80%. RDS module: - New service-rds module: PostgreSQL with Secrets Manager password, private subnet placement, security group scoped to ECS tasks SG. - Registry + expand-modules updated with engine-based routing (postgres vs dynamodb). IAM restructure: - Team deny policy: ABAC (ARN-scoped) where AWS supports tags (SNS, S3, ECS, ELB). Explicit denies only where AWS lacks tag conditions (EC2 VPC, GuardDuty, SecurityHub, Config, CloudTrail, Organizations, IAM users). - Service-role module: configurable trusted_services (ECS/EC2/Lambda). - EventBridge monitoring rules: documented volume rationale for curated lists. 
--- .github/workflows/docker-build.yml | 4 + .github/workflows/ecs-deploy.yml | 4 + .github/workflows/platform-ci.yml | 15 +- .github/workflows/tf-plan.yml | 4 + docs/app-yaml-reference.md | 21 +- scripts/ensure-tf-boilerplate.sh | 8 +- scripts/expand-modules.py | 31 +- scripts/registry.py | 43 ++- .../lambda-src/budget_enforcer/handler.py | 196 +++++++++++ terraform/lambda-src/cost_report/handler.py | 17 +- .../lambda-src/daily_cost_check/handler.py | 12 + .../lambda-src/resource_tagger/handler.py | 312 ++++++++++++++++++ terraform/lambda-src/slack_alert/handler.py | 24 +- .../lambda-src/team_provisioner/handler.py | 39 ++- terraform/modules/ecs-service/main.tf | 3 + terraform/modules/service-rds/main.tf | 91 +++++ terraform/modules/service-rds/outputs.tf | 24 ++ terraform/modules/service-rds/variables.tf | 61 ++++ terraform/modules/service-role/main.tf | 2 +- terraform/modules/service-role/variables.tf | 6 + terraform/org/providers.tf | 5 +- terraform/platform/iam/main.tf | 110 ++++-- terraform/platform/lambdas/main.tf | 282 +++++++++++++++- terraform/platform/lambdas/outputs.tf | 15 + terraform/platform/monitoring/main.tf | 16 + terraform/platform/providers.tf | 5 +- terraform/state/providers.tf | 5 +- 27 files changed, 1274 insertions(+), 81 deletions(-) create mode 100644 terraform/lambda-src/budget_enforcer/handler.py create mode 100644 terraform/lambda-src/resource_tagger/handler.py create mode 100644 terraform/modules/service-rds/main.tf create mode 100644 terraform/modules/service-rds/outputs.tf create mode 100644 terraform/modules/service-rds/variables.tf diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 224606f..a552387 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -39,6 +39,9 @@ jobs: image_uri: ${{ steps.push.outputs.image_uri }} image_tag: ${{ steps.tags.outputs.primary_tag }} steps: + - name: Set session name + run: echo "SESSION_NAME=$(echo 
"${GITHUB_ACTOR}-${GITHUB_SHA:0:8}-${GITHUB_RUN_ID}" | head -c 64)" >> "$GITHUB_ENV" + - uses: actions/checkout@v6 - name: Generate GitHub App token @@ -62,6 +65,7 @@ jobs: with: role-to-assume: arn:aws:iam::${{ inputs.aws_account_id }}:role/javabin-ci-app-broker aws-region: ${{ inputs.aws_region }} + role-session-name: ${{ env.SESSION_NAME }} - name: Get deploy credentials from broker id: broker diff --git a/.github/workflows/ecs-deploy.yml b/.github/workflows/ecs-deploy.yml index 0c81a30..5d21656 100644 --- a/.github/workflows/ecs-deploy.yml +++ b/.github/workflows/ecs-deploy.yml @@ -33,6 +33,9 @@ jobs: name: ECS Deploy runs-on: ubuntu-latest steps: + - name: Set session name + run: echo "SESSION_NAME=$(echo "${GITHUB_ACTOR}-${GITHUB_SHA:0:8}-${GITHUB_RUN_ID}" | head -c 64)" >> "$GITHUB_ENV" + - name: Generate GitHub App token id: app-token uses: actions/create-github-app-token@v2 @@ -55,6 +58,7 @@ jobs: with: role-to-assume: arn:aws:iam::${{ inputs.aws_account_id }}:role/javabin-ci-app-broker aws-region: ${{ inputs.aws_region }} + role-session-name: ${{ env.SESSION_NAME }} - name: Get deploy credentials from broker id: broker diff --git a/.github/workflows/platform-ci.yml b/.github/workflows/platform-ci.yml index a967d2c..a2f9f9f 100644 --- a/.github/workflows/platform-ci.yml +++ b/.github/workflows/platform-ci.yml @@ -44,6 +44,9 @@ jobs: plan_sha256: ${{ steps.upload.outputs.plan_sha256 }} risk_level: ${{ steps.review.outputs.risk_level }} steps: + - name: Set session name + run: echo "SESSION_NAME=$(echo "${GITHUB_ACTOR}-${GITHUB_SHA:0:8}-${GITHUB_RUN_ID}" | head -c 64)" >> "$GITHUB_ENV" + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -71,7 +74,7 @@ jobs: with: role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT_ID }}:role/javabin-ci-infra-plan aws-region: ${{ env.AWS_REGION }} - role-session-name: javabin-platform-plan-${{ github.run_id }} + role-session-name: ${{ env.SESSION_NAME }} - name: Sync registered teams from GitHub org if: 
steps.changes.outputs.has_infra_changes == 'true' && github.ref == 'refs/heads/main' @@ -146,6 +149,9 @@ jobs: needs.plan.outputs.has_changes == 'true' environment: production steps: + - name: Set session name + run: echo "SESSION_NAME=$(echo "${GITHUB_ACTOR}-${GITHUB_SHA:0:8}-${GITHUB_RUN_ID}" | head -c 64)" >> "$GITHUB_ENV" + - uses: actions/checkout@v6 - uses: hashicorp/setup-terraform@v4 @@ -157,7 +163,7 @@ jobs: with: role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT_ID }}:role/javabin-ci-infra aws-region: ${{ env.AWS_REGION }} - role-session-name: javabin-apply-${{ github.run_id }} + role-session-name: ${{ env.SESSION_NAME }} - name: Check risk level env: @@ -194,6 +200,9 @@ jobs: runs-on: ubuntu-latest if: github.event_name == 'schedule' steps: + - name: Set session name + run: echo "SESSION_NAME=$(echo "${GITHUB_ACTOR}-${GITHUB_SHA:0:8}-${GITHUB_RUN_ID}" | head -c 64)" >> "$GITHUB_ENV" + - uses: actions/checkout@v6 - uses: hashicorp/setup-terraform@v4 @@ -205,7 +214,7 @@ jobs: with: role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT_ID }}:role/javabin-ci-infra aws-region: ${{ env.AWS_REGION }} - role-session-name: javabin-drift-${{ github.run_id }} + role-session-name: ${{ env.SESSION_NAME }} - name: Terraform Init working-directory: ${{ env.TF_ROOT }} diff --git a/.github/workflows/tf-plan.yml b/.github/workflows/tf-plan.yml index 23d172f..9e288fa 100644 --- a/.github/workflows/tf-plan.yml +++ b/.github/workflows/tf-plan.yml @@ -46,6 +46,9 @@ jobs: env: PLAN_BUCKET: javabin-ci-plan-artifacts-${{ inputs.aws_account_id }} steps: + - name: Set session name + run: echo "SESSION_NAME=$(echo "${GITHUB_ACTOR}-${GITHUB_SHA:0:8}-${GITHUB_RUN_ID}" | head -c 64)" >> "$GITHUB_ENV" + - uses: actions/checkout@v6 with: ref: ${{ github.ref }} @@ -60,6 +63,7 @@ jobs: with: role-to-assume: arn:aws:iam::${{ inputs.aws_account_id }}:role/javabin-ci-app-broker aws-region: ${{ inputs.aws_region }} + role-session-name: ${{ env.SESSION_NAME }} - name: Get team credentials from 
broker id: broker diff --git a/docs/app-yaml-reference.md b/docs/app-yaml-reference.md index 2306849..1575b08 100644 --- a/docs/app-yaml-reference.md +++ b/docs/app-yaml-reference.md @@ -116,17 +116,31 @@ resources: #### databases -DynamoDB tables. +DynamoDB tables (default) or RDS PostgreSQL instances. ```yaml resources: databases: - name: sessions - hash_key: id # required - range_key: timestamp # optional + hash_key: id # required (DynamoDB) + range_key: timestamp # optional (DynamoDB) env: SESSIONS_TABLE + + - name: main + engine: postgres # "dynamodb" (default) or "postgres"/"postgresql" + instance_class: db.t3.micro # RDS only, default: db.t3.micro + allocated_storage: 20 # GB, RDS only, default: 20 + engine_version: "16" # PostgreSQL version, RDS only, default: "16" + backup_retention_period: 7 # days, RDS only, default: 7 + multi_az: false # RDS only, default: false + deletion_protection: true # RDS only, default: true + env: DATABASE_URL ``` +DynamoDB and PostgreSQL entries can coexist in the same `databases` list. Entries without `engine` (or with `engine: dynamodb`) use the DynamoDB module. Entries with `engine: postgres` or `engine: postgresql` use the RDS module. + +RDS instances use `manage_master_user_password = true`, which stores the auto-generated master password in Secrets Manager. The ECS task role automatically receives IAM policies for `rds-db:connect` and `secretsmanager:GetSecretValue` on the password secret. + #### secrets Secrets Manager secrets. Value is set manually after creation. @@ -354,6 +368,7 @@ Generated files have a `# GENERATED FROM app.yaml` marker. 
The script only overw | S3 bucket | `javabin-{bucket_name}-{account_id}` | | DynamoDB table | `javabin-{table_name}` | | SQS queue | `javabin-{queue_name}` | +| RDS instance | `{db_name}` (identifier) | | Secrets Manager | `javabin/{secret_name}` | | IAM task role | `javabin-{name}` | | CloudWatch logs | `/ecs/javabin/{name}` | diff --git a/scripts/ensure-tf-boilerplate.sh b/scripts/ensure-tf-boilerplate.sh index a1d019c..3a0bff1 100755 --- a/scripts/ensure-tf-boilerplate.sh +++ b/scripts/ensure-tf-boilerplate.sh @@ -58,9 +58,11 @@ provider "aws" { default_tags { tags = { - project = "${REPO_NAME}" - team = "${TEAM}" - managed-by = "terraform" + team = "${TEAM}" + service = "${REPO_NAME}" + repo = "${GITHUB_REPOSITORY}" + environment = "production" + managed-by = "terraform" } } } diff --git a/scripts/expand-modules.py b/scripts/expand-modules.py index 6a03259..d0abdae 100644 --- a/scripts/expand-modules.py +++ b/scripts/expand-modules.py @@ -34,6 +34,25 @@ GENERATED_MARKER = "# GENERATED FROM app.yaml — do not edit, changes will be overwritten" +# Engine aliases — these all resolve to "postgres" for registry matching +_POSTGRES_ENGINES = {"postgres", "postgresql"} + + +def _item_matches_engine_filter(entry, item): + """Check if a YAML list item matches the entry's engine_filter. + + If the entry has no engine_filter, the item always matches. + Items without an explicit 'engine' field default to 'dynamodb' + for backward compatibility with existing DynamoDB entries. 
+ """ + engine_filter = entry.get("engine_filter") + if engine_filter is None: + return True + item_engine = (item.get("engine") or "dynamodb").lower() + if item_engine in _POSTGRES_ENGINES: + item_engine = "postgres" + return item_engine == engine_filter + # --------------------------------------------------------------------------- # YAML helpers @@ -592,8 +611,11 @@ def main(): instance_key = entry.get("instance_key", "name") for item in items: inst_name = item.get(instance_key, "") - if inst_name: - collection_instances.append((entry["id"], inst_name, entry)) + if not inst_name: + continue + if not _item_matches_engine_filter(entry, item): + continue + collection_instances.append((entry["id"], inst_name, entry)) # -- Pre-compute collect expressions -- access_policies = collect_access_policies(collection_instances) @@ -649,6 +671,8 @@ def main(): inst_name = item.get(instance_key, "") if not inst_name: continue + if not _item_matches_engine_filter(entry, item): + continue hcl = _expand_collection_item( entry, source, yaml_data, env_vars, ref_resolver, item, inst_name, mod_vars, @@ -670,8 +694,9 @@ def main(): write_file( os.path.join(tf_root, "providers.tf"), PROVIDERS_TEMPLATE.format( - region=region, project=PROJECT, + region=region, service=service, team=app_team, + repo=os.environ.get("GITHUB_REPOSITORY", f"javaBin/{service}"), ), ) write_file( diff --git a/scripts/registry.py b/scripts/registry.py index 03aea58..067575e 100644 --- a/scripts/registry.py +++ b/scripts/registry.py @@ -115,7 +115,7 @@ }, # ------------------------------------------------------------------ - # service-role: ECS task IAM role + policies + # service-role: task IAM role + policies (ECS, Lambda, or EC2) # ------------------------------------------------------------------ { "id": "task_role", @@ -128,6 +128,7 @@ "region": "env:AWS_REGION", "aws_account_id": "env:AWS_ACCOUNT_ID", "permissions_boundary_arn": "ref:platform.developer_boundary_arn", + "trusted_services": 
"list:yaml:compute.trusted_service|default:ecs-tasks.amazonaws.com", "additional_policy_jsons": "collect:access_policy_json", }, "rename": "task", @@ -262,6 +263,7 @@ "cardinality": "collection", "yaml_list": "resources.databases", "instance_key": "name", + "engine_filter": "dynamodb", "vars": { "name": "item:name", "project": "yaml:name", @@ -290,6 +292,39 @@ }, }, }, + { + "id": "rds", + "module_path": f"{MODULE_ROOT}/service-rds", + "output_file": "databases.tf", + "cardinality": "collection", + "yaml_list": "resources.databases", + "instance_key": "name", + "engine_filter": "postgres", + "vars": { + "name": "item:name", + "engine_version": "item:engine_version|default:16", + "instance_class": "item:instance_class|default:db.t3.micro", + "allocated_storage": "item:allocated_storage|default:20", + "subnet_ids": "ref:platform.private_subnet_ids", + "vpc_id": "ref:platform.vpc_id", + "allowed_security_group_ids": "list:ref:platform.ecs_tasks_security_group_id", + "backup_retention_period": "item:backup_retention_period|default:7", + "multi_az": "item:multi_az|default:false", + "deletion_protection": "item:deletion_protection|default:true", + }, + "rename": "rds", + "output_map": { + "endpoint": "aws_db_instance.{instance}.endpoint", + "port": "aws_db_instance.{instance}.port", + "db_name": "aws_db_instance.{instance}.db_name", + "access_policy_json": "data.aws_iam_policy_document.{instance}_access.json", + "security_group_id": "aws_security_group.{instance}.id", + }, + "exports": { + "access_policy_json": True, + "env_var": {"output": "endpoint", "yaml_field": "env", "target": "environment"}, + }, + }, { "id": "secret", "module_path": f"{MODULE_ROOT}/service-secret", @@ -378,11 +413,11 @@ default_tags {{ tags = {{ - project = "{project}" - managed-by = "terraform" - service = "{service}" team = "{team}" + service = "{service}" + repo = "{repo}" environment = "production" + managed-by = "terraform" }} }} }} diff --git 
a/terraform/lambda-src/budget_enforcer/handler.py b/terraform/lambda-src/budget_enforcer/handler.py new file mode 100644 index 0000000..f7731d2 --- /dev/null +++ b/terraform/lambda-src/budget_enforcer/handler.py @@ -0,0 +1,196 @@ +"""Budget enforcer — scales ECS services to zero when a team exceeds 200% of budget. + +Trigger: SNS notification from AWS Budgets (200% threshold). +Parses budget name to extract team, finds ECS services tagged with that team, +scales them to desired_count=0, and posts a Slack alert to the cost channel. + +Does NOT destroy services — just scales to zero for easy recovery. +""" + +import json +import logging +import os +from datetime import datetime, timezone + +import boto3 +from shared.constants import USD_TO_NOK +from shared.slack import get_webhook_url, post_to_slack + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +ecs = boto3.client("ecs") +ssm = boto3.client("ssm") + +CLUSTER = os.environ.get("ECS_CLUSTER", "javabin-platform") +COST_WEBHOOK_PARAM = os.environ["COST_WEBHOOK_PARAM"] +BUDGET_NAME_PREFIX = os.environ.get("BUDGET_NAME_PREFIX", "javabin-team-") + + +# --------------------------------------------------------------------------- +# ECS helpers +# --------------------------------------------------------------------------- +def list_services_for_team(cluster, team_name): + """List ECS services in the cluster that are tagged with the given team.""" + matched = [] + paginator = ecs.get_paginator("list_services") + + for page in paginator.paginate(cluster=cluster): + service_arns = page.get("serviceArns", []) + if not service_arns: + continue + + # DescribeServices accepts max 10 at a time + for i in range(0, len(service_arns), 10): + batch = service_arns[i : i + 10] + desc = ecs.describe_services(cluster=cluster, services=batch, include=["TAGS"]) + for svc in desc.get("services", []): + tags = {t["key"]: t["value"] for t in svc.get("tags", [])} + if tags.get("team") == team_name: + matched.append(svc) + 
+ + return matched + + +def scale_service_to_zero(cluster, service_name): + """Scale an ECS service to desired_count=0.""" + ecs.update_service( + cluster=cluster, + service=service_name, + desiredCount=0, + ) + logger.info("Scaled %s to 0 in cluster %s", service_name, cluster) + + +# --------------------------------------------------------------------------- +# Slack notification +# --------------------------------------------------------------------------- +def build_alert_blocks(team_name, budget_name, services_scaled, budget_limit): + """Build Slack Block Kit message for budget enforcement action.""" + ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") + + service_list = "\n".join( + f" - `{svc}`" for svc in services_scaled + ) if services_scaled else " _(no matching services found)_" + + nok_limit = float(budget_limit) * USD_TO_NOK if budget_limit else None + limit_text = (f"${budget_limit}" if budget_limit else "unknown") + (f" (~{nok_limit:.0f} NOK)" if nok_limit else "") + + blocks = [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": ":rotating_light: Budget Enforcement — Services Scaled to Zero", + "emoji": True, + }, + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + f"Team *{team_name}* has exceeded *200%* of their monthly budget " + f"(`{budget_name}`, limit: {limit_text}).\n\n" + f"The following ECS services have been scaled to `desired_count=0`:" + ), + }, + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": service_list, + }, + }, + {"type": "divider"}, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + ":arrow_right: *To restore services*, increase the team budget in " + "the registry or manually scale services back up via the AWS console/CLI." 
+ ), + }, + }, + { + "type": "context", + "elements": [ + { + "type": "mrkdwn", + "text": f"Enforced at {ts} | Cluster: `{CLUSTER}`", + } + ], + }, + ] + return blocks + + +# --------------------------------------------------------------------------- +# Lambda entry point +# --------------------------------------------------------------------------- +def handler(event, context): + for record in event.get("Records", []): + sns_message = record.get("Sns", {}).get("Message", "{}") + + try: + message = json.loads(sns_message) + except json.JSONDecodeError: + logger.error("Failed to parse SNS message: %s", sns_message) + continue + + budget_name = message.get("budgetName", "") + if not budget_name.startswith(BUDGET_NAME_PREFIX): + logger.info( + "Ignoring budget notification for %s (not a team budget)", budget_name + ) + continue + + team_name = budget_name[len(BUDGET_NAME_PREFIX) :] + if not team_name: + logger.error("Could not extract team name from budget: %s", budget_name) + continue + + budget_limit = message.get("budgetLimit", {}).get("amount") + + logger.info( + "Budget enforcement triggered for team %s (budget: %s)", + team_name, + budget_name, + ) + + # Find and scale down team's ECS services + services = list_services_for_team(CLUSTER, team_name) + services_scaled = [] + + for svc in services: + svc_name = svc["serviceName"] + current_count = svc.get("desiredCount", 0) + if current_count == 0: + logger.info( + "Service %s already at desired_count=0, skipping", svc_name + ) + continue + try: + scale_service_to_zero(CLUSTER, svc_name) + services_scaled.append(svc_name) + except Exception as e: + logger.error("Failed to scale service %s: %s", svc_name, e) + + # Post Slack notification regardless of whether services were found + blocks = build_alert_blocks( + team_name, budget_name, services_scaled, budget_limit + ) + fallback = ( + f"Budget enforcement: team {team_name} exceeded 200% — " + f"{len(services_scaled)} service(s) scaled to zero" + ) + + try: + 
webhook_url = get_webhook_url(ssm, COST_WEBHOOK_PARAM) + post_to_slack(webhook_url, {"blocks": blocks, "text": fallback}) + except Exception as e: + logger.error("Failed to send Slack notification: %s", e) + + return {"statusCode": 200, "body": "OK"} diff --git a/terraform/lambda-src/cost_report/handler.py b/terraform/lambda-src/cost_report/handler.py index 291adfd..8b59cee 100644 --- a/terraform/lambda-src/cost_report/handler.py +++ b/terraform/lambda-src/cost_report/handler.py @@ -378,7 +378,7 @@ def analyze_spike(this_week, prev_week, tw_total, pw_total): # --------------------------------------------------------------------------- # Block Kit builder # --------------------------------------------------------------------------- -def build_blocks(this_week, prev_week, mtd, prev_mtd, project_costs, +def build_blocks(this_week, prev_week, mtd, prev_mtd, project_costs, team_costs, tw_start, tw_end, pw_start, pw_end, mtd_start_d, mtd_end_d, today): blocks = [] @@ -414,6 +414,16 @@ def build_blocks(this_week, prev_week, mtd, prev_mtd, project_costs, }) blocks.append(build_service_table(filtered, tw_total)) + # Per-team tag breakdown as table + if team_costs: + filtered_teams = {k: v for k, v in team_costs.items() if v >= 0.01} + if filtered_teams: + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": ":busts_in_silhouette: *By Team Tag*"} + }) + blocks.append(build_service_table(filtered_teams, tw_total)) + # Categorised service breakdown as tables ai, infra, other = categorise(this_week) prev_ai, prev_infra, prev_other = categorise(prev_week) @@ -534,15 +544,16 @@ def handler(event, context): pm_day = min((mtd_end_d - mtd_start_d).days, days_in_month(pm_start) - 1) pm_end = pm_start + timedelta(days=pm_day) - # Fetch cost data (5 API calls: 4 by-service + 1 by-tag) + # Fetch cost data (6 API calls: 4 by-service + 2 by-tag) this_week = get_costs_by_service(ce, tw_start, tw_end) prev_week = get_costs_by_service(ce, pw_start, pw_end) mtd = 
get_costs_by_service(ce, mtd_start_d, mtd_end_d) prev_mtd = get_costs_by_service(ce, pm_start, pm_end) project_costs = get_costs_by_tag(ce, tw_start, tw_end, "project") + team_costs = get_costs_by_tag(ce, tw_start, tw_end, "team") blocks = build_blocks( - this_week, prev_week, mtd, prev_mtd, project_costs, + this_week, prev_week, mtd, prev_mtd, project_costs, team_costs, tw_start, tw_end, pw_start, pw_end, mtd_start_d, mtd_end_d, today, ) diff --git a/terraform/lambda-src/daily_cost_check/handler.py b/terraform/lambda-src/daily_cost_check/handler.py index a972edd..d958c27 100644 --- a/terraform/lambda-src/daily_cost_check/handler.py +++ b/terraform/lambda-src/daily_cost_check/handler.py @@ -229,6 +229,14 @@ def build_alert_blocks(spikes, spike_details, yesterday_date): ) detail_parts.append(f"*By project:* {tag_lines}") + team_tags = detail.get("team_tags") + if team_tags: + team_lines = ", ".join( + f"{t}: ${c:.2f}" for t, c in + sorted(team_tags.items(), key=lambda x: x[1], reverse=True)[:5] + ) + detail_parts.append(f"*By team:* {team_lines}") + ce_url = detail.get("url") if ce_url: detail_parts.append(f"<{ce_url}|View in Cost Explorer>") @@ -283,6 +291,10 @@ def handler(event, context): detail["tags"] = get_tag_breakdown(ce, yesterday, svc) except Exception as e: logger.warning("Tag query failed for %s: %s", svc, e) + try: + detail["team_tags"] = get_tag_breakdown(ce, yesterday, svc, tag_key="team") + except Exception as e: + logger.warning("Team tag query failed for %s: %s", svc, e) spike_details[svc] = detail blocks = build_alert_blocks(spikes, spike_details, yesterday) diff --git a/terraform/lambda-src/resource_tagger/handler.py b/terraform/lambda-src/resource_tagger/handler.py new file mode 100644 index 0000000..70a4df9 --- /dev/null +++ b/terraform/lambda-src/resource_tagger/handler.py @@ -0,0 +1,312 @@ +"""Resource tagger — auto-tags newly created AWS resources with creator attribution. 
+ +Receives CloudTrail events via EventBridge (same trigger pattern as compliance-reporter). +Tags resources with `created-by` and `commit` extracted from the IAM session name. + +For CI users: session name = {actor}-{sha8}-{run_id} -> created-by={actor}, commit={sha8} +For console users (Identity Center): created-by from principalId/username, no commit tag. + +Best-effort: never fails the Lambda. Logs warnings for unhandled event types. +""" + +import json +import logging +import os +import re + +import boto3 + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +AWS_ACCOUNT_ID = os.environ.get("AWS_ACCOUNT_ID", "") +REGION = os.environ.get("AWS_REGION", "eu-central-1") + +tagging_client = boto3.client("resourcegroupstaggingapi") + + +def handler(event, context): + """Main entry point — EventBridge CloudTrail event.""" + detail = event.get("detail", {}) + event_name = detail.get("eventName", "") + event_source = detail.get("eventSource", "") + + identity = detail.get("userIdentity", {}) + caller_arn = identity.get("arn", "") + + # Skip events from our own role to prevent loops + if "resource-tagger" in caller_arn: + return + + # Extract resource ARNs from the event + arns = extract_resource_arns(event_name, event_source, detail) + if not arns: + logger.info("No taggable ARNs for %s from %s", event_name, event_source) + return + + # Parse creator identity from the session + created_by, commit = parse_session_identity(identity) + if not created_by: + logger.info("Could not determine creator for %s", event_name) + return + + # Build tags to apply + tags = {"created-by": created_by} + if commit: + tags["commit"] = commit + + # Tag each resource, skipping those that already have created-by + for arn in arns: + tag_resource(arn, tags) + + +def parse_session_identity(identity): + """Extract creator and commit from CloudTrail userIdentity. + + Returns (created_by, commit) where commit may be None. 
+ """ + identity_type = identity.get("type", "") + arn = identity.get("arn", "unknown") + + if identity_type == "AssumedRole": + parts = arn.split("/") + role_name = parts[1] if len(parts) > 1 else "unknown" + session = parts[2] if len(parts) > 2 else "" + + # Check if this is a GitHub Actions CI session + is_ci = "javabin-ci-" in role_name.lower() + + if is_ci and session: + # Parse enriched session name: {actor}-{sha8}-{run_id} + # Actor may contain hyphens, sha8 is 8 hex chars, run_id is digits + enriched_match = re.match(r"^(.+)-([0-9a-f]{8})-(\d+)$", session) + if enriched_match: + actor = enriched_match.group(1) + sha8 = enriched_match.group(2) + return actor, sha8 + + # Fallback: use session name as actor, no commit + return session, None + + # Non-CI assumed role (Identity Center, etc.) + # Use role_name + session for attribution + if session: + return f"{role_name}/{session}", None + return role_name, None + + elif identity_type == "IAMUser": + username = identity.get("userName", arn.split("/")[-1]) + return username, None + + elif identity_type == "Root": + return "root", None + + # Federated or unknown + principal_id = identity.get("principalId", "") + if principal_id: + return principal_id, None + + return None, None + + +def extract_resource_arns(event_name, event_source, detail): + """Extract resource ARNs from a CloudTrail event. + + Tries multiple extraction strategies in order of reliability: + 1. detail.resources[].ARN — EventBridge-enriched ARN + 2. responseElements — service-specific ARN fields + 3. requestParameters — build ARN from identifiers (EC2, S3) + + Returns a list of ARN strings. 
+ """ + arns = [] + + # Strategy 1: EventBridge-enriched resources array + resources = detail.get("resources", []) + for res in resources: + arn = res.get("ARN") or res.get("arn") + if arn: + arns.append(arn) + if arns: + return arns + + # Strategy 2: responseElements — common ARN fields + response = detail.get("responseElements") or {} + request = detail.get("requestParameters") or {} + + if event_source == "ec2.amazonaws.com": + arns.extend(_extract_ec2_arns(event_name, response)) + + elif event_source == "s3.amazonaws.com": + if event_name == "CreateBucket": + bucket = request.get("bucketName") + if bucket: + arns.append(f"arn:aws:s3:::{bucket}") + + elif event_source == "rds.amazonaws.com": + arn = response.get("dBInstanceArn") + if arn: + arns.append(arn) + # CreateDBCluster + arn = response.get("dBClusterArn") + if arn: + arns.append(arn) + + elif event_source == "ecs.amazonaws.com": + arns.extend(_extract_ecs_arns(event_name, response)) + + elif event_source == "elasticloadbalancing.amazonaws.com": + arns.extend(_extract_elb_arns(event_name, response)) + + elif event_source == "lambda.amazonaws.com": + arn = response.get("functionArn") + if arn: + arns.append(arn) + + if not arns: + # Strategy 3: scan responseElements for any field ending in "Arn" or "ARN" + arns.extend(_scan_for_arns(response)) + + return arns + + +def _extract_ec2_arns(event_name, response): + """Extract ARNs from EC2 CloudTrail response.""" + arns = [] + if event_name == "RunInstances": + for inst in response.get("instancesSet", {}).get("items", []): + iid = inst.get("instanceId") + if iid: + arns.append( + f"arn:aws:ec2:{REGION}:{AWS_ACCOUNT_ID}:instance/{iid}" + ) + elif event_name == "CreateSecurityGroup": + gid = response.get("groupId") + if gid: + arns.append( + f"arn:aws:ec2:{REGION}:{AWS_ACCOUNT_ID}:security-group/{gid}" + ) + elif event_name == "CreateNatGateway": + nat = response.get("CreateNatGatewayResponse", response) + nat_id = nat.get("natGateway", {}).get("natGatewayId") + 
if nat_id: + arns.append( + f"arn:aws:ec2:{REGION}:{AWS_ACCOUNT_ID}:natgateway/{nat_id}" + ) + elif event_name == "CreateVpc": + vpc_id = response.get("vpc", {}).get("vpcId") + if vpc_id: + arns.append( + f"arn:aws:ec2:{REGION}:{AWS_ACCOUNT_ID}:vpc/{vpc_id}" + ) + elif event_name == "CreateSubnet": + subnet_id = response.get("subnet", {}).get("subnetId") + if subnet_id: + arns.append( + f"arn:aws:ec2:{REGION}:{AWS_ACCOUNT_ID}:subnet/{subnet_id}" + ) + return arns + + +def _extract_ecs_arns(event_name, response): + """Extract ARNs from ECS CloudTrail response.""" + arns = [] + if event_name == "CreateCluster": + arn = response.get("cluster", {}).get("clusterArn") + if arn: + arns.append(arn) + elif event_name == "CreateService": + arn = response.get("service", {}).get("serviceArn") + if arn: + arns.append(arn) + elif event_name == "RunTask": + for task in response.get("tasks", []): + arn = task.get("taskArn") + if arn: + arns.append(arn) + elif event_name == "CreateTaskSet": + arn = response.get("taskSet", {}).get("taskSetArn") + if arn: + arns.append(arn) + return arns + + +def _extract_elb_arns(event_name, response): + """Extract ARNs from ELB CloudTrail response.""" + arns = [] + if event_name == "CreateLoadBalancer": + for lb in response.get("loadBalancers", []): + arn = lb.get("loadBalancerArn") + if arn: + arns.append(arn) + elif event_name == "CreateTargetGroup": + for tg in response.get("targetGroups", []): + arn = tg.get("targetGroupArn") + if arn: + arns.append(arn) + return arns + + +def _scan_for_arns(obj, depth=0): + """Recursively scan a dict for values that look like ARNs. + + Limits depth to avoid excessive recursion on large events. 
+ """ + if depth > 4: + return [] + + arns = [] + if isinstance(obj, dict): + for key, value in obj.items(): + if isinstance(value, str) and value.startswith("arn:aws:"): + arns.append(value) + elif isinstance(value, (dict, list)): + arns.extend(_scan_for_arns(value, depth + 1)) + elif isinstance(obj, list): + for item in obj: + arns.extend(_scan_for_arns(item, depth + 1)) + return arns + + +def tag_resource(arn, tags): + """Tag a single resource, checking for existing created-by tag first. + + Uses the Resource Groups Tagging API for uniform cross-service tagging. + Skips if created-by already exists (preserves original creator). + """ + try: + # Check existing tags + resp = tagging_client.get_resources( + ResourceARNList=[arn], + # No ResourcesPerPage here: GetResources rejects pagination parameters combined with ResourceARNList. + ) + resource_list = resp.get("ResourceTagMappingList", []) + if resource_list: + existing_tags = { + t["Key"]: t["Value"] + for t in resource_list[0].get("Tags", []) + } + if "created-by" in existing_tags: + logger.info( + "Skipping %s — already has created-by=%s", + arn, + existing_tags["created-by"], + ) + return + + # Apply tags + tagging_client.tag_resources( + ResourceARNList=[arn], + Tags=tags, + ) + logger.info("Tagged %s with %s", arn, json.dumps(tags)) + + except tagging_client.exceptions.InvalidParameterException as e: + logger.warning("Cannot tag %s (unsupported resource type): %s", arn, e) + except Exception as e: + # Best effort — never fail the Lambda + error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "") + if error_code in ("InternalServiceException", "ThrottlingException"): + logger.warning("Transient error tagging %s: %s", arn, e) + else: + logger.warning("Failed to tag %s: %s", arn, e) diff --git a/terraform/lambda-src/slack_alert/handler.py b/terraform/lambda-src/slack_alert/handler.py index 6f4642e..e468600 100644 --- a/terraform/lambda-src/slack_alert/handler.py +++ b/terraform/lambda-src/slack_alert/handler.py @@ -305,12 +305,24 @@ def parse_identity(detail): ci_ctx["repo"] 
= sub_match.group(1) ci_ctx["ref"] = sub_match.group(2) - # Fallback: extract run ID from our session name convention - # Pattern: javabin-{purpose}-{run_id} or javabin-{repo}-{purpose}-{run_id} - if not ci_ctx.get("GitHubRunID"): - run_match = re.search(r"-(\d{8,})$", session) - if run_match: - ci_ctx["GitHubRunID"] = run_match.group(1) + # Parse enriched session name: {actor}-{sha8}-{run_id} + # The actor may contain hyphens, sha8 is 8 hex chars, run_id is digits + enriched_match = re.match( + r"^(.+)-([0-9a-f]{8})-(\d+)$", session + ) + if enriched_match: + if not ci_ctx.get("GitHubActor"): + ci_ctx["GitHubActor"] = enriched_match.group(1) + if not ci_ctx.get("GitHubSHA"): + ci_ctx["GitHubSHA"] = enriched_match.group(2) + if not ci_ctx.get("GitHubRunID"): + ci_ctx["GitHubRunID"] = enriched_match.group(3) + else: + # Legacy fallback: javabin-{purpose}-{run_id} + if not ci_ctx.get("GitHubRunID"): + run_match = re.search(r"-(\d{8,})$", session) + if run_match: + ci_ctx["GitHubRunID"] = run_match.group(1) # Fallback: infer repo from role name if not resolved # javabin-ci-infra → javaBin/platform diff --git a/terraform/lambda-src/team_provisioner/handler.py b/terraform/lambda-src/team_provisioner/handler.py index d129ffb..348a233 100644 --- a/terraform/lambda-src/team_provisioner/handler.py +++ b/terraform/lambda-src/team_provisioner/handler.py @@ -63,6 +63,7 @@ PASSWORD_SET_URL_PARAM = os.environ.get( "PASSWORD_SET_URL_PARAM", "/javabin/platform/password-set-function-url" ) +BUDGET_ENFORCEMENT_TOPIC_ARN = os.environ.get("BUDGET_ENFORCEMENT_TOPIC_ARN", "") # javaBin horizontal white logo PNG — loaded from the Lambda zip at cold start. # CID-embedded in welcome emails to avoid "trust this email" prompts in Outlook. 
@@ -647,21 +648,37 @@ def sync_budget(team): ) logger.info("Updated budget %s: $%.2f/mo", budget_name, budget_usd) except budgets_client.exceptions.NotFoundException: - notification = { - "Notification": { - "NotificationType": "ACTUAL", - "ComparisonOperator": "GREATER_THAN", - "Threshold": 80.0, - "ThresholdType": "PERCENTAGE", + notifications = [ + { + "Notification": { + "NotificationType": "ACTUAL", + "ComparisonOperator": "GREATER_THAN", + "Threshold": 80.0, + "ThresholdType": "PERCENTAGE", + }, + "Subscribers": [ + {"SubscriptionType": "SNS", "Address": ALERTS_TOPIC_ARN} + ], }, - "Subscribers": [ - {"SubscriptionType": "SNS", "Address": ALERTS_TOPIC_ARN} - ], - } + ] + # 200% enforcement notification — triggers budget-enforcer Lambda to + # scale ECS services to zero for the team. + if BUDGET_ENFORCEMENT_TOPIC_ARN: + notifications.append({ + "Notification": { + "NotificationType": "ACTUAL", + "ComparisonOperator": "GREATER_THAN", + "Threshold": 200.0, + "ThresholdType": "PERCENTAGE", + }, + "Subscribers": [ + {"SubscriptionType": "SNS", "Address": BUDGET_ENFORCEMENT_TOPIC_ARN} + ], + }) budgets_client.create_budget( AccountId=ACCOUNT_ID, Budget=budget_spec, - NotificationsWithSubscribers=[notification], + NotificationsWithSubscribers=notifications, ) logger.info("Created budget %s: $%.2f/mo", budget_name, budget_usd) diff --git a/terraform/modules/ecs-service/main.tf b/terraform/modules/ecs-service/main.tf index eb197ba..baa9226 100644 --- a/terraform/modules/ecs-service/main.tf +++ b/terraform/modules/ecs-service/main.tf @@ -97,5 +97,8 @@ resource "aws_ecs_service" "this" { container_port = var.port } + enable_ecs_managed_tags = true + propagate_tags = "SERVICE" + depends_on = [aws_ecs_task_definition.this] } diff --git a/terraform/modules/service-rds/main.tf b/terraform/modules/service-rds/main.tf new file mode 100644 index 0000000..05176e8 --- /dev/null +++ b/terraform/modules/service-rds/main.tf @@ -0,0 +1,91 @@ 
# PostgreSQL instance for service data. The master password is generated and
# stored by RDS in Secrets Manager (manage_master_user_password); the secret
# ARN is exposed to task roles through data.aws_iam_policy_document.access.
resource "aws_db_instance" "this" {
  identifier     = var.name
  engine         = "postgres"
  engine_version = var.engine_version
  instance_class = var.instance_class

  allocated_storage = var.allocated_storage
  storage_encrypted = true

  # Database names cannot contain hyphens, so derive one from the identifier.
  db_name  = replace(var.name, "-", "_")
  username = "postgres"

  manage_master_user_password = true

  # The access policy below grants rds-db:connect. That permission is only
  # honoured when IAM database authentication is enabled on the instance;
  # without this flag the RDSConnect statement is dead weight.
  iam_database_authentication_enabled = true

  db_subnet_group_name   = aws_db_subnet_group.this.name
  vpc_security_group_ids = [aws_security_group.this.id]

  backup_retention_period = var.backup_retention_period
  multi_az                = var.multi_az
  deletion_protection     = var.deletion_protection

  # Always keep a final snapshot when the instance is destroyed.
  skip_final_snapshot       = false
  final_snapshot_identifier = "${var.name}-final"

  tags = merge(var.tags, {
    Name = var.name
  })
}
+ # ARN format: arn:aws:rds-db:{region}:{account}:dbuser:{dbi-resource-id}/* + "arn:aws:rds-db:*:*:dbuser:${aws_db_instance.this.resource_id}/*", + ] + } + + statement { + sid = "SecretsManagerRead" + effect = "Allow" + actions = [ + "secretsmanager:GetSecretValue", + "secretsmanager:DescribeSecret", + ] + resources = [ + aws_db_instance.this.master_user_secret[0].secret_arn, + ] + } +} diff --git a/terraform/modules/service-rds/outputs.tf b/terraform/modules/service-rds/outputs.tf new file mode 100644 index 0000000..536217c --- /dev/null +++ b/terraform/modules/service-rds/outputs.tf @@ -0,0 +1,24 @@ +output "endpoint" { + description = "RDS instance endpoint" + value = aws_db_instance.this.endpoint +} + +output "port" { + description = "RDS instance port" + value = aws_db_instance.this.port +} + +output "db_name" { + description = "Database name" + value = aws_db_instance.this.db_name +} + +output "access_policy_json" { + description = "IAM policy JSON granting access to this database" + value = data.aws_iam_policy_document.access.json +} + +output "security_group_id" { + description = "Security group ID for the RDS instance" + value = aws_security_group.this.id +} diff --git a/terraform/modules/service-rds/variables.tf b/terraform/modules/service-rds/variables.tf new file mode 100644 index 0000000..869a08c --- /dev/null +++ b/terraform/modules/service-rds/variables.tf @@ -0,0 +1,61 @@ +variable "name" { + description = "Database instance identifier" + type = string +} + +variable "engine_version" { + description = "PostgreSQL engine version" + type = string + default = "16" +} + +variable "instance_class" { + description = "RDS instance class" + type = string + default = "db.t3.micro" +} + +variable "allocated_storage" { + description = "Storage in GB" + type = number + default = 20 +} + +variable "subnet_ids" { + description = "Private subnet IDs for DB subnet group" + type = list(string) +} + +variable "vpc_id" { + description = "VPC ID for security group" + 
type = string +} + +variable "allowed_security_group_ids" { + description = "Security group IDs allowed to connect (e.g., ECS tasks SG)" + type = list(string) +} + +variable "backup_retention_period" { + description = "Backup retention in days" + type = number + default = 7 +} + +variable "multi_az" { + description = "Enable Multi-AZ deployment" + type = bool + default = false +} + +variable "deletion_protection" { + description = "Enable deletion protection" + type = bool + default = true +} + +variable "tags" { + description = "Additional tags" + type = map(string) + default = {} +} diff --git a/terraform/modules/service-role/main.tf b/terraform/modules/service-role/main.tf index 570df7b..e34d4b6 100644 --- a/terraform/modules/service-role/main.tf +++ b/terraform/modules/service-role/main.tf @@ -11,7 +11,7 @@ resource "aws_iam_role" "this" { Statement = [{ Effect = "Allow" Principal = { - Service = "ecs-tasks.amazonaws.com" + Service = var.trusted_services } Action = "sts:AssumeRole" }] diff --git a/terraform/modules/service-role/variables.tf b/terraform/modules/service-role/variables.tf index fb027cd..addefc7 100644 --- a/terraform/modules/service-role/variables.tf +++ b/terraform/modules/service-role/variables.tf @@ -23,6 +23,12 @@ variable "permissions_boundary_arn" { type = string } +variable "trusted_services" { + description = "AWS service principals allowed to assume this role (e.g. ecs-tasks, ec2, lambda)" + type = list(string) + default = ["ecs-tasks.amazonaws.com"] +} + variable "additional_policy_jsons" { description = "Map of policy name => IAM policy JSON to attach (e.g. 
from service-bucket or service-database access_policy_json outputs)" type = map(string) diff --git a/terraform/org/providers.tf b/terraform/org/providers.tf index de3b1cf..5e25c5e 100644 --- a/terraform/org/providers.tf +++ b/terraform/org/providers.tf @@ -11,10 +11,11 @@ terraform { locals { required_tags = { - project = var.project team = "javabin" - managed-by = "terraform" + service = "org" + repo = "javaBin/platform" environment = var.environment + managed-by = "terraform" } } diff --git a/terraform/platform/iam/main.tf b/terraform/platform/iam/main.tf index 81aa642..4b9c098 100644 --- a/terraform/platform/iam/main.tf +++ b/terraform/platform/iam/main.tf @@ -380,9 +380,27 @@ resource "aws_iam_role_policy" "ci_team_deny" { name = "deny-platform-operations" role = aws_iam_role.ci_team[each.key].id + # Deny policy protects platform infrastructure from team CI roles. + # + # Design principle: use tag-based ABAC where AWS supports it, explicit + # denies only where the service lacks tag condition support. Each statement + # documents WHY it can't be replaced with ABAC. + # + # The allow policy (ci_team_allow) already gates all actions behind + # aws:ResourceTag/team + aws:RequestTag/team conditions. These denies + # are a second layer for services that don't honor tag conditions. policy = jsonencode({ Version = "2012-10-17" Statement = [ + # ---------------------------------------------------------------- + # EC2 networking — MUST be explicit deny + # + # AWS EC2 does NOT support aws:RequestTag conditions on CreateVpc, + # CreateSubnet, CreateInternetGateway, or CreateNatGateway. + # Resource doesn't exist at creation time so aws:ResourceTag is N/A. + # Cross-service dependencies (subnet→VPC) can't be scoped by tag. 
+ # See: docs.aws.amazon.com/AWSEC2/latest/UserGuide/supported-iam-actions-tagging.html + # ---------------------------------------------------------------- { Sid = "DenyNetworkInfra" Effect = "Deny" @@ -393,22 +411,16 @@ resource "aws_iam_role_policy" "ci_team_deny" { "ec2:CreateInternetGateway", "ec2:DeleteInternetGateway", "ec2:AttachInternetGateway", "ec2:DetachInternetGateway", "ec2:CreateRouteTable", "ec2:DeleteRouteTable", - "ec2:CreateSecurityGroup", "ec2:DeleteSecurityGroup", - "ec2:AuthorizeSecurityGroupIngress", "ec2:RevokeSecurityGroupIngress", - "ec2:AuthorizeSecurityGroupEgress", "ec2:RevokeSecurityGroupEgress", - ] - Resource = "*" - }, - { - Sid = "DenyLoadBalancerAndCluster" - Effect = "Deny" - Action = [ - "elasticloadbalancingv2:CreateLoadBalancer", - "elasticloadbalancingv2:DeleteLoadBalancer", - "ecs:CreateCluster", "ecs:DeleteCluster", ] Resource = "*" }, + # ---------------------------------------------------------------- + # Security services — MUST be explicit deny + # + # GuardDuty, Security Hub, AWS Config, and CloudTrail do NOT + # support aws:ResourceTag or aws:RequestTag conditions in IAM. + # There is no tag-based mechanism to protect these services. + # ---------------------------------------------------------------- { Sid = "DenySecurityServices" Effect = "Deny" @@ -420,6 +432,11 @@ resource "aws_iam_role_policy" "ci_team_deny" { ] Resource = "*" }, + # ---------------------------------------------------------------- + # Organizations + Account — MUST be explicit deny + # + # These are account-level services with no resource tagging. + # ---------------------------------------------------------------- { Sid = "DenyOrgAndAccount" Effect = "Deny" @@ -429,6 +446,12 @@ resource "aws_iam_role_policy" "ci_team_deny" { ] Resource = "*" }, + # ---------------------------------------------------------------- + # IAM user creation — MUST be explicit deny + # + # IAM users are global resources, not taggable per-team. 
+ # Prevents creation of long-lived credentials. + # ---------------------------------------------------------------- { Sid = "DenyDangerousIAM" Effect = "Deny" @@ -439,20 +462,58 @@ resource "aws_iam_role_policy" "ci_team_deny" { ] Resource = "*" }, + # ---------------------------------------------------------------- + # Platform SNS topics — scoped to platform ARNs (ABAC-friendly) + # + # SNS supports full tag-based ABAC. Teams CAN create their own + # topics (gated by aws:RequestTag/team in the allow policy). + # This deny only protects platform-owned alert topics by ARN. + # ---------------------------------------------------------------- { Sid = "DenyPlatformSNS" Effect = "Deny" - Action = [ - "sns:CreateTopic", - "sns:DeleteTopic", + Action = ["sns:DeleteTopic", "sns:SetTopicAttributes", "sns:Subscribe", "sns:Unsubscribe"] + Resource = [ + "arn:aws:sns:${var.region}:${var.aws_account_id}:${var.project}-alerts", + "arn:aws:sns:${var.region}:${var.aws_account_id}:${var.project}-security", + "arn:aws:sns:${var.region}:${var.aws_account_id}:${var.project}-budget-enforcement", ] - Resource = "*" }, - { - Sid = "DenyStateBucketDeletion" - Effect = "Deny" - Action = "s3:DeleteBucket" - Resource = "*" + # ---------------------------------------------------------------- + # Platform S3 buckets — scoped to platform ARNs (ABAC-friendly) + # + # S3 supports full tag-based ABAC. Teams CAN create/delete their + # own buckets (gated by aws:RequestTag/team). This protects only + # the Terraform state and CI artifact buckets. 
+ # ---------------------------------------------------------------- + { + Sid = "DenyPlatformS3" + Effect = "Deny" + Action = ["s3:DeleteBucket", "s3:PutBucketPolicy", "s3:DeleteBucketPolicy"] + Resource = [ + "arn:aws:s3:::${var.project}-terraform-state-${var.aws_account_id}", + "arn:aws:s3:::${var.project}-ci-plan-artifacts-${var.aws_account_id}", + ] + }, + # ---------------------------------------------------------------- + # Platform ECS cluster + ALB — scoped to platform ARNs (ABAC-friendly) + # + # ECS and ELBv2 support tag-based ABAC. Teams CAN manage their + # own services (gated by aws:ResourceTag/team). This protects + # only the shared platform cluster and load balancer. + # ---------------------------------------------------------------- + { + Sid = "DenyPlatformCompute" + Effect = "Deny" + Action = [ + "ecs:DeleteCluster", "ecs:UpdateCluster", + "elasticloadbalancingv2:DeleteLoadBalancer", + "elasticloadbalancingv2:ModifyLoadBalancerAttributes", + ] + Resource = [ + "arn:aws:ecs:${var.region}:${var.aws_account_id}:cluster/${var.project}-platform", + "arn:aws:elasticloadbalancingv2:${var.region}:${var.aws_account_id}:loadbalancer/app/${var.project}-*", + ] }, ] }) @@ -521,7 +582,10 @@ resource "aws_iam_role_policy" "ci_deploy_ecr" { "ecr:CreateRepository", "ecr:DescribeRepositories", ] - # Team can push to any repo tagged with their team + # ECR push actions don't support tag-based authorization. + # Blast radius is contained: ci-broker limits which team role you get, + # and each team can only push to repos they've registered. + # Wildcard is required because app ECR repos use plain names (not project-prefixed). 
Resource = "arn:aws:ecr:${var.region}:${var.aws_account_id}:repository/*" } ] diff --git a/terraform/platform/lambdas/main.tf b/terraform/platform/lambdas/main.tf index 973497c..8af8d8a 100644 --- a/terraform/platform/lambdas/main.tf +++ b/terraform/platform/lambdas/main.tf @@ -1,6 +1,7 @@ ################################################################################ # Lambda Functions — slack-alert, cost-report, daily-cost-check, -# compliance-reporter, override-cleanup, team-provisioner +# compliance-reporter, resource-tagger, override-cleanup, +# team-provisioner ################################################################################ locals { @@ -100,6 +101,13 @@ data "archive_file" "compliance_reporter" { source_dir = "${local.lambda_src_path}/compliance_reporter" } +data "archive_file" "resource_tagger" { + type = "zip" + output_path = "${path.module}/builds/resource_tagger.zip" + output_file_mode = "0644" + source_dir = "${local.lambda_src_path}/resource_tagger" +} + data "archive_file" "override_cleanup" { type = "zip" output_path = "${path.module}/builds/override_cleanup.zip" @@ -147,6 +155,29 @@ data "archive_file" "password_set" { } } +data "archive_file" "budget_enforcer" { + type = "zip" + output_path = "${path.module}/builds/budget_enforcer.zip" + output_file_mode = "0644" + + source { + content = file("${local.lambda_src_path}/budget_enforcer/handler.py") + filename = "handler.py" + } + source { + content = file("${local.lambda_src_path}/shared/__init__.py") + filename = "shared/__init__.py" + } + source { + content = file("${local.lambda_src_path}/shared/slack.py") + filename = "shared/slack.py" + } + source { + content = file("${local.lambda_src_path}/shared/constants.py") + filename = "shared/constants.py" + } +} + ################################################################################ # IAM Roles ################################################################################ @@ -382,6 +413,46 @@ resource 
# Inline policy for the resource-tagger Lambda.
#
# The Resource Groups Tagging API is a facade: besides tag:TagResources the
# caller must also hold the tagging permission of the service that owns the
# resource (e.g. ec2:CreateTags for an EC2 instance). Without those, the
# TagResources call returns HTTP 200 with every ARN listed in
# FailedResourcesMap and nothing is actually tagged.
resource "aws_iam_role_policy" "resource_tagger" {
  name = "${var.project}-resource-tagger"
  role = aws_iam_role.resource_tagger.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "TagResources"
        Effect = "Allow"
        Action = [
          "tag:TagResources",
          "tag:GetResources",
        ]
        # Resource Groups Tagging API requires resource "*"
        Resource = "*"
      },
      {
        # Service-level tagging permissions for the resource types the
        # platform provisions. Extend this list when new services are
        # onboarded; unsupported services simply fail per-resource.
        Sid    = "ServiceTagging"
        Effect = "Allow"
        Action = [
          "dynamodb:TagResource",
          "ec2:CreateTags",
          "ecr:TagResource",
          "ecs:TagResource",
          "elasticloadbalancing:AddTags",
          "lambda:TagResource",
          "logs:TagResource",
          "rds:AddTagsToResource",
          "s3:GetBucketTagging",
          "s3:PutBucketTagging",
          "secretsmanager:TagResource",
          "sns:TagResource",
          "sqs:TagQueue",
        ]
        # Newly created resources have unpredictable ARNs, so this cannot
        # be scoped tighter than "*".
        Resource = "*"
      },
    ]
  })
}
"/javabin/slack/platform-resource-alerts-webhook" - GOOGLE_SA_PARAM = "/javabin/platform/google-admin-sa" - GOOGLE_ADMIN_EMAIL_PARAM = "/javabin/platform/google-admin-email" - GITHUB_APP_ID_PARAM = "/javabin/platform/github-app-id" - GITHUB_APP_KEY_PARAM = "/javabin/platform/github-app-key" - ACCOUNT_ID = var.aws_account_id - GITHUB_ORG = "javaBin" - ALERTS_TOPIC_ARN = var.alerts_topic_arn - COGNITO_INTERNAL_POOL_ID = var.internal_user_pool_id - IDENTITY_STORE_ID = var.identity_store_id - SSO_INSTANCE_ARN = var.sso_instance_arn - PROJECT = var.project - SIGNING_KEY_PARAM = "/javabin/platform/password-token-signing-key" - PASSWORD_SET_URL_PARAM = "/javabin/platform/password-set-function-url" + INFRA_WEBHOOK_PARAM = "/javabin/slack/platform-resource-alerts-webhook" + GOOGLE_SA_PARAM = "/javabin/platform/google-admin-sa" + GOOGLE_ADMIN_EMAIL_PARAM = "/javabin/platform/google-admin-email" + GITHUB_APP_ID_PARAM = "/javabin/platform/github-app-id" + GITHUB_APP_KEY_PARAM = "/javabin/platform/github-app-key" + ACCOUNT_ID = var.aws_account_id + GITHUB_ORG = "javaBin" + ALERTS_TOPIC_ARN = var.alerts_topic_arn + COGNITO_INTERNAL_POOL_ID = var.internal_user_pool_id + IDENTITY_STORE_ID = var.identity_store_id + SSO_INSTANCE_ARN = var.sso_instance_arn + PROJECT = var.project + SIGNING_KEY_PARAM = "/javabin/platform/password-token-signing-key" + PASSWORD_SET_URL_PARAM = "/javabin/platform/password-set-function-url" + BUDGET_ENFORCEMENT_TOPIC_ARN = aws_sns_topic.budget_enforcement.arn } } } @@ -821,6 +910,135 @@ resource "aws_route53_record" "password_set" { } } +################################################################################ +# Budget Enforcer — scales ECS services to zero on 200% budget breach +################################################################################ + +# Dedicated SNS topic for budget enforcement (200% threshold notifications) +resource "aws_sns_topic" "budget_enforcement" { + name = "${var.project}-budget-enforcement" +} + +# Allow AWS 
Budgets to publish to this topic +resource "aws_sns_topic_policy" "budget_enforcement" { + arn = aws_sns_topic.budget_enforcement.arn + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AllowBudgetsPublish" + Effect = "Allow" + Principal = { Service = "budgets.amazonaws.com" } + Action = "SNS:Publish" + Resource = aws_sns_topic.budget_enforcement.arn + Condition = { + StringEquals = { + "aws:SourceAccount" = var.aws_account_id + } + } + }, + ] + }) +} + +# --- budget-enforcer role --- +resource "aws_iam_role" "budget_enforcer" { + name = "${var.project}-budget-enforcer" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { Service = "lambda.amazonaws.com" } + Action = "sts:AssumeRole" + }] + }) +} + +resource "aws_iam_role_policy" "budget_enforcer" { + name = "${var.project}-budget-enforcer" + role = aws_iam_role.budget_enforcer.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "SSMRead" + Effect = "Allow" + Action = "ssm:GetParameter" + Resource = "arn:aws:ssm:${var.region}:${var.aws_account_id}:parameter/javabin/slack/*" + }, + { + Sid = "ECSListAndDescribe" + Effect = "Allow" + Action = [ + "ecs:ListServices", + "ecs:DescribeServices", + "ecs:ListTagsForResource", + ] + Resource = "*" + Condition = { + StringEquals = { + "ecs:cluster" = "arn:aws:ecs:${var.region}:${var.aws_account_id}:cluster/${var.project}-platform" + } + } + }, + { + Sid = "ECSListServicesUnconstrained" + Effect = "Allow" + Action = "ecs:ListServices" + # ListServices requires * or cluster ARN — condition key not supported for list calls + Resource = "arn:aws:ecs:${var.region}:${var.aws_account_id}:cluster/${var.project}-platform" + }, + { + Sid = "ECSUpdateService" + Effect = "Allow" + Action = "ecs:UpdateService" + Resource = "arn:aws:ecs:${var.region}:${var.aws_account_id}:service/${var.project}-platform/*" + }, + ] + }) +} + +resource "aws_iam_role_policy_attachment" 
"budget_enforcer_logs" { + role = aws_iam_role.budget_enforcer.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" +} + +resource "aws_lambda_function" "budget_enforcer" { + function_name = "${var.project}-budget-enforcer" + role = aws_iam_role.budget_enforcer.arn + handler = "handler.handler" + runtime = "python3.12" + timeout = 60 + memory_size = 128 + filename = data.archive_file.budget_enforcer.output_path + source_code_hash = data.archive_file.budget_enforcer.output_base64sha256 + + environment { + variables = { + ECS_CLUSTER = "${var.project}-platform" + COST_WEBHOOK_PARAM = "/javabin/slack/platform-cost-alerts-webhook" + BUDGET_NAME_PREFIX = "${var.project}-team-" + } + } +} + +resource "aws_sns_topic_subscription" "budget_enforcer" { + topic_arn = aws_sns_topic.budget_enforcement.arn + protocol = "lambda" + endpoint = aws_lambda_function.budget_enforcer.arn +} + +resource "aws_lambda_permission" "budget_enforcer_sns" { + statement_id = "AllowSNSBudgetEnforcement" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.budget_enforcer.function_name + principal = "sns.amazonaws.com" + source_arn = aws_sns_topic.budget_enforcement.arn +} + ################################################################################ # SNS Subscriptions — slack-alert subscribes to both topics ################################################################################ @@ -933,6 +1151,40 @@ resource "aws_lambda_permission" "compliance_reporter_eventbridge" { source_arn = aws_cloudwatch_event_rule.compliance_reporter_trigger.arn } +# --- resource-tagger: EventBridge — tag newly created resources --- +resource "aws_cloudwatch_event_rule" "resource_tagger_trigger" { + name = "${var.project}-resource-tagger-trigger" + description = "Tag newly created resources with creator attribution" + + # Match ALL AWS services — resource_tagger has generic ARN extraction + # via ResourceGroupsTaggingAPI, so new services are auto-covered. 
+ # Filtered to Create/Run events only, keeping volume manageable. + event_pattern = jsonencode({ + source = [{ "prefix" : "aws." }] + detail-type = ["AWS API Call via CloudTrail"] + detail = { + eventName = [ + { "prefix" : "Create" }, + { "prefix" : "Run" }, + ] + } + }) +} + +resource "aws_cloudwatch_event_target" "resource_tagger" { + rule = aws_cloudwatch_event_rule.resource_tagger_trigger.name + target_id = "invoke-resource-tagger" + arn = aws_lambda_function.resource_tagger.arn +} + +resource "aws_lambda_permission" "resource_tagger_eventbridge" { + statement_id = "AllowEventBridge" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.resource_tagger.function_name + principal = "events.amazonaws.com" + source_arn = aws_cloudwatch_event_rule.resource_tagger_trigger.arn +} + # --- override-cleanup: hourly --- resource "aws_cloudwatch_event_rule" "override_cleanup_schedule" { name = "${var.project}-override-cleanup-schedule" diff --git a/terraform/platform/lambdas/outputs.tf b/terraform/platform/lambdas/outputs.tf index 1c969b9..58273aa 100644 --- a/terraform/platform/lambdas/outputs.tf +++ b/terraform/platform/lambdas/outputs.tf @@ -18,6 +18,11 @@ output "compliance_reporter_function_arn" { value = aws_lambda_function.compliance_reporter.arn } +output "resource_tagger_function_arn" { + description = "ARN of the resource-tagger Lambda function" + value = aws_lambda_function.resource_tagger.arn +} + output "override_cleanup_function_arn" { description = "ARN of the override-cleanup Lambda function" value = aws_lambda_function.override_cleanup.arn @@ -52,3 +57,13 @@ output "ci_broker_function_arn" { description = "ARN of the CI broker Lambda function" value = aws_lambda_function.ci_broker.arn } + +output "budget_enforcer_function_arn" { + description = "ARN of the budget-enforcer Lambda function" + value = aws_lambda_function.budget_enforcer.arn +} + +output "budget_enforcement_topic_arn" { + description = "ARN of the budget-enforcement SNS 
topic" + value = aws_sns_topic.budget_enforcement.arn +} diff --git a/terraform/platform/monitoring/main.tf b/terraform/platform/monitoring/main.tf index e041895..40e80ab 100644 --- a/terraform/platform/monitoring/main.tf +++ b/terraform/platform/monitoring/main.tf @@ -158,6 +158,11 @@ resource "aws_cloudwatch_event_rule" "resource_creation" { name = "${var.project}-resource-creation" description = "Detect creation of significant AWS resources" + # Intentionally scoped to major compute/storage services — NOT a wildcard. + # A {"prefix": "aws."} match would fire ~10,000 events/hour (CreateLogStream, + # CreateNetworkInterface, etc.) overwhelming SNS and making alerts unusable. + # The resource-tagger Lambda uses a wildcard because it has generic fallback; + # these security alerts need a curated list for signal-to-noise. event_pattern = jsonencode({ source = [ "aws.s3", "aws.ec2", "aws.rds", "aws.ecs", @@ -188,6 +193,7 @@ resource "aws_cloudwatch_event_rule" "resource_modification" { name = "${var.project}-resource-modification" description = "Detect modifications and deletions of significant AWS resources" + # Same rationale as resource_creation — curated list for alert signal quality. 
event_pattern = jsonencode({ source = [ "aws.s3", "aws.ec2", "aws.rds", "aws.ecs", @@ -490,3 +496,13 @@ resource "aws_dynamodb_table" "alert_dedup" { Name = "${var.project}-alert-dedup" } } + +################################################################################ +# Cost Allocation Tags +################################################################################ + +resource "aws_ce_cost_allocation_tag" "tags" { + for_each = toset(["team", "service", "repo", "environment", "managed-by", "created-by", "commit"]) + tag_key = each.key + status = "Active" +} diff --git a/terraform/platform/providers.tf b/terraform/platform/providers.tf index a52ce82..b6c6198 100644 --- a/terraform/platform/providers.tf +++ b/terraform/platform/providers.tf @@ -11,10 +11,11 @@ terraform { locals { required_tags = { - project = var.project team = "javabin" - managed-by = "terraform" + service = "platform" + repo = "javaBin/platform" environment = var.environment + managed-by = "terraform" } } diff --git a/terraform/state/providers.tf b/terraform/state/providers.tf index de3b1cf..08c39e6 100644 --- a/terraform/state/providers.tf +++ b/terraform/state/providers.tf @@ -11,10 +11,11 @@ terraform { locals { required_tags = { - project = var.project team = "javabin" - managed-by = "terraform" + service = "state" + repo = "javaBin/platform" environment = var.environment + managed-by = "terraform" } } From 83230af85c03deb520e916ba6d719e35b1ed0755 Mon Sep 17 00:00:00 2001 From: Alexander Amiri Date: Tue, 17 Mar 2026 20:14:26 +0100 Subject: [PATCH 2/2] Update docs for new Lambdas, RDS module, tag schema, ABAC changes - CLAUDE.md: add budget-enforcer, resource-tagger, ci-broker to Lambda table and alert routing diagram. Update function count to 11. - lambda-functions.md: add budget-enforcer and resource-tagger sections, update team-provisioner from stub to working status. 
- reusable-modules.md: add service-rds module, document trusted_services on service-role, add ECS tag propagation note. - platform-modules.md: update Lambda count, add cost allocation tags. --- CLAUDE.md | 9 +++++++-- docs/lambda-functions.md | 36 +++++++++++++++++++++++++++++++++--- docs/platform-modules.md | 3 ++- docs/reusable-modules.md | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 6 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index cc54f6e..f4c3de8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -116,7 +116,7 @@ terraform/platform/ iam/ GitHub OIDC, CI roles, permission boundary compute/ ECS cluster, ECR base config monitoring/ SNS, EventBridge, Config, GuardDuty, Security Hub - lambdas/ slack-alert, cost-report, daily-cost-check, compliance-reporter, override-cleanup, team-provisioner, apply-gate, securityhub-summary, password-set + lambdas/ slack-alert, cost-report, daily-cost-check, compliance-reporter, resource-tagger, budget-enforcer, override-cleanup, team-provisioner, apply-gate, securityhub-summary, password-set, ci-broker identity/ Cognito user pools (internal + external). 
Identity Center is in terraform/org/ ``` @@ -186,6 +186,9 @@ terraform/state/ | `team-provisioner` | Syncs Google Groups, GitHub teams, AWS Budgets from registry team YAML | | `securityhub-summary` | Weekly Security Hub findings summary (Monday 08:00 UTC) | | `password-set` | Self-service password-set for new hero accounts (Function URL) | +| `budget-enforcer` | Scales ECS services to zero when team exceeds 200% budget | +| `resource-tagger` | EventBridge-triggered, auto-tags created-by + commit on new resources | +| `ci-broker` | Validates team membership, vends short-lived team role credentials | ### Scripts | Script | What | @@ -228,6 +231,8 @@ Scheduled: EventBridge (Create/Run) ──► compliance-reporter (report to Slack, no auto-fix) Hourly ──► override-cleanup (delete stale SSM override tokens) Registry merge ──► team-provisioner (Google/GitHub/Budget/Cognito/Identity Center sync + hero provisioning) +AWS Budgets (200%) ──► budget-enforcer Lambda ──► ECS scale-to-zero + #javabin-cost-alerts +EventBridge (Create/Run) ──► resource-tagger Lambda ──► Tag created-by + commit ``` ## SSM Parameters @@ -274,7 +279,7 @@ The SA JSON key is at `/javabin/platform/google-admin-sa`, the impersonation tar | 2c | IAM / OIDC | **Deployed** — 6 CI roles (infra, infra-plan, per-app, deploy, override-approver, registry) | | 2d | Compute | **Deployed** — ECS cluster + ECR repos | | 2e | Monitoring | **Deployed** — GuardDuty, Security Hub, Config, SNS | -| 2f | Lambda Functions | **Deployed** — 8 functions (Google/GitHub/Budget/Cognito/Identity Center sync live) | +| 2f | Lambda Functions | **Deployed** — 11 functions (budget-enforcer, resource-tagger, ci-broker added; Google/GitHub/Budget/Cognito/Identity Center sync live) | | 2g | Platform CI | **Done** — plan → LLM review → apply pipeline working | | 3a | Reusable Terraform Modules | **Code done** — 12 modules in repo | | 3b | GitHub Actions Workflows | **Code done** — 14 reusable workflows | diff --git a/docs/lambda-functions.md 
b/docs/lambda-functions.md index cd92517..4382d80 100644 --- a/docs/lambda-functions.md +++ b/docs/lambda-functions.md @@ -70,10 +70,40 @@ Queries Security Hub for active findings at HIGH and CRITICAL severity, aggregat ## team-provisioner -**Trigger:** (Future) Registry repo merge events -**Purpose:** Syncs team definitions across Google Workspace, GitHub, Cognito, and IAM. +**Trigger:** Registry repo merge events (via `provision-app.yml` workflow dispatch) +**Purpose:** Syncs team definitions from registry YAML across Google Groups, GitHub teams, AWS Budgets (80% warning + 200% enforcement thresholds), Cognito groups, and Identity Center groups. Also handles hero account provisioning. -**Status:** Stub only — logs event and returns success. Blocked on Google Admin access. +| SSM Parameter | Purpose | +|---------------|---------| +| `/javabin/platform/google-admin-sa` | GCP service account JSON key (domain-wide delegation) | +| `/javabin/platform/google-admin-email` | Admin email for Google Admin SDK impersonation | +| `/javabin/platform/github-app-id` | GitHub App ID for team management | +| `/javabin/platform/github-app-key` | GitHub App private key | +| `/javabin/platform/github-app-client-secret` | GitHub App client secret | + +## budget-enforcer + +**Trigger:** SNS notification from AWS Budgets (200% threshold) +**Purpose:** Scales a team's ECS services to `desired_count=0` when spending exceeds 200% of their monthly budget. Does NOT destroy resources — services can be scaled back up after resolution. + +**Flow:** Parse budget name (`javabin-team-{team}`) → list ECS services tagged with team → scale to zero → post Slack alert. 
+ +| SSM Parameter | Channel | +|---------------|---------| +| `/javabin/slack/platform-cost-alerts-webhook` | #javabin-cost-alerts | + +**Environment vars:** `ECS_CLUSTER` (default: `javabin-platform`) + +## resource-tagger + +**Trigger:** EventBridge rule matching all AWS service creation events (`{"prefix": "aws."}` source, `Create*`/`Run*` event names) +**Purpose:** Auto-tags newly created AWS resources with `created-by` (actor) and `commit` (SHA) parsed from the CloudTrail session name. Tags are set via AWS Resource Groups Tagging API, outside Terraform management — no drift or plan noise. + +**Session name format:** `{actor}-{sha8}-{run_id}` (enriched in CI workflows) + +Idempotent: skips resources that already have a `created-by` tag (preserves original creator). + +**Environment vars:** `AWS_ACCOUNT_ID` ## Shared Module: pricing diff --git a/docs/platform-modules.md b/docs/platform-modules.md index 1cc09cf..e168b62 100644 --- a/docs/platform-modules.md +++ b/docs/platform-modules.md @@ -79,10 +79,11 @@ SNS topics, EventBridge rules, Config, GuardDuty, Security Hub. | GuardDuty | Threat detection | | Security Hub | Findings aggregation | | `javabin-alert-dedup` DynamoDB | Deduplication table used by slack-alert Lambda | +| Cost allocation tags | `aws_ce_cost_allocation_tag` resources activating 7 tags: team, service, repo, environment, managed-by, created-by, commit | ## lambdas -8 Lambda functions — see [lambda-functions.md](lambda-functions.md) for details. +11 Lambda functions — see [lambda-functions.md](lambda-functions.md) for details. ## identity diff --git a/docs/reusable-modules.md b/docs/reusable-modules.md index e1ed291..dacb9a6 100644 --- a/docs/reusable-modules.md +++ b/docs/reusable-modules.md @@ -53,6 +53,8 @@ additional_policy_jsons = { } ``` +**`trusted_services`** — controls which AWS service can assume the role. Default: `["ecs-tasks.amazonaws.com"]`. 
It can be set to `["ec2.amazonaws.com"]` or `["lambda.amazonaws.com"]` via `compute.trusted_service` in app.yaml. This enables cross-compute roles so EC2 instances and Lambda functions get the same auto-wired access policies. + **Outputs:** `role_arn`, `role_name`, `role_id` ## ecs-service @@ -61,6 +63,8 @@ ECS Fargate task definition + service + CloudWatch log group. Supports `environment` (map) and `secrets` (map of name => ARN) for container configuration. +**Tag propagation:** `enable_ecs_managed_tags = true` and `propagate_tags = "SERVICE"` ensure Fargate task-level compute costs are attributed to the team via Cost Explorer. + **Outputs:** `service_name`, `task_definition_arn`, `log_group_name` ## service-bucket @@ -91,6 +95,41 @@ SQS queue + dead-letter queue with configurable retention and visibility timeout **Naming:** `{project}-{name}` (queue), `{project}-{name}-dlq` (DLQ) **Outputs:** `queue_url`, `queue_arn`, `dlq_url`, `dlq_arn`, `access_policy_json` +## service-rds + +RDS PostgreSQL instance in private subnets. + +**Inputs:** + +| Input | Default | +|-------|---------| +| `name` | required | +| `engine_version` | `"16"` | +| `instance_class` | `db.t3.micro` | +| `allocated_storage` | 20 GB | +| `subnet_ids` | required | +| `vpc_id` | required | +| `allowed_security_group_ids` | required | +| `backup_retention_period` | 7 days | +| `multi_az` | false | +| `deletion_protection` | true | + +**Password:** Managed by AWS via `manage_master_user_password = true` (Secrets Manager). + +**Outputs:** `endpoint`, `port`, `db_name`, `access_policy_json`, `security_group_id` + +**Auto-wiring:** `access_policy_json` grants `rds-db:connect` + `secretsmanager:GetSecretValue`. It is auto-attached to the task role via `collect:access_policy_json`. 
+ +**app.yaml:** +```yaml +databases: + - name: main + engine: postgres + instance_class: db.t3.micro + allocated_storage: 20 + engine_version: "16" +``` + ## service-alarm CloudWatch alarms for ECS services: CPU high, memory high, unhealthy targets, 5xx errors.