From 9a63b31e7e7d70b8e60fcc2b61ce7a6a3326ee3a Mon Sep 17 00:00:00 2001 From: Alexander Amiri Date: Thu, 26 Mar 2026 22:43:11 +0100 Subject: [PATCH] Add CUR 2.0 cost analytics, Athena querying, and billing protection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New cost-analytics module: CUR 2.0 export (daily Parquet with resource IDs), Glue database + crawler, Athena workgroup, CloudWatch billing alarm ($200), account-level budget with 80%/100% notifications ($500) - Shared athena.py utility for synchronous Athena queries with graceful degradation - Weekly cost report: resource-level drilldown (top 10 resources), enriched LLM narrative - Daily spike check: per-spike resource breakdown (top 5 resources per spiking service) - Lambda IAM policies updated for Athena/Glue/S3 access - Lambda timeouts increased (cost-report 60→120s, daily-check 60→90s) --- terraform/lambda-src/cost_report/handler.py | 125 ++++++- .../lambda-src/daily_cost_check/handler.py | 66 ++++ terraform/lambda-src/shared/athena.py | 93 +++++ terraform/platform/cost-analytics/main.tf | 343 ++++++++++++++++++ terraform/platform/cost-analytics/outputs.tf | 24 ++ .../platform/cost-analytics/variables.tf | 31 ++ terraform/platform/lambdas/main.tf | 104 +++++- terraform/platform/lambdas/variables.tf | 27 ++ terraform/platform/main.tf | 27 +- 9 files changed, 833 insertions(+), 7 deletions(-) create mode 100644 terraform/lambda-src/shared/athena.py create mode 100644 terraform/platform/cost-analytics/main.tf create mode 100644 terraform/platform/cost-analytics/outputs.tf create mode 100644 terraform/platform/cost-analytics/variables.tf diff --git a/terraform/lambda-src/cost_report/handler.py b/terraform/lambda-src/cost_report/handler.py index 8b59cee..632f56f 100644 --- a/terraform/lambda-src/cost_report/handler.py +++ b/terraform/lambda-src/cost_report/handler.py @@ -7,6 +7,7 @@ import boto3 from botocore.config import Config +from shared.athena import 
run_query from shared.constants import USD_TO_NOK, risk_emoji from shared.slack import get_webhook_url, post_to_slack @@ -16,6 +17,9 @@ ssm = boto3.client("ssm") COST_WEBHOOK_PARAM = os.environ["COST_WEBHOOK_PARAM"] +CUR_DATABASE = os.environ.get("CUR_DATABASE", "") +CUR_TABLE = os.environ.get("CUR_TABLE", "") +ATHENA_WORKGROUP = os.environ.get("ATHENA_WORKGROUP", "") # --------------------------------------------------------------------------- # Service categorisation — pattern-based, not hardcoded lists @@ -315,7 +319,7 @@ def _llm_structured(prompt, tool_config): def generate_narrative(this_week, prev_week, mtd, tw_total, pw_total, - mtd_total, projected, month_name): + mtd_total, projected, month_name, resource_summary=""): """Generate executive summary. Returns dict with summary/notable or None.""" top_services = sorted(this_week.items(), key=lambda x: x[1], reverse=True)[:8] svc_summary = ", ".join(f"{s}: ${c:.2f}" for s, c in top_services) @@ -341,8 +345,9 @@ def generate_narrative(this_week, prev_week, mtd, tw_total, pw_total, Top services: {svc_summary} Biggest movers WoW: {mover_lines or 'no significant changes'} +{resource_summary} -Be specific about which services drove changes. Set notable=true only if there is a meaningful change worth highlighting, false if costs are stable week-over-week.""" +Be specific about which services and resources drove changes. 
# ---------------------------------------------------------------------------
# CUR resource-level drilldown via Athena
# ---------------------------------------------------------------------------
def _friendly_resource_id(resource_id):
    """Shorten an ARN or resource path to a readable display name.

    Rules, applied in order:
      * empty / None        -> "(no resource ID)"
      * contains ":::"      -> everything after the last ":::" (S3 ARNs)
      * contains "/"        -> the last path segment, or the last two segments
                               when the ID splits into more than three parts
      * contains ":"        -> everything after the last ":"
      * anything else       -> returned unchanged
    """
    if not resource_id:
        return "(no resource ID)"
    # S3 ARNs leave both region and account empty: arn:aws:s3:::bucket-name.
    # Test for the full ":::" separator that is split on below. A bare "::"
    # also appears in ARNs that only omit the region (IAM, CloudFront, ...);
    # those must fall through to the "/" and ":" rules instead of being
    # returned unshortened.
    if ":::" in resource_id:
        return resource_id.rsplit(":::", 1)[-1]
    if "/" in resource_id:
        # ECS/Lambda/etc: ...service/cluster/name -> name
        parts = resource_id.split("/")
        return parts[-1] if len(parts) <= 3 else "/".join(parts[-2:])
    if ":" in resource_id:
        return resource_id.rsplit(":", 1)[-1]
    return resource_id


def _partition_filter(start, end):
    """Return a SQL predicate matching every year/month partition in a range.

    Args:
        start: First day of the range; must expose ``.year`` and ``.month``.
        end: Last day of the range (inclusive); same requirements.

    Returns:
        A parenthesised ``OR`` of ``(year = 'YYYY' AND month = 'MM')`` terms,
        one per calendar month from ``start`` to ``end``. The month of
        ``start`` is always included, even if ``end`` precedes it.
    """
    clauses = []
    year, month = start.year, start.month
    while True:
        clauses.append(f"(year = '{year}' AND month = '{month:02d}')")
        if (year, month) >= (end.year, end.month):
            break
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)
    return "(" + " OR ".join(clauses) + ")"


def get_resource_drilldown(week_start, week_end):
    """Query CUR via Athena for the top 10 resources by cost in a week.

    Args:
        week_start: First day of the reporting week.
        week_end: Last day of the reporting week (inclusive).
            Both are rendered with ``str()`` into TIMESTAMP literals and
            ``week_end`` has a ``timedelta`` added to it — presumably
            ``datetime.date`` values; confirm against the caller.

    Returns:
        ``{"top_resources": [row, ...]}`` where each row is a dict of column
        name to string value, or ``None`` when CUR/Athena is not configured or
        the query produced no rows (``run_query`` returns an empty list on any
        failure).
    """
    if not (CUR_DATABASE and CUR_TABLE and ATHENA_WORKGROUP):
        return None

    # Exclusive upper bound so usage recorded on week_end itself is included.
    range_end = week_end + timedelta(days=1)

    # A reporting week can straddle a month boundary, so the partition filter
    # has to cover the months of both week_start and week_end; filtering on
    # week_start's month alone silently drops the tail of such a week.
    # NOTE(review): assumes the crawled table is partitioned by string columns
    # `year` and zero-padded `month`. CUR 2.0 Data Exports write data under
    # BILLING_PERIOD=YYYY-MM, so the crawler may expose `billing_period`
    # instead — confirm against the actual Glue table schema.
    partitions = _partition_filter(week_start, week_end)

    # Top 10 resources overall
    top_query = f"""
        SELECT line_item_resource_id,
               line_item_product_code,
               COALESCE(resource_tags_user_team, '') as team,
               SUM(CAST(line_item_unblended_cost AS double)) as total_cost
        FROM "{CUR_DATABASE}"."{CUR_TABLE}"
        WHERE {partitions}
          AND line_item_usage_start_date >= TIMESTAMP '{week_start}'
          AND line_item_usage_start_date < TIMESTAMP '{range_end}'
          AND line_item_resource_id != ''
          AND line_item_line_item_type = 'Usage'
        GROUP BY line_item_resource_id, line_item_product_code,
                 COALESCE(resource_tags_user_team, '')
        HAVING SUM(CAST(line_item_unblended_cost AS double)) >= 0.01
        ORDER BY total_cost DESC
        LIMIT 10
    """

    top_resources = run_query(CUR_DATABASE, top_query, ATHENA_WORKGROUP)
    if not top_resources:
        return None

    return {"top_resources": top_resources}


def build_resource_drilldown_blocks(drilldown):
    """Build Block Kit blocks for the resource-level cost drilldown.

    Args:
        drilldown: Dict as returned by ``get_resource_drilldown``; only the
            ``"top_resources"`` list is read.

    Returns:
        A list with a heading ``section`` block followed by one ``table``
        block (header row plus one row per resource), or ``[]`` when there
        are no resources to show.
    """
    top = drilldown.get("top_resources", [])
    if not top:
        return []

    header = [
        {"type": "raw_text", "text": "Resource"},
        {"type": "raw_text", "text": "Service"},
        {"type": "raw_text", "text": "Team"},
        {"type": "raw_text", "text": "Cost"},
    ]
    rows = [header]

    for item in top:
        # Athena returns every value as a string, and "" for NULL.
        cost = float(item.get("total_cost") or 0)
        nok = cost * USD_TO_NOK
        rows.append([
            {"type": "raw_text", "text": _friendly_resource_id(item.get("line_item_resource_id", ""))},
            {"type": "raw_text", "text": item.get("line_item_product_code") or "(unknown)"},
            # The query coalesces a missing team tag to '', so the key is
            # always present; `or` is what makes the placeholder reachable.
            {"type": "raw_text", "text": item.get("team") or "(untagged)"},
            {"type": "raw_text", "text": f"${cost:.2f} (~{nok:.0f} NOK)"},
        ])

    return [
        {
            "type": "section",
            "text": {"type": "mrkdwn", "text": ":mag: *Resource-Level Drilldown*"},
        },
        {
            "type": "table",
            "column_settings": [
                {"is_wrapped": True},
                {},
                {},
                {"align": "right"},
            ],
            "rows": rows,
        },
    ]
# Minimum daily spend (USD) to qualify as a spike — filters noise on tiny amounts
MIN_SPIKE_AMOUNT = float(os.environ.get("MIN_SPIKE_AMOUNT", "1.00"))
# CUR / Athena settings. If any of the three is empty the resource drilldown
# is skipped entirely (see get_spike_resources).
CUR_DATABASE = os.environ.get("CUR_DATABASE", "")
CUR_TABLE = os.environ.get("CUR_TABLE", "")
ATHENA_WORKGROUP = os.environ.get("ATHENA_WORKGROUP", "")


# ---------------------------------------------------------------------------
# CUR resource drilldown for spiking services
# ---------------------------------------------------------------------------
def _sql_string_literal(value):
    """Render *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled, which is the standard SQL escape, so
    the value cannot terminate the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


def get_spike_resources(day, service):
    """Query CUR for the top 5 resources of a spiking service on one day.

    Args:
        day: The day to inspect. It is rendered with ``str()`` into a
            TIMESTAMP literal and has a ``timedelta`` added to it —
            presumably a ``datetime.date``; confirm against the caller.
        service: Value compared against ``line_item_product_code``.

    Returns:
        A list of row dicts (column name to string value) ordered by cost
        descending, or ``[]`` when CUR/Athena is not configured or the query
        fails (``run_query`` degrades to an empty list).
    """
    if not (CUR_DATABASE and CUR_TABLE and ATHENA_WORKGROUP):
        return []

    year = str(day.year)
    month = f"{day.month:02d}"
    next_day = day + timedelta(days=1)

    # `service` originates outside this function, so it is escaped rather than
    # pasted between quotes: a name containing an apostrophe would otherwise
    # break (or alter) the query.
    # NOTE(review): callers appear to pass the same service name they use for
    # Cost Explorer lookups. Cost Explorer service names are display names,
    # whereas line_item_product_code holds product codes (e.g. "AmazonEC2") —
    # confirm the two actually match, otherwise this filter returns nothing.
    service_literal = _sql_string_literal(service)

    # NOTE(review): assumes `year` / zero-padded `month` partition columns;
    # confirm against the crawled Glue table schema.
    query = f"""
        SELECT line_item_resource_id,
               line_item_usage_type,
               COALESCE(resource_tags_user_team, '') as team,
               SUM(CAST(line_item_unblended_cost AS double)) as total_cost
        FROM "{CUR_DATABASE}"."{CUR_TABLE}"
        WHERE year = '{year}' AND month = '{month}'
          AND line_item_usage_start_date >= TIMESTAMP '{day}'
          AND line_item_usage_start_date < TIMESTAMP '{next_day}'
          AND line_item_product_code = {service_literal}
          AND line_item_resource_id != ''
          AND line_item_line_item_type = 'Usage'
        GROUP BY line_item_resource_id, line_item_usage_type,
                 COALESCE(resource_tags_user_team, '')
        HAVING SUM(CAST(line_item_unblended_cost AS double)) >= 0.01
        ORDER BY total_cost DESC
        LIMIT 5
    """

    return run_query(CUR_DATABASE, query, ATHENA_WORKGROUP)


def _friendly_resource_id(resource_id):
    """Shorten an ARN or resource path to a readable display name.

    Rules, applied in order:
      * empty / None        -> "(no resource ID)"
      * contains ":::"      -> everything after the last ":::" (S3 ARNs)
      * contains "/"        -> the last path segment, or the last two segments
                               when the ID splits into more than three parts
      * contains ":"        -> everything after the last ":"
      * anything else       -> returned unchanged
    """
    if not resource_id:
        return "(no resource ID)"
    if ":::" in resource_id:
        return resource_id.rsplit(":::", 1)[-1]
    if "/" in resource_id:
        parts = resource_id.split("/")
        return parts[-1] if len(parts) <= 3 else "/".join(parts[-2:])
    if ":" in resource_id:
        return resource_id.rsplit(":", 1)[-1]
    return resource_id
"""Shared Athena query utility for CUR cost analytics."""

import logging
import time

logger = logging.getLogger(__name__)


def _wait_for_query(athena, execution_id, timeout_seconds):
    """Poll a query until it finishes; return True only if it SUCCEEDED.

    On timeout the query is asked to stop (best effort) so it does not keep
    scanning data after the caller has given up.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        try:
            status = athena.get_query_execution(QueryExecutionId=execution_id)
            query_status = status["QueryExecution"]["Status"]
            state = query_status["State"]
        except Exception as e:
            logger.warning("Athena get_query_execution failed: %s", e)
            return False

        if state == "SUCCEEDED":
            return True
        if state in ("FAILED", "CANCELLED"):
            reason = query_status.get("StateChangeReason", "unknown")
            logger.warning("Athena query %s: %s", state, reason)
            return False

        # Poll once per second, but never sleep past the deadline.
        time.sleep(min(1.0, max(0.0, deadline - time.monotonic())))

    logger.warning("Athena query timed out after %ds", timeout_seconds)
    try:
        athena.stop_query_execution(QueryExecutionId=execution_id)
    except Exception as e:
        # Best effort only — the caller is already degrading to "no data".
        logger.debug("Athena stop_query_execution failed: %s", e)
    return False


def _fetch_rows(athena, execution_id):
    """Fetch all result pages of a finished query as a list of dicts.

    Returns an empty list if any page fails to load, so callers never act on
    a silently truncated result.
    """
    rows = []
    columns = None
    next_token = None

    while True:
        kwargs = {"QueryExecutionId": execution_id}
        if next_token:
            kwargs["NextToken"] = next_token

        try:
            result = athena.get_query_results(**kwargs)
        except Exception as e:
            logger.warning("Athena get_query_results failed: %s", e)
            return []

        result_set = result["ResultSet"]
        data_rows = result_set["Rows"]

        if columns is None:
            columns = [
                col["Name"] for col in result_set["ResultSetMetadata"]["ColumnInfo"]
            ]
            # First page includes the header row — skip it
            data_rows = data_rows[1:]

        for row in data_rows:
            # A cell without VarCharValue is a NULL; expose it as "".
            values = [cell.get("VarCharValue", "") for cell in row["Data"]]
            rows.append(dict(zip(columns, values)))

        next_token = result.get("NextToken")
        if not next_token:
            return rows


def run_query(database, query, workgroup, timeout_seconds=30, client=None):
    """Execute an Athena query synchronously and return rows as list of dicts.

    Args:
        database: Database name used as the query execution context.
        query: SQL text to run.
        workgroup: Athena workgroup to run the query in.
        timeout_seconds: Maximum time to wait for the query to finish.
        client: Optional Athena client to use. When omitted a new
            ``boto3.client("athena")`` is created for this call; pass one in
            to reuse a client across several queries.

    Returns:
        One dict per result row, mapping column name to the value as a string
        ("" for NULL). On any failure (start error, query FAILED/CANCELLED,
        timeout, result fetch error) a warning is logged and an empty list is
        returned so callers can gracefully degrade.
    """
    if client is None:
        # Imported lazily so callers that inject a client do not need boto3.
        import boto3

        client = boto3.client("athena")

    try:
        started = client.start_query_execution(
            QueryString=query,
            QueryExecutionContext={"Database": database},
            WorkGroup=workgroup,
        )
        execution_id = started["QueryExecutionId"]
    except Exception as e:
        logger.warning("Athena start_query_execution failed: %s", e)
        return []

    if not _wait_for_query(client, execution_id, timeout_seconds):
        return []

    return _fetch_rows(client, execution_id)
################################################################################
# Cost Analytics — CUR, Athena, Glue, billing protection
#
# Resources:
#   S3 buckets        CUR data + Athena query results
#   CUR report        Daily Parquet export with resource IDs
#   Glue database     Catalog for CUR data
#   Glue crawler      Auto-discovers CUR Parquet schema daily
#   Athena workgroup  Scoped workgroup for cost queries
#   Billing alarm     CloudWatch alarm on EstimatedCharges (~4-6h delay)
#   Account budget    Monthly budget with notifications at 80% and 100%
################################################################################

terraform {
  required_providers {
    aws = {
      source                = "hashicorp/aws"
      configuration_aliases = [aws, aws.us_east_1]
    }
  }
}

locals {
  cur_bucket_name    = "${var.project}-cur-${var.aws_account_id}"
  athena_bucket_name = "${var.project}-athena-results-${var.aws_account_id}"
  glue_database_name = "${var.project}_cur"
  crawler_role_name  = "${var.project}-cur-crawler"
  athena_workgroup   = "${var.project}-cost-analytics"
  cur_report_name    = "${var.project}-cur"
  cur_s3_prefix      = "cur"
}

################################################################################
# S3 — CUR data bucket
################################################################################

resource "aws_s3_bucket" "cur_data" {
  bucket = local.cur_bucket_name
}

resource "aws_s3_bucket_public_access_block" "cur_data" {
  bucket                  = aws_s3_bucket.cur_data.id
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

resource "aws_s3_bucket_server_side_encryption_configuration" "cur_data" {
  bucket = aws_s3_bucket.cur_data.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

resource "aws_s3_bucket_lifecycle_configuration" "cur_data" {
  bucket = aws_s3_bucket.cur_data.id

  rule {
    id     = "archive-and-expire"
    status = "Enabled"
    filter {}

    transition {
      days          = 90
      storage_class = "STANDARD_IA"
    }

    expiration {
      days = 365
    }

    # Parts of uploads that never completed are billed but invisible in
    # listings; clean them up instead of keeping them indefinitely.
    abort_incomplete_multipart_upload {
      days_after_initiation = 7
    }
  }
}

resource "aws_s3_bucket_policy" "cur_data" {
  bucket = aws_s3_bucket.cur_data.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid       = "AllowDataExportsDelivery"
        Effect    = "Allow"
        Principal = { Service = "bcm-data-exports.amazonaws.com" }
        Action = [
          "s3:PutObject",
          "s3:GetBucketPolicy",
        ]
        Resource = [
          aws_s3_bucket.cur_data.arn,
          "${aws_s3_bucket.cur_data.arn}/*",
        ]
        Condition = {
          StringEquals = {
            "aws:SourceAccount" = var.aws_account_id
          }
          # Confused-deputy hardening, as in the bucket policy AWS documents
          # for Data Exports: only exports owned by this account may write.
          # Data Exports is homed in us-east-1 regardless of the bucket region.
          StringLike = {
            "aws:SourceArn" = "arn:aws:bcm-data-exports:us-east-1:${var.aws_account_id}:export/*"
          }
        }
      },
    ]
  })
}

################################################################################
# S3 — Athena query results bucket
################################################################################

resource "aws_s3_bucket" "athena_results" {
  bucket = local.athena_bucket_name
}

resource "aws_s3_bucket_public_access_block" "athena_results" {
  bucket                  = aws_s3_bucket.athena_results.id
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

resource "aws_s3_bucket_server_side_encryption_configuration" "athena_results" {
  bucket = aws_s3_bucket.athena_results.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

resource "aws_s3_bucket_lifecycle_configuration" "athena_results" {
  bucket = aws_s3_bucket.athena_results.id

  rule {
    id     = "expire-query-results"
    status = "Enabled"
    filter {}

    # Query results are only read back immediately after the query finishes.
    expiration {
      days = 7
    }

    abort_incomplete_multipart_upload {
      days_after_initiation = 7
    }
  }
}

################################################################################
# CUR 2.0 export via Data Exports (must be in us-east-1)
################################################################################

resource "aws_bcmdataexports_export" "cur" {
  provider = aws.us_east_1

  export {
    name = local.cur_report_name

    data_query {
      # NOTE(review): confirm every column below exists as a top-level column
      # in the CUR 2.0 COST_AND_USAGE_REPORT table (tags and product attributes
      # are exposed as nested columns there); the Lambdas query
      # resource_tags_user_team by this exact name.
      query_statement = "SELECT identity_line_item_id, identity_time_interval, bill_invoice_id, bill_invoicing_entity, bill_billing_entity, bill_bill_type, bill_payer_account_id, bill_billing_period_start_date, bill_billing_period_end_date, line_item_usage_account_id, line_item_line_item_type, line_item_usage_start_date, line_item_usage_end_date, line_item_product_code, line_item_usage_type, line_item_operation, line_item_availability_zone, line_item_resource_id, line_item_usage_amount, line_item_normalization_factor, line_item_normalized_usage_amount, line_item_currency_code, line_item_unblended_rate, line_item_unblended_cost, line_item_blended_rate, line_item_blended_cost, line_item_line_item_description, product_product_name, product_region, pricing_unit, pricing_public_on_demand_cost, pricing_public_on_demand_rate, pricing_term, pricing_offering_class, resource_tags_user_team, resource_tags_user_service, resource_tags_user_environment, resource_tags_user_repo, resource_tags_user_managed_by FROM COST_AND_USAGE_REPORT"

      table_configurations = {
        COST_AND_USAGE_REPORT = {
          TIME_GRANULARITY                      = "DAILY"
          INCLUDE_RESOURCES                     = "TRUE"
          INCLUDE_MANUAL_DISCOUNT_COMPATIBILITY = "FALSE"
          INCLUDE_SPLIT_COST_ALLOCATION_DATA    = "FALSE"
        }
      }
    }

    destination_configurations {
      s3_destination {
        s3_bucket = aws_s3_bucket.cur_data.id
        s3_prefix = local.cur_s3_prefix
        s3_region = var.region

        s3_output_configurations {
          overwrite   = "OVERWRITE_REPORT"
          format      = "PARQUET"
          compression = "PARQUET"
          output_type = "CUSTOM"
        }
      }
    }

    refresh_cadence {
      frequency = "SYNCHRONOUS"
    }
  }
}

################################################################################
# Glue — catalog database + crawler
################################################################################

resource "aws_glue_catalog_database" "cur" {
  name = local.glue_database_name
}

resource "aws_iam_role" "cur_crawler" {
  name = local.crawler_role_name

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Principal = { Service = "glue.amazonaws.com" }
      Action    = "sts:AssumeRole"
    }]
  })
}

resource "aws_iam_role_policy_attachment" "cur_crawler_glue" {
  role       = aws_iam_role.cur_crawler.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
}

resource "aws_iam_role_policy" "cur_crawler_s3" {
  name = "${local.crawler_role_name}-s3"
  role = aws_iam_role.cur_crawler.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Sid    = "ReadCURData"
      Effect = "Allow"
      Action = [
        "s3:GetObject",
        "s3:ListBucket",
      ]
      Resource = [
        aws_s3_bucket.cur_data.arn,
        "${aws_s3_bucket.cur_data.arn}/*",
      ]
    }]
  })
}

resource "aws_glue_crawler" "cur" {
  name          = "${var.project}-cur-crawler"
  database_name = aws_glue_catalog_database.cur.name
  role          = aws_iam_role.cur_crawler.arn
  schedule      = "cron(0 6 * * ? *)" # Daily at 06:00 UTC, before 08:00 reports

  # NOTE(review): the target is the export prefix root. Confirm the crawler
  # ends up with a single data table, and that its generated name and
  # partition columns match what the Lambdas query.
  s3_target {
    path = "s3://${aws_s3_bucket.cur_data.id}/${local.cur_s3_prefix}/"
  }

  schema_change_policy {
    update_behavior = "UPDATE_IN_DATABASE"
    delete_behavior = "DELETE_FROM_DATABASE"
  }

  configuration = jsonencode({
    Version = 1.0
    Grouping = {
      TableGroupingPolicy = "CombineCompatibleSchemas"
    }
  })
}

################################################################################
# Athena workgroup
################################################################################

resource "aws_athena_workgroup" "cur" {
  name = local.athena_workgroup

  configuration {
    # Clients cannot override the result location or encryption set here.
    enforce_workgroup_configuration = true
    bytes_scanned_cutoff_per_query  = 104857600 # 100 MB safety limit

    result_configuration {
      output_location = "s3://${aws_s3_bucket.athena_results.id}/"

      encryption_configuration {
        encryption_option = "SSE_S3"
      }
    }
  }
}

################################################################################
# Billing protection — CloudWatch billing alarm (us-east-1)
################################################################################

# SNS topic in us-east-1 — CloudWatch billing alarms can only target same-region SNS.
# A Lambda forwarder (us-east-1 → eu-central-1) can be added later for Slack integration.
# For now, the budget notifications below go directly to the main alerts topic.
# --- terraform/platform/cost-analytics/main.tf (continued) --------------------

# Dedicated topic in the us-east-1 provider for the billing alarm below.
# Nothing in this module subscribes to it.
resource "aws_sns_topic" "billing_alarm" {
  provider = aws.us_east_1
  name     = "${var.project}-billing-alarm"
}

resource "aws_cloudwatch_metric_alarm" "billing" {
  provider = aws.us_east_1

  alarm_name = "${var.project}-billing-alarm"
  # format() is used on purpose: inside a quoted string "$${" is the escape
  # for a literal "${", so writing a dollar sign directly in front of an
  # interpolation leaves the placeholder text in the description instead of
  # the threshold value.
  alarm_description   = format("Account estimated charges exceeded $%s", var.billing_alarm_threshold_usd)
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  metric_name         = "EstimatedCharges"
  namespace           = "AWS/Billing"
  period              = 21600 # 6 hours
  statistic           = "Maximum"
  threshold           = var.billing_alarm_threshold_usd
  treat_missing_data  = "missing"

  dimensions = {
    Currency = "USD"
  }

  alarm_actions = [aws_sns_topic.billing_alarm.arn]
  ok_actions    = [aws_sns_topic.billing_alarm.arn]
}

################################################################################
# Billing protection — account-level AWS Budget (notifications only)
################################################################################

# NOTE(review): Budgets can only publish to var.alerts_topic_arn if that
# topic's policy allows the budgets.amazonaws.com service principal — the topic
# is defined outside this module, so confirm there.
resource "aws_budgets_budget" "account" {
  name         = "${var.project}-account-monthly"
  budget_type  = "COST"
  limit_amount = tostring(var.account_budget_usd)
  limit_unit   = "USD"
  time_unit    = "MONTHLY"

  # 80% warning
  notification {
    comparison_operator       = "GREATER_THAN"
    notification_type         = "ACTUAL"
    threshold                 = 80
    threshold_type            = "PERCENTAGE"
    subscriber_sns_topic_arns = [var.alerts_topic_arn]
  }

  # 100% critical
  notification {
    comparison_operator       = "GREATER_THAN"
    notification_type         = "ACTUAL"
    threshold                 = 100
    threshold_type            = "PERCENTAGE"
    subscriber_sns_topic_arns = [var.alerts_topic_arn]
  }
}

# --- terraform/platform/cost-analytics/outputs.tf -----------------------------

output "glue_database_name" {
  description = "Glue catalog database name for CUR data"
  value       = aws_glue_catalog_database.cur.name
}

# NOTE(review): this is a fixed string, not read from any resource. The table
# is created by the crawler at runtime — confirm the generated table name
# actually equals this value.
output "glue_table_name" {
  description = "Glue catalog table name (discovered by crawler)"
  value       = "${var.project}_cur"
}

output "athena_workgroup_name" {
  description = "Athena workgroup name for cost queries"
  value       = aws_athena_workgroup.cur.name
}

output "athena_results_bucket_arn" {
  description = "ARN of the S3 bucket for Athena query results"
  value       = aws_s3_bucket.athena_results.arn
}

output "cur_data_bucket_arn" {
  description = "ARN of the S3 bucket containing CUR Parquet data"
  value       = aws_s3_bucket.cur_data.arn
}

# --- terraform/platform/cost-analytics/variables.tf ---------------------------

variable "project" {
  description = "Project name used for resource naming"
  type        = string
}

variable "region" {
  description = "AWS region for Glue/Athena resources"
  type        = string
}

variable "aws_account_id" {
  description = "AWS account ID"
  type        = string
}

variable "alerts_topic_arn" {
  description = "ARN of the javabin-alerts SNS topic for budget notifications"
  type        = string
}

variable "billing_alarm_threshold_usd" {
  description = "CloudWatch billing alarm threshold in USD"
  type        = number
  default     = 200
}

variable "account_budget_usd" {
  description = "Account-level monthly budget in USD — notifications are sent at 80% and 100% of this amount; nothing is blocked"
  type        = number
  default     = 500
}
data "archive_file" "daily_cost_check" { content = file("${local.lambda_src_path}/shared/constants.py") filename = "shared/constants.py" } + source { + content = file("${local.lambda_src_path}/shared/athena.py") + filename = "shared/athena.py" + } } data "archive_file" "compliance_reporter" { @@ -313,6 +321,49 @@ resource "aws_iam_role_policy" "cost_report" { } } }, + { + Sid = "AthenaQuery" + Effect = "Allow" + Action = [ + "athena:StartQueryExecution", + "athena:GetQueryExecution", + "athena:GetQueryResults", + "athena:StopQueryExecution", + ] + Resource = "arn:aws:athena:${var.region}:${var.aws_account_id}:workgroup/${var.athena_workgroup}" + }, + { + Sid = "GlueReadCatalog" + Effect = "Allow" + Action = [ + "glue:GetDatabase", + "glue:GetTable", + "glue:GetPartitions", + ] + Resource = [ + "arn:aws:glue:${var.region}:${var.aws_account_id}:catalog", + "arn:aws:glue:${var.region}:${var.aws_account_id}:database/${var.cur_glue_database}", + "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.cur_glue_database}/${var.cur_glue_table}", + ] + }, + { + Sid = "S3ReadCURData" + Effect = "Allow" + Action = ["s3:GetObject", "s3:ListBucket"] + Resource = [ + var.cur_data_bucket_arn, + "${var.cur_data_bucket_arn}/*", + ] + }, + { + Sid = "S3WriteAthenaResults" + Effect = "Allow" + Action = ["s3:PutObject", "s3:GetObject", "s3:GetBucketLocation", "s3:AbortMultipartUpload", "s3:ListBucket"] + Resource = [ + var.athena_results_bucket_arn, + "${var.athena_results_bucket_arn}/*", + ] + }, ] }) } @@ -355,6 +406,49 @@ resource "aws_iam_role_policy" "daily_cost_check" { Action = "ce:GetCostAndUsage" Resource = "*" }, + { + Sid = "AthenaQuery" + Effect = "Allow" + Action = [ + "athena:StartQueryExecution", + "athena:GetQueryExecution", + "athena:GetQueryResults", + "athena:StopQueryExecution", + ] + Resource = "arn:aws:athena:${var.region}:${var.aws_account_id}:workgroup/${var.athena_workgroup}" + }, + { + Sid = "GlueReadCatalog" + Effect = "Allow" + Action = [ + 
"glue:GetDatabase", + "glue:GetTable", + "glue:GetPartitions", + ] + Resource = [ + "arn:aws:glue:${var.region}:${var.aws_account_id}:catalog", + "arn:aws:glue:${var.region}:${var.aws_account_id}:database/${var.cur_glue_database}", + "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.cur_glue_database}/${var.cur_glue_table}", + ] + }, + { + Sid = "S3ReadCURData" + Effect = "Allow" + Action = ["s3:GetObject", "s3:ListBucket"] + Resource = [ + var.cur_data_bucket_arn, + "${var.cur_data_bucket_arn}/*", + ] + }, + { + Sid = "S3WriteAthenaResults" + Effect = "Allow" + Action = ["s3:PutObject", "s3:GetObject", "s3:GetBucketLocation", "s3:AbortMultipartUpload", "s3:ListBucket"] + Resource = [ + var.athena_results_bucket_arn, + "${var.athena_results_bucket_arn}/*", + ] + }, ] }) } @@ -695,7 +789,7 @@ resource "aws_lambda_function" "cost_report" { role = aws_iam_role.cost_report.arn handler = "handler.handler" runtime = "python3.12" - timeout = 60 + timeout = 120 memory_size = 256 filename = data.archive_file.cost_report.output_path source_code_hash = data.archive_file.cost_report.output_base64sha256 @@ -704,6 +798,9 @@ resource "aws_lambda_function" "cost_report" { variables = { COST_WEBHOOK_PARAM = "/javabin/slack/platform-cost-alerts-webhook" DEPLOY_REGION = var.region + CUR_DATABASE = var.cur_glue_database + CUR_TABLE = var.cur_glue_table + ATHENA_WORKGROUP = var.athena_workgroup } } } @@ -713,7 +810,7 @@ resource "aws_lambda_function" "daily_cost_check" { role = aws_iam_role.daily_cost_check.arn handler = "handler.handler" runtime = "python3.12" - timeout = 60 + timeout = 90 memory_size = 128 filename = data.archive_file.daily_cost_check.output_path source_code_hash = data.archive_file.daily_cost_check.output_base64sha256 @@ -721,6 +818,9 @@ resource "aws_lambda_function" "daily_cost_check" { environment { variables = { COST_WEBHOOK_PARAM = "/javabin/slack/platform-cost-alerts-webhook" + CUR_DATABASE = var.cur_glue_database + CUR_TABLE = var.cur_glue_table + 
ATHENA_WORKGROUP = var.athena_workgroup } } } diff --git a/terraform/platform/lambdas/variables.tf b/terraform/platform/lambdas/variables.tf index 9e3d348..ffc97cd 100644 --- a/terraform/platform/lambdas/variables.tf +++ b/terraform/platform/lambdas/variables.tf @@ -95,3 +95,30 @@ variable "developer_boundary_arn" { type = string } +# --- Cost analytics (CUR / Athena) --- + +variable "cur_glue_database" { + description = "Glue catalog database name for CUR data" + type = string +} + +variable "cur_glue_table" { + description = "Glue catalog table name for CUR data" + type = string +} + +variable "athena_workgroup" { + description = "Athena workgroup name for cost queries" + type = string +} + +variable "athena_results_bucket_arn" { + description = "ARN of the S3 bucket for Athena query results" + type = string +} + +variable "cur_data_bucket_arn" { + description = "ARN of the S3 bucket containing CUR Parquet data" + type = string +} + diff --git a/terraform/platform/main.tf b/terraform/platform/main.tf index 6f7a931..2eca0f1 100644 --- a/terraform/platform/main.tf +++ b/terraform/platform/main.tf @@ -9,9 +9,10 @@ # ingress ALB, ACM wildcard cert, Route53 DNS # iam GitHub OIDC, CI roles, permission boundary, ECS execution role # compute ECS cluster, public ECR images -# monitoring SNS topics, EventBridge rules, Config, GuardDuty, Security Hub -# lambdas slack-alert, cost-report, daily-cost-check, auto-tagger -# identity Cognito pools (Identity Center is in terraform/org/) +# monitoring SNS topics, EventBridge rules, Config, GuardDuty, Security Hub +# cost-analytics CUR, Athena, Glue, billing alarms, account budget +# lambdas slack-alert, cost-report, daily-cost-check, auto-tagger +# identity Cognito pools (Identity Center is in terraform/org/) ################################################################################ module "networking" { @@ -53,6 +54,21 @@ module "monitoring" { aws_account_id = var.aws_account_id } +module "cost_analytics" { + source = 
"./cost-analytics" + project = var.project + region = var.region + aws_account_id = var.aws_account_id + alerts_topic_arn = module.monitoring.alerts_topic_arn + billing_alarm_threshold_usd = 200 + account_budget_usd = 500 + + providers = { + aws = aws + aws.us_east_1 = aws.us_east_1 + } +} + module "lambdas" { source = "./lambdas" project = var.project @@ -74,6 +90,11 @@ module "lambdas" { route53_zone_id = module.ingress.route53_zone_id org_boundary_arn = module.iam.org_boundary_arn developer_boundary_arn = module.iam.developer_boundary_arn + cur_glue_database = module.cost_analytics.glue_database_name + cur_glue_table = module.cost_analytics.glue_table_name + athena_workgroup = module.cost_analytics.athena_workgroup_name + athena_results_bucket_arn = module.cost_analytics.athena_results_bucket_arn + cur_data_bucket_arn = module.cost_analytics.cur_data_bucket_arn } module "identity" {