From b7aa1b80209d025915e1d0d2deb06d26c910d254 Mon Sep 17 00:00:00 2001 From: Josh Olson Date: Thu, 7 May 2026 17:57:38 -0500 Subject: [PATCH 1/3] MSK: Do not include additional brokers as part of Replication Factor --- terraform/aws/modules/2-nbs7/msk/README.md | 8 ++--- terraform/aws/modules/2-nbs7/msk/main.tf | 31 ++++++++++--------- terraform/aws/modules/2-nbs7/msk/variables.tf | 21 ++++++++----- 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/terraform/aws/modules/2-nbs7/msk/README.md b/terraform/aws/modules/2-nbs7/msk/README.md index 5a9abaeb..e61f65fa 100644 --- a/terraform/aws/modules/2-nbs7/msk/README.md +++ b/terraform/aws/modules/2-nbs7/msk/README.md @@ -10,12 +10,12 @@ Below are the input parameter variables for the MSK: | Key | Type | Default | Description | | -------------- | -------------- | -------------- | -------------- | -| create_msk | bool | true | Create msk cluser and required resources? | -| environment | string | `development` | The environment, either 'development' or 'production'. This module creates kafka.t3.small brokers for 'development', otherwise kafka.m5.large brokers are created. | -| additional_brokers_to_create | number | `0` | How many additional brokers to create - beyond two for 'development' or otherwise three. | +| create_msk | bool | true | Create MSK cluser and required resources? | +| environment | string | `development` | The environment - either 'development' or 'production', which means by default two brokers of size kafka.t3.small or three kafka.m5.large brokers, respectively. | +| additional_brokers_to_create | number | `0` | How many additional brokers to create - beyond the default of two for 'development' or otherwise three. AWS MSK requires that the number of brokers must be a multiple of the number of Availability Zones. 
| | msk_ebs_volume_size | number | | EBS volume size for the MSK broker nodes in GB | | msk_security_groups | list(string) | | A list of security groups to use for the MSK cluster | -| msk_subnet_ids | list(string) | | The list of subnets to use for the MSK cluster. There must be 2+ subnets for a 'development' environment, otherwise 3+ subnets. | +| msk_subnet_ids | list(string) | | The list of subnets to use, which determines how many AZs (Availability Zones) the cluster uses. There must be 2+ subnets for a 'development' environment, otherwise 3+ subnets. | | resource_prefix | string | `cdc-nbs` | Prefix for resource names | | vpc_id | string | | VPC Id to be used with cluster | | cidr_blocks | list(any) | | | diff --git a/terraform/aws/modules/2-nbs7/msk/main.tf b/terraform/aws/modules/2-nbs7/msk/main.tf index f35eb036..c610f6ea 100644 --- a/terraform/aws/modules/2-nbs7/msk/main.tf +++ b/terraform/aws/modules/2-nbs7/msk/main.tf @@ -1,18 +1,20 @@ locals { module_name = "msk" - module_serial_number = "2026-05-06_01" # Update with each commit? Date plus two digit increment. + module_serial_number = "2026-05-07_01" # Update with each commit? Date plus two digit increment. - # The "Best practices for Standard brokers" page (https://docs.aws.amazon.com/msk/latest/developerguide/bestpractices.html) specifies how many partitions at most there should be per broker, for each broker size (https://docs.aws.amazon.com/msk/latest/developerguide/broker-instance-sizes.html). 
- # That page states the "recommended number of partitions are not enforced", but when running a `terraform apply` command to step-up your MSK cluster to a new configuration - # revision, if your cluster is not in compliance with the recommendation then the command will fail with: + # The "Best practices for Standard brokers" page (https://docs.aws.amazon.com/msk/latest/developerguide/bestpractices.html) specifies the max number of partitions per broker, for each broker size (https://docs.aws.amazon.com/msk/latest/developerguide/broker-instance-sizes.html). + # That page states if the number of partitions on any given broker "exceeds the maximum allowed value" then certain operations on the cluster will not be allowed - e.g. when + # running a `terraform apply` command to step-up your MSK cluster to a new configuration revision, the command will fail with: # "api error HighPartitionCountException: The number of partitions per broker is above the recommended limit. Add more brokers and rearrange the partitions per broker to be below the recommended limit, then retry the request" - # Other options to resolve that error (by getting the number of partitions in your cluster below the limit) is to choose a larger broker size, or to delete unneeded topics. - # To get the number of partitions in your cluster: in the AWS Management Console go to CloudWatch, All metrics, search for "AWS/Kafka", click "Kafka > Broker ID, Cluster Name", and filter on your cluster name and on the PartitionCount metric. - # * That metric is what AWS uses to determine whether the limit is exceeded, and when you make a change to your cluster such as deleting topics it can take up to 10 metrics for that metric to be updated accordingly. + # Other options to resolve that error (i.e. getting the number of partitions on each broker below the limit) are to choose a larger broker size, or to delete unneeded topics (if there are enough such topics). 
+ # To get the number of partitions on each broker in your cluster: in the AWS Management Console go to CloudWatch, All metrics, search for "AWS/Kafka", click "Kafka > Broker ID, Cluster Name", and filter on your cluster name and on the PartitionCount metric. + # * That metric is what AWS uses to determine whether the limit is exceeded on any broker, and when you make a change to your cluster such as deleting topics it can take up to 10 minutes for that metric to be updated accordingly. # For production: typically at minimum use kafka.m5.large, and for high-throughput environments consider using kafka.m5.2xlarge or higher. broker_instance_type = var.environment == "development" ? "kafka.t3.small" : "kafka.m5.large" - number_of_brokers = var.environment == "development" ? (2 + var.additional_brokers_to_create) : (3 + var.additional_brokers_to_create) + # Do not include additional brokers as part of Replication Factor (explained further below). + base_number_of_brokers = var.environment == "development" ? 2 : 3 + total_number_of_brokers = local.base_number_of_brokers + var.additional_brokers_to_create } # Create an IAM role for MSK @@ -136,7 +138,7 @@ resource "aws_msk_cluster" "this" { count = var.create_msk ? 
1 : 0 cluster_name = "${var.resource_prefix}-${var.environment}-msk-cluster" kafka_version = var.kafka_version - number_of_broker_nodes = local.number_of_brokers + number_of_broker_nodes = local.total_number_of_brokers #iam_instance_profile = aws_iam_role.msk.arn configuration_info { @@ -149,7 +151,6 @@ resource "aws_msk_cluster" "this" { client_subnets = var.msk_subnet_ids - #security_groups = var.msk_security_groups security_groups = [aws_security_group.msk_cluster_sg[0].id] storage_info { ebs_storage_info { @@ -209,13 +210,13 @@ locals { # * https://repost.aws/knowledge-center/msk-avoid-disruption-during-patching # * https://docs.aws.amazon.com/securityhub/latest/userguide/msk-controls.html - min_ISR = local.number_of_brokers - 1 + min_ISR = local.base_number_of_brokers - 1 - # unclean.leader.election.enable should almost always be set to false, to prevent out-of-sync replicas from becoming leaders (which could cause silent data loss). + # The unclean.leader.election.enable configuration setting should almost always be set to false, to prevent out-of-sync replicas from becoming leaders (which could cause silent data loss). server_properties = <= 2) || (length(var.msk_subnet_ids) >= 3) + # Heads up that if at some point after creating a cluster you add a subnet to it, that will require the cluster to be re-created - which causes the topics and other data in the cluster to be deleted. + condition = ((var.environment == "development" && length(var.msk_subnet_ids) >= 2)) || (length(var.msk_subnet_ids) >= 3) error_message = "There must be 2+ subnets for a 'development' environment, otherwise 3+ subnets." } } variable "additional_brokers_to_create" { - type = number - description = "How many additional brokers to create - beyond two for 'development' or otherwise three." 
+ type = number + # The MSK requirement mentioned below is documented at https://docs.aws.amazon.com/msk/latest/developerguide/msk-update-broker-count.html + description = "How many additional brokers to create - beyond the default of two for 'development' or otherwise three. AWS MSK requires that the number of brokers must be a multiple of the number of Availability Zones." default = 0 + validation { + condition = ((var.environment == "development") && ((2 + var.additional_brokers_to_create) % length(var.msk_subnet_ids) == 0)) || ((var.environment == "production") && ((3 + var.additional_brokers_to_create) % length(var.msk_subnet_ids) == 0)) + error_message = "Invalid combo of number of subnets and brokers specified. AWS MSK requires that the number of brokers must be a multiple of the number of Availability Zones." + } } variable "msk_ebs_volume_size" { From 1dc05f771644d3a66e29ac31ad173b62fc3f7645 Mon Sep 17 00:00:00 2001 From: Josh Olson Date: Thu, 7 May 2026 18:03:36 -0500 Subject: [PATCH 2/3] MSK: Delete the unused IAM role and policy --- terraform/aws/modules/2-nbs7/msk/main.tf | 62 ------------------------ 1 file changed, 62 deletions(-) diff --git a/terraform/aws/modules/2-nbs7/msk/main.tf b/terraform/aws/modules/2-nbs7/msk/main.tf index c610f6ea..7e2b53c1 100644 --- a/terraform/aws/modules/2-nbs7/msk/main.tf +++ b/terraform/aws/modules/2-nbs7/msk/main.tf @@ -17,67 +17,6 @@ locals { total_number_of_brokers = local.base_number_of_brokers + var.additional_brokers_to_create } -# Create an IAM role for MSK -resource "aws_iam_role" "msk" { - count = var.create_msk ? 
1 : 0 - - name = "${var.resource_prefix}-${var.environment}-msk-role" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Action = "sts:AssumeRole" - Effect = "Allow" - Principal = { - Service = "kafka.amazonaws.com" - } - } - ] - }) - tags = { - ModuleVersion = "${local.module_name}-${local.module_serial_number}" - } -} - -# Create an IAM policy for MSK -resource "aws_iam_policy" "msk" { - count = var.create_msk ? 1 : 0 - name = "${var.resource_prefix}-${var.environment}-msk-policy" - policy = jsonencode({ - Version : "2012-10-17" - Statement = [ - { - Effect = "Allow", - Action = [ - "cloudwatch:PutMetricData", - "logs:CreateLogGroup", - "logs:CreateLogStream", - "logs:DescribeLogStreams", - "logs:PutLogEvents", - "logs:GetLogEvents", - "logs:FilterLogEvents", - "ec2:CreateNetworkInterface", - "ec2:DescribeNetworkInterfaces", - "ec2:DeleteNetworkInterface", - "kms:Decrypt" - ], - Resource = "*" - } - ] - }) - tags = { - ModuleVersion = "${local.module_name}-${local.module_serial_number}" - } -} - -# Attach the IAM policy to the MSK role -resource "aws_iam_role_policy_attachment" "msk" { - count = var.create_msk ? 1 : 0 - policy_arn = aws_iam_policy.msk[0].arn - role = aws_iam_role.msk[0].name -} - resource "aws_cloudwatch_log_group" "test" { count = var.create_msk ? 
1 : 0 name = "${var.resource_prefix}-msk-broker-logs" @@ -139,7 +78,6 @@ resource "aws_msk_cluster" "this" { cluster_name = "${var.resource_prefix}-${var.environment}-msk-cluster" kafka_version = var.kafka_version number_of_broker_nodes = local.total_number_of_brokers - #iam_instance_profile = aws_iam_role.msk.arn configuration_info { arn = aws_msk_configuration.msk_configuration_environment[0].arn From 303754dad6298a9c7fce3ac7229d4b6034d8234c Mon Sep 17 00:00:00 2001 From: Josh Olson Date: Thu, 7 May 2026 18:27:36 -0500 Subject: [PATCH 3/3] Commentary tweak --- terraform/aws/modules/2-nbs7/msk/README.md | 2 +- terraform/aws/modules/2-nbs7/msk/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/aws/modules/2-nbs7/msk/README.md b/terraform/aws/modules/2-nbs7/msk/README.md index e61f65fa..aea9719f 100644 --- a/terraform/aws/modules/2-nbs7/msk/README.md +++ b/terraform/aws/modules/2-nbs7/msk/README.md @@ -11,7 +11,7 @@ Below are the input parameter variables for the MSK: | Key | Type | Default | Description | | -------------- | -------------- | -------------- | -------------- | | create_msk | bool | true | Create MSK cluser and required resources? | -| environment | string | `development` | The environment - either 'development' or 'production', which means by default two brokers of size kafka.t3.small or three kafka.m5.large brokers, respectively. | +| environment | string | `development` | The environment: either 'development' (two kafka.t3.small brokers by default, replication factor 2) or 'production' (three kafka.m5.large brokers by default, replication factor 3). | | additional_brokers_to_create | number | `0` | How many additional brokers to create - beyond the default of two for 'development' or otherwise three. AWS MSK requires that the number of brokers must be a multiple of the number of Availability Zones. 
| | msk_ebs_volume_size | number | | EBS volume size for the MSK broker nodes in GB | | msk_security_groups | list(string) | | A list of security groups to use for the MSK cluster | diff --git a/terraform/aws/modules/2-nbs7/msk/variables.tf b/terraform/aws/modules/2-nbs7/msk/variables.tf index 53ea4944..f4352ede 100644 --- a/terraform/aws/modules/2-nbs7/msk/variables.tf +++ b/terraform/aws/modules/2-nbs7/msk/variables.tf @@ -12,7 +12,7 @@ variable "create_msk" { variable "environment" { type = string - description = "The environment - either 'development' or 'production', which means by default two brokers of size kafka.t3.small or three kafka.m5.large brokers, respectively." + description = "The environment: either 'development' (two kafka.t3.small brokers by default, replication factor 2) or 'production' (three kafka.m5.large brokers by default, replication factor 3)." default = "development" validation { # Note that `terraform validate` can only perform some checks, but all validation rules will be evaluated by `terraform plan`. condition = contains(["development", "production"], var.environment)