diff --git a/account-setup/azure/main.tf b/account-setup/azure/main.tf index b91b4d3d..66dd9224 100644 --- a/account-setup/azure/main.tf +++ b/account-setup/azure/main.tf @@ -2,10 +2,494 @@ data "azurerm_resource_group" "rg" { name = var.resource_group_name } +locals { + # Constants for NSG rule priorities + nsg_priority_base_vnet_inbound = 1000 + nsg_priority_base_loadbalancer = 1050 + nsg_priority_base_http = 1100 + nsg_priority_base_https = 1200 + nsg_priority_base_outbound = 2000 + nsg_priority_base_database_allow = 1000 + nsg_priority_increment_per_subnet = 10 + nsg_priority_deny_all = 4000 + + # Database port constants + postgresql_port = 5432 + mysql_port = 3306 + + # For ostronaut compatibility + vnet_config_merged = length(var.vnet_config) > 0 ? var.vnet_config : ( + var.vpc_configs != "" ? { + (var.vpc_configs) = { + address_space = ["10.0.0.0/16"] + private_subnets_cidr = ["10.0.1.0/24", "10.0.2.0/24"] + database_subnets_cidr = [] + redis_subnets_cidr = [] + } + } : {} + ) + + private_subnet_map = merge([ + for vnet_name, vnet_config in local.vnet_config_merged : tomap({ + for idx, cidr in vnet_config.private_subnets_cidr : + "${vnet_name}-${idx}" => { + vnet_name = vnet_name + cidr = cidr + } + }) + ]...) + + # Database subnets - Azure doesn't allow MySQL and PostgreSQL delegations on the same subnet + # + # Subnet allocation pattern: + # - PostgreSQL uses even indices (0, 2, 4, 6...) + # - MySQL uses odd indices (1, 3, 5, 7...) 
+ # + # Example configuration: + # database_subnets_cidr = ["10.0.2.0/24", "10.0.3.0/24", "10.0.4.0/24", "10.0.5.0/24"] + # Results in: + # - Index 0 (10.0.2.0/24) -> PostgreSQL subnet: ${vnet_name}-postgresql-subnet + # - Index 1 (10.0.3.0/24) -> MySQL subnet: ${vnet_name}-mysql-subnet + # - Index 2 (10.0.4.0/24) -> PostgreSQL subnet: ${vnet_name}-postgresql-subnet + # - Index 3 (10.0.5.0/24) -> MySQL subnet: ${vnet_name}-mysql-subnet + postgresql_subnet_map = merge([ + for vnet_name, vnet_config in local.vnet_config_merged : tomap({ + for idx, cidr in try(vnet_config.database_subnets_cidr, []) : + "${vnet_name}-postgres-${idx}" => { + vnet_name = vnet_name + cidr = cidr + idx = idx + } if idx % 2 == 0 + }) + ]...) + + mysql_subnet_map = merge([ + for vnet_name, vnet_config in local.vnet_config_merged : tomap({ + for idx, cidr in try(vnet_config.database_subnets_cidr, []) : + "${vnet_name}-mysql-${idx}" => { + vnet_name = vnet_name + cidr = cidr + idx = idx + } if idx % 2 == 1 + }) + ]...) + + # Combined database subnet map for outputs (backwards compatibility) + database_subnet_map = merge(local.postgresql_subnet_map, local.mysql_subnet_map) + + # Redis subnets - for Private Endpoints + redis_subnet_map = merge([ + for vnet_name, vnet_config in local.vnet_config_merged : tomap({ + for idx, cidr in try(vnet_config.redis_subnets_cidr, []) : + "${vnet_name}-redis-${idx}" => { + vnet_name = vnet_name + cidr = cidr + idx = idx + } + }) + ]...) + + # Redis port constant + redis_port = 6380 + + dns_enabled = length(local.vnet_config_merged) > 0 +} + resource "azurerm_virtual_network" "vnet" { - count = var.vnet != "" ? 
1 : 0 - name = var.vnet + for_each = local.vnet_config_merged + name = each.key + location = data.azurerm_resource_group.rg.location + resource_group_name = data.azurerm_resource_group.rg.name + address_space = each.value.address_space +} + +# Subnet for AKS nodes (nodes will have public IPs for internet access) +resource "azurerm_subnet" "private" { + for_each = local.private_subnet_map + name = "${each.key}-private-subnet" + resource_group_name = data.azurerm_resource_group.rg.name + virtual_network_name = azurerm_virtual_network.vnet[each.value.vnet_name].name + address_prefixes = [each.value.cidr] +} + +# Network Security Group for private subnet - allow all internal VNet communication +resource "azurerm_network_security_group" "private" { + for_each = local.vnet_config_merged + name = "${each.key}-private-nsg" + location = data.azurerm_resource_group.rg.location + resource_group_name = data.azurerm_resource_group.rg.name + + # Allow all traffic within VNet (default Azure behavior, but explicit) + security_rule { + name = "AllowVNetInbound" + priority = local.nsg_priority_base_vnet_inbound + direction = "Inbound" + access = "Allow" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "VirtualNetwork" + destination_address_prefix = "VirtualNetwork" + } + + # Allow Azure LoadBalancer health probes (required for LoadBalancer services) + security_rule { + name = "AllowAzureLoadBalancerInbound" + priority = local.nsg_priority_base_loadbalancer + direction = "Inbound" + access = "Allow" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "AzureLoadBalancer" + destination_address_prefix = "*" + description = "Allow Azure LoadBalancer health probes and traffic" + } + + # Allow HTTP (80) from internet for ingress LoadBalancer + security_rule { + name = "AllowHTTPInbound" + priority = local.nsg_priority_base_http + direction = "Inbound" + access = "Allow" + protocol 
= "Tcp" + source_port_range = "*" + destination_port_range = "80" + source_address_prefix = "*" + destination_address_prefix = "*" + description = "Allow HTTP traffic from internet for ingress LoadBalancer" + } + + # Allow HTTPS (443) from internet for ingress LoadBalancer + security_rule { + name = "AllowHTTPSInbound" + priority = local.nsg_priority_base_https + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_range = "443" + source_address_prefix = "*" + destination_address_prefix = "*" + description = "Allow HTTPS traffic from internet for ingress LoadBalancer" + } + + # Allow all outbound (nodes have public IPs for direct internet access) + security_rule { + name = "AllowAllOutbound" + priority = local.nsg_priority_base_outbound + direction = "Outbound" + access = "Allow" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "*" + destination_address_prefix = "*" + } + + tags = { + Name = "${each.key}-private-nsg" + } +} + +# Associate NSG with private subnet +resource "azurerm_subnet_network_security_group_association" "private" { + for_each = local.private_subnet_map + subnet_id = azurerm_subnet.private[each.key].id + network_security_group_id = azurerm_network_security_group.private[each.value.vnet_name].id +} + +# Subnet for PostgreSQL databases - Azure requires separate subnet per service delegation +resource "azurerm_subnet" "postgresql" { + for_each = local.postgresql_subnet_map + name = "${each.value.vnet_name}-postgresql-subnet" + resource_group_name = data.azurerm_resource_group.rg.name + virtual_network_name = azurerm_virtual_network.vnet[each.value.vnet_name].name + address_prefixes = [each.value.cidr] + + delegation { + name = "database-delegation" + service_delegation { + name = "Microsoft.DBforPostgreSQL/flexibleServers" + actions = ["Microsoft.Network/virtualNetworks/subnets/join/action"] + } + } +} + +# Subnet for MySQL databases - Azure 
requires separate subnet per service delegation +resource "azurerm_subnet" "mysql" { + for_each = local.mysql_subnet_map + name = "${each.value.vnet_name}-mysql-subnet" + resource_group_name = data.azurerm_resource_group.rg.name + virtual_network_name = azurerm_virtual_network.vnet[each.value.vnet_name].name + address_prefixes = [each.value.cidr] + + delegation { + name = "database-delegation" + service_delegation { + name = "Microsoft.DBforMySQL/flexibleServers" + actions = ["Microsoft.Network/virtualNetworks/subnets/join/action"] + } + } +} + +# Network Security Group for PostgreSQL subnet - ONLY allow traffic from private subnet (cluster) +resource "azurerm_network_security_group" "postgresql" { + for_each = local.postgresql_subnet_map + name = "${each.value.vnet_name}-postgresql-nsg" + location = data.azurerm_resource_group.rg.location + resource_group_name = data.azurerm_resource_group.rg.name + + # Allow PostgreSQL from private subnets + dynamic "security_rule" { + for_each = local.vnet_config_merged[each.value.vnet_name].private_subnets_cidr + content { + name = "AllowPostgreSQLFromPrivateSubnet-${security_rule.key}" + priority = local.nsg_priority_base_database_allow + (security_rule.key * local.nsg_priority_increment_per_subnet) + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_range = tostring(local.postgresql_port) + source_address_prefix = security_rule.value + destination_address_prefix = "*" + description = "Allow PostgreSQL traffic from AKS cluster (private subnet)" + } + } + + # Deny all other inbound traffic (Azure default, but explicit) + security_rule { + name = "DenyAllInbound" + priority = local.nsg_priority_deny_all + direction = "Inbound" + access = "Deny" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "*" + destination_address_prefix = "*" + description = "Deny all other inbound traffic" + } + + # Allow all outbound (for database 
connections back to cluster if needed) + security_rule { + name = "AllowAllOutbound" + priority = local.nsg_priority_base_outbound + direction = "Outbound" + access = "Allow" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "*" + destination_address_prefix = "*" + } + + tags = { + Name = "${each.value.vnet_name}-postgresql-nsg" + } +} + +# Network Security Group for MySQL subnet - ONLY allow traffic from private subnet (cluster) +resource "azurerm_network_security_group" "mysql" { + for_each = local.mysql_subnet_map + name = "${each.value.vnet_name}-mysql-nsg" + location = data.azurerm_resource_group.rg.location + resource_group_name = data.azurerm_resource_group.rg.name + + # Allow MySQL from private subnets + dynamic "security_rule" { + for_each = local.vnet_config_merged[each.value.vnet_name].private_subnets_cidr + content { + name = "AllowMySQLFromPrivateSubnet-${security_rule.key}" + priority = local.nsg_priority_base_database_allow + (security_rule.key * local.nsg_priority_increment_per_subnet) + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_range = tostring(local.mysql_port) + source_address_prefix = security_rule.value + destination_address_prefix = "*" + description = "Allow MySQL traffic from AKS cluster (private subnet)" + } + } + + # Deny all other inbound traffic (Azure default, but explicit) + security_rule { + name = "DenyAllInbound" + priority = local.nsg_priority_deny_all + direction = "Inbound" + access = "Deny" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "*" + destination_address_prefix = "*" + description = "Deny all other inbound traffic" + } + + # Allow all outbound (for database connections back to cluster if needed) + security_rule { + name = "AllowAllOutbound" + priority = local.nsg_priority_base_outbound + direction = "Outbound" + access = "Allow" + protocol = "*" + 
source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "*" + destination_address_prefix = "*" + } + + tags = { + Name = "${each.value.vnet_name}-mysql-nsg" + } +} + +# Associate NSG with PostgreSQL subnet +resource "azurerm_subnet_network_security_group_association" "postgresql" { + for_each = local.postgresql_subnet_map + subnet_id = azurerm_subnet.postgresql[each.key].id + network_security_group_id = azurerm_network_security_group.postgresql[each.key].id +} + +# Associate NSG with MySQL subnet +resource "azurerm_subnet_network_security_group_association" "mysql" { + for_each = local.mysql_subnet_map + subnet_id = azurerm_subnet.mysql[each.key].id + network_security_group_id = azurerm_network_security_group.mysql[each.key].id +} + +# Subnet for Redis Private Endpoints +resource "azurerm_subnet" "redis" { + for_each = local.redis_subnet_map + name = "${each.value.vnet_name}-redis-subnet" + resource_group_name = data.azurerm_resource_group.rg.name + virtual_network_name = azurerm_virtual_network.vnet[each.value.vnet_name].name + address_prefixes = [each.value.cidr] +} + +# Network Security Group for Redis subnet - ONLY allow traffic from private subnet (cluster) +resource "azurerm_network_security_group" "redis" { + for_each = local.redis_subnet_map + name = "${each.value.vnet_name}-redis-nsg" location = data.azurerm_resource_group.rg.location resource_group_name = data.azurerm_resource_group.rg.name - address_space = var.address_space -} \ No newline at end of file + + # Allow Redis from private subnets + dynamic "security_rule" { + for_each = local.vnet_config_merged[each.value.vnet_name].private_subnets_cidr + content { + name = "AllowRedisFromPrivateSubnet-${security_rule.key}" + priority = local.nsg_priority_base_database_allow + (security_rule.key * local.nsg_priority_increment_per_subnet) + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_range = tostring(local.redis_port) + 
source_address_prefix = security_rule.value + destination_address_prefix = "*" + description = "Allow Redis traffic from AKS cluster (private subnet)" + } + } + + # Deny all other inbound traffic (Azure default, but explicit) + security_rule { + name = "DenyAllInbound" + priority = local.nsg_priority_deny_all + direction = "Inbound" + access = "Deny" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "*" + destination_address_prefix = "*" + description = "Deny all other inbound traffic" + } + + # Allow all outbound + security_rule { + name = "AllowAllOutbound" + priority = local.nsg_priority_base_outbound + direction = "Outbound" + access = "Allow" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "*" + destination_address_prefix = "*" + } + + tags = { + Name = "${each.value.vnet_name}-redis-nsg" + } +} + +# Associate NSG with Redis subnet +resource "azurerm_subnet_network_security_group_association" "redis" { + for_each = local.redis_subnet_map + subnet_id = azurerm_subnet.redis[each.key].id + network_security_group_id = azurerm_network_security_group.redis[each.key].id +} + +# DNS zones are created ONCE per resource group. +# VNet links are created for EACH VNet to connect them to the shared zones + +# Private DNS Zone for PostgreSQL (created once if any VNet exists) +resource "azurerm_private_dns_zone" "postgresql" { + count = local.dns_enabled ? 1 : 0 + name = "privatelink.postgres.database.azure.com" + resource_group_name = data.azurerm_resource_group.rg.name + + tags = { + Name = "postgresql-private-dns-zone" + } +} + +# VNet links for PostgreSQL Private DNS Zone (one per VNet) +resource "azurerm_private_dns_zone_virtual_network_link" "postgresql" { + for_each = local.dns_enabled ? 
local.vnet_config_merged : {} + name = "${each.key}-postgresql-dns-link" + private_dns_zone_name = azurerm_private_dns_zone.postgresql[0].name + virtual_network_id = azurerm_virtual_network.vnet[each.key].id + resource_group_name = data.azurerm_resource_group.rg.name +} + +# Private DNS Zone for MySQL (created once if any VNet exists) +resource "azurerm_private_dns_zone" "mysql" { + count = local.dns_enabled ? 1 : 0 + name = "privatelink.mysql.database.azure.com" + resource_group_name = data.azurerm_resource_group.rg.name + + tags = { + Name = "mysql-private-dns-zone" + } +} + +# VNet links for MySQL Private DNS Zone (one per VNet) +resource "azurerm_private_dns_zone_virtual_network_link" "mysql" { + for_each = local.dns_enabled ? local.vnet_config_merged : {} + name = "${each.key}-mysql-dns-link" + private_dns_zone_name = azurerm_private_dns_zone.mysql[0].name + virtual_network_id = azurerm_virtual_network.vnet[each.key].id + resource_group_name = data.azurerm_resource_group.rg.name +} + +# Private DNS Zone for Redis (created once if any VNet exists) +resource "azurerm_private_dns_zone" "redis" { + count = local.dns_enabled ? 1 : 0 + name = "privatelink.redis.cache.windows.net" + resource_group_name = data.azurerm_resource_group.rg.name + + tags = { + Name = "redis-private-dns-zone" + } +} + +# VNet links for Redis Private DNS Zone (one per VNet) +resource "azurerm_private_dns_zone_virtual_network_link" "redis" { + for_each = local.dns_enabled ? 
local.vnet_config_merged : {} + name = "${each.key}-redis-dns-link" + private_dns_zone_name = azurerm_private_dns_zone.redis[0].name + virtual_network_id = azurerm_virtual_network.vnet[each.key].id + resource_group_name = data.azurerm_resource_group.rg.name +} diff --git a/account-setup/azure/outputs.tf b/account-setup/azure/outputs.tf index ee519e23..2111b342 100644 --- a/account-setup/azure/outputs.tf +++ b/account-setup/azure/outputs.tf @@ -1,3 +1,58 @@ output "vnet" { - value = try(azurerm_virtual_network.vnet[0].address_space,0) -} \ No newline at end of file + description = "Map of VNet names to their IDs" + value = { + for k, v in azurerm_virtual_network.vnet : k => v.id + } +} + +output "private_subnets" { + description = "List of private subnet names" + value = [for subnet in azurerm_subnet.private : subnet.name] +} + +output "database_subnets" { + description = "List of database subnet names (includes both PostgreSQL and MySQL subnets)" + value = concat( + [for subnet in azurerm_subnet.postgresql : subnet.name], + [for subnet in azurerm_subnet.mysql : subnet.name] + ) +} + +output "postgresql_subnets" { + description = "List of PostgreSQL subnet names" + value = [for subnet in azurerm_subnet.postgresql : subnet.name] +} + +output "mysql_subnets" { + description = "List of MySQL subnet names" + value = [for subnet in azurerm_subnet.mysql : subnet.name] +} + +output "private_subnet_ids" { + description = "Map of private subnet names to their IDs" + value = { + for k, v in azurerm_subnet.private : v.name => v.id + } +} + +output "database_subnet_ids" { + description = "Map of database subnet names to their IDs (includes both PostgreSQL and MySQL)" + value = merge( + { for k, v in azurerm_subnet.postgresql : v.name => v.id }, + { for k, v in azurerm_subnet.mysql : v.name => v.id } + ) +} + +output "postgresql_subnet_ids" { + description = "Map of PostgreSQL subnet names to their IDs" + value = { + for k, v in azurerm_subnet.postgresql : v.name => v.id + } +} 
+ +output "mysql_subnet_ids" { + description = "Map of MySQL subnet names to their IDs" + value = { + for k, v in azurerm_subnet.mysql : v.name => v.id + } +} diff --git a/account-setup/azure/vars.tf b/account-setup/azure/vars.tf index d1185e6a..0355d766 100644 --- a/account-setup/azure/vars.tf +++ b/account-setup/azure/vars.tf @@ -4,14 +4,20 @@ variable "resource_group_name" { default = "" } -variable "vnet" { - description = "Name of the virtual network where the AKS will deploy" +# For ostronaut compatibility +variable "vpc_configs" { + description = "Legacy VPC name as string (for backward compatibility). Use vnet_config instead." type = string default = "" } -variable "address_space" { - description = "The address space that is used the virtual network" - type = list(string) - default = ["10.0.0.0/16"] +variable "vnet_config" { + description = "VNet configuration - map of VNet names to their configuration. Note: database_subnets_cidr should have even number of entries (even indices for PostgreSQL, odd for MySQL)" + type = map(object({ + address_space = list(string) + private_subnets_cidr = list(string) + database_subnets_cidr = optional(list(string), []) + redis_subnets_cidr = optional(list(string), []) + })) + default = {} } \ No newline at end of file diff --git a/artifact/aws/main.tf b/artifact/aws/main.tf index 953efce9..f62b04cb 100644 --- a/artifact/aws/main.tf +++ b/artifact/aws/main.tf @@ -1,7 +1,9 @@ resource "aws_ecr_repository" "ecr_repo" { for_each = toset(var.services) - name = each.value - image_tag_mutability = "MUTABLE" + + name = each.value + + image_tag_mutability = var.immutable_image_tags ? 
"IMMUTABLE" : "MUTABLE" image_scanning_configuration { scan_on_push = true diff --git a/artifact/aws/vars.tf b/artifact/aws/vars.tf index a9dcd08b..8e5472ea 100644 --- a/artifact/aws/vars.tf +++ b/artifact/aws/vars.tf @@ -3,4 +3,10 @@ variable "services" { description = "List of services to be deployed within the namespace" type = list(string) default = [] +} + +variable "immutable_image_tags" { + description = "Specifies the ECR image tags are immutable" + type = bool + default = true } \ No newline at end of file diff --git a/artifact/gcp/main.tf b/artifact/gcp/main.tf index 3a66f75d..f12846c3 100644 --- a/artifact/gcp/main.tf +++ b/artifact/gcp/main.tf @@ -13,6 +13,10 @@ resource "google_artifact_registry_repository" "gcr_repo" { description = "${each.value} docker repository" format = "DOCKER" + docker_config { + immutable_tags = var.immutable_image_tags + } + depends_on = [google_project_service.enable_artifact_registry] } diff --git a/artifact/gcp/vars.tf b/artifact/gcp/vars.tf index e22725fb..35f15763 100644 --- a/artifact/gcp/vars.tf +++ b/artifact/gcp/vars.tf @@ -16,4 +16,10 @@ variable "registry_permissions" { users = list(string) })) default = {} +} + +variable "immutable_image_tags" { + description = "Specifies whether the GAR image tags are immutable" + type = bool + default = true } \ No newline at end of file diff --git a/k8s/aws/eks/autoscale.tf b/k8s/aws/eks/autoscale.tf index e3114b5d..31788014 100644 --- a/k8s/aws/eks/autoscale.tf +++ b/k8s/aws/eks/autoscale.tf @@ -1,10 +1,12 @@ -data "template_file" "autoscale_template" { - template = file("./templates/cluster-auto-scaler-values.yaml") - vars = { - CLUSTER_REGION = var.app_region - CLUSTER_NAME = local.cluster_name - ACCOUNT_ID = data.aws_caller_identity.current.account_id - } +locals { + autoscale_template = templatefile( + "${path.module}/templates/cluster-auto-scaler-values.yaml", + { + CLUSTER_REGION = var.app_region + CLUSTER_NAME = local.cluster_name + ACCOUNT_ID = 
data.aws_caller_identity.current.account_id + } + ) } resource "helm_release" "auto_scaler" { @@ -14,7 +16,7 @@ resource "helm_release" "auto_scaler" { namespace = "kube-system" version = "9.50.0" - values = [data.template_file.autoscale_template.rendered] + values = [local.autoscale_template] depends_on = [null_resource.wait_for_cluster] } \ No newline at end of file diff --git a/k8s/aws/eks/cert-manager.tf b/k8s/aws/eks/cert-manager.tf index b275f308..7c1ca55b 100644 --- a/k8s/aws/eks/cert-manager.tf +++ b/k8s/aws/eks/cert-manager.tf @@ -1,3 +1,37 @@ +locals { + cert_manager_template = templatefile( + "${path.module}/templates/cert-manager-values.yaml", + { + CLUSTER_NAME = local.cluster_name + role_arn = aws_iam_role.cluster_issuer_role.arn + } + ) + + cluster_wildcard_issuer = templatefile( + "${path.module}/templates/cluster-issuer.yaml", + { + dns = local.domain_name + cert_issuer_url = try( + var.cert_issuer_config.env == "stage" + ? "https://acme-staging-v02.api.letsencrypt.org/directory" + : "https://acme-v02.api.letsencrypt.org/directory", + "https://acme-staging-v02.api.letsencrypt.org/directory" + ) + location = var.app_region + zone_id = data.aws_route53_zone.zone.0.zone_id + secret_name = "${local.cluster_name}-cluster-issuer-creds" + email = var.cert_issuer_config.email + } + ) + + cluster_wildcard_certificate = templatefile( + "${path.module}/templates/cluster-certificate.yaml", + { + dns = local.domain_name + } + ) +} + resource "null_resource" "wait_for_cluster" { provisioner "local-exec" { command = "sleep 60" # Adjust the duration as needed @@ -6,14 +40,6 @@ resource "null_resource" "wait_for_cluster" { depends_on = [module.eks] } -data "template_file" "cert_manager_template" { - template = file("./templates/cert-manager-values.yaml") - vars = { - CLUSTER_NAME = local.cluster_name - role_arn = aws_iam_role.cluster_issuer_role.arn - } -} - resource "helm_release" "cert-manager" { name = "cert-manager" repository = "https://charts.jetstack.io" @@ 
-27,7 +53,7 @@ resource "helm_release" "cert-manager" { value = "true" } - values = [data.template_file.cert_manager_template.rendered] + values = [local.cert_manager_template] depends_on = [null_resource.wait_for_cluster] } @@ -113,33 +139,14 @@ resource "kubernetes_secret" "cluster_issuer_credentials" { depends_on = [helm_release.cert-manager] } -data "template_file" "cluster_wildcard_issuer" { - template = file("./templates/cluster-issuer.yaml") - vars = { - dns = local.domain_name - cert_issuer_url = try(var.cert_issuer_config.env == "stage" ? "https://acme-staging-v02.api.letsencrypt.org/directory" : "https://acme-v02.api.letsencrypt.org/directory","https://acme-staging-v02.api.letsencrypt.org/directory") - location = var.app_region - zone_id = data.aws_route53_zone.zone.0.zone_id - secret_name = "${local.cluster_name}-cluster-issuer-creds" - email = var.cert_issuer_config.email - } - depends_on = [helm_release.cert-manager,kubernetes_namespace.monitoring] -} - resource "kubectl_manifest" "cluster_wildcard_issuer" { - yaml_body = data.template_file.cluster_wildcard_issuer.rendered -} - -data "template_file" "cluster_wildcard_certificate" { - template = file("./templates/cluster-certificate.yaml") - vars = { - dns = local.domain_name - } - depends_on = [kubectl_manifest.cluster_wildcard_issuer] + yaml_body = local.cluster_wildcard_issuer + depends_on = [kubernetes_secret.cluster_issuer_credentials] } resource "kubectl_manifest" "cluster_wildcard_certificate" { - yaml_body = data.template_file.cluster_wildcard_certificate.rendered + yaml_body = local.cluster_wildcard_certificate + depends_on = [kubectl_manifest.cluster_wildcard_issuer] } resource "kubernetes_secret_v1" "certificate_replicator" { diff --git a/k8s/aws/eks/fluentbit.tf b/k8s/aws/eks/fluentbit.tf index de10217e..5abf5280 100644 --- a/k8s/aws/eks/fluentbit.tf +++ b/k8s/aws/eks/fluentbit.tf @@ -114,30 +114,32 @@ data "aws_iam_policy_document" "fluent_bit_policy" { } } -data template_file "fluent-bit"{ 
- count = local.fluent_bit_enable ? 1 : 0 - template = file("./templates/fluent-bit-values.yaml") - vars = { - "CLUSTER_NAME" = local.cluster_name - "AWS_REGION" = var.app_region - "TAGS" = join(",", [for key, value in local.common_tags : "${key}=${value}"]) - - "HTTP_SERVER" = "On" - "HTTP_PORT" = "2020" - - "READ_FROM_HEAD" = "Off" - "READ_FROM_TAIL" = "On" - - fluent_bit_cloud_watch_enable = local.fluent_bit_cloud_watch_enable - fluent_bit_loki_outputs = jsonencode(local.fluent_bit_loki_outputs) - fluent_bit_http_outputs = jsonencode(local.fluent_bit_http_outputs) - fluent_bit_splunk_outputs = jsonencode(local.fluent_bit_splunk_outputs) - fluent_bit_datadog_outputs = jsonencode(local.fluent_bit_datadog_outputs) - fluent_bit_newrelic_outputs = jsonencode(local.fluent_bit_newrelic_outputs) - fluent_bit_slack_outputs = jsonencode(local.fluent_bit_slack_outputs) - } +locals { + fluent_bit = local.fluent_bit_enable ? templatefile( + "${path.module}/templates/fluent-bit-values.yaml", + { + CLUSTER_NAME = local.cluster_name + AWS_REGION = var.app_region + TAGS = join(",", [for key, value in local.common_tags : "${key}=${value}"]) + + HTTP_SERVER = "On" + HTTP_PORT = "2020" + + READ_FROM_HEAD = "Off" + READ_FROM_TAIL = "On" + + fluent_bit_cloud_watch_enable = local.fluent_bit_cloud_watch_enable + fluent_bit_loki_outputs = jsonencode(local.fluent_bit_loki_outputs) + fluent_bit_http_outputs = jsonencode(local.fluent_bit_http_outputs) + fluent_bit_splunk_outputs = jsonencode(local.fluent_bit_splunk_outputs) + fluent_bit_datadog_outputs = jsonencode(local.fluent_bit_datadog_outputs) + fluent_bit_newrelic_outputs = jsonencode(local.fluent_bit_newrelic_outputs) + fluent_bit_slack_outputs = jsonencode(local.fluent_bit_slack_outputs) + } + ) : null } + resource "helm_release" "fluntbit-config" { count = local.fluent_bit_enable ? 
1 : 0 repository = "https://fluent.github.io/helm-charts" @@ -147,7 +149,7 @@ resource "helm_release" "fluntbit-config" { namespace = kubernetes_namespace.monitoring.metadata.0.name values = [ - data.template_file.fluent-bit[0].rendered + local.fluent_bit ] depends_on = [ kubernetes_namespace.monitoring diff --git a/k8s/aws/eks/grafana.tf b/k8s/aws/eks/grafana.tf index 67042a42..3475c5a5 100644 --- a/k8s/aws/eks/grafana.tf +++ b/k8s/aws/eks/grafana.tf @@ -7,6 +7,43 @@ locals { grafana_enable = try(var.observability_config.grafana != null ? var.observability_config.grafana.enable : false, false) grafana_host = try(var.observability_config.grafana.url != null ? var.observability_config.grafana.url : (local.domain_name != "" && !var.public_ingress ? "grafana.${local.domain_name}" : ""), "") + grafana_template = local.grafana_enable ? templatefile( + "${path.module}/templates/grafana-values.yaml", + { + NAMESPACE = "monitoring" + GRAFANA_HOST = local.grafana_host + GRAFANA_ENABLED = local.grafana_enable + GRAFANA_TLS_HOST = "*.${local.domain_name}" + GRAFANA_OBS_ADMIN_PASSWORD = try(local.grafana_enable ? try(random_password.observability_admin.0.result, "") : "", "") + CLUSTER_NAME = var.app_name + PERSISTENCE_TYPE_DB = try(var.observability_config.grafana.persistence.type == "db" ? true : false, false) + PERSISTENCE_TYPE_PVC = try(var.observability_config.grafana.persistence.type == "pvc" ? true : false, false) + PERSISTENCE_DISK_SIZE = try(var.observability_config.grafana.persistence.disk_size != null ? var.observability_config.grafana.persistence.disk_size : "10Gi", "10Gi") + GRAFANA_DB_NAME = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "grafana" : "", "") + GRAFANA_DB_TYPE = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.rds[0].db_type : "", "") + GRAFANA_DB_HOST = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? 
module.rds[0].db_url : "", "") + GRAFANA_DB_PASSWORD = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.rds[0].db_password : "", "") + GRAFANA_DB_USER = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.rds[0].db_admin_user : "", "") + GRAFANA_MIN_REPLICA = try(var.observability_config.grafana.min_replica != null ? var.observability_config.grafana.min_replica : 1, 1) + GRAFANA_MAX_REPLICA = try(var.observability_config.grafana.max_replica != null ? var.observability_config.grafana.max_replica : 10, 10) + GRAFANA_REQUEST_MEMORY = try(var.observability_config.grafana.request_memory != null ? var.observability_config.grafana.request_memory : "100Mi", "100Mi") + GRAFANA_REQUEST_CPU = try(var.observability_config.grafana.request_cpu != null ? var.observability_config.grafana.request_cpu : "100m", "100m") + GRAFANA_LIMIT_MEMORY = try(var.observability_config.grafana.limit_memory != null ? var.observability_config.grafana.limit_memory: "500Mi", "500Mi") + GRAFANA_LIMIT_CPU = try(var.observability_config.grafana.limit_cpu != null ? var.observability_config.grafana.limit_cpu : "500m", "500m") + GRAFANA_DASHBOARD_LIMIT_MEMORY = try(var.observability_config.grafana.dashboard.limit_memory != null ? var.observability_config.grafana.dashboard.limit_memory : "512Mi", "512Mi") + GRAFANA_DASHBOARD_LIMIT_CPU = try(var.observability_config.grafana.dashboard.limit_cpu != null ? var.observability_config.grafana.dashboard.limit_cpu : "512m", "512m") + GRAFANA_DASHBOARD_REQUEST_MEMORY = try(var.observability_config.grafana.dashboard.request_memory != null ? var.observability_config.grafana.dashboard.request_memory : "256Mi", "256Mi") + GRAFANA_DASHBOARD_REQUEST_CPU = try(var.observability_config.grafana.dashboard.request_cpu != null ? 
var.observability_config.grafana.dashboard.request_cpu : "256m", "256m") + GRAFANA_DATASOURCE_LIMIT_MEMORY = try(var.observability_config.grafana.datasource.limit_memory != null ? var.observability_config.grafana.datasource.limit_memory : "512Mi", "512Mi") + GRAFANA_DATASOURCE_LIMIT_CPU = try(var.observability_config.grafana.datasource.limit_cpu != null ? var.observability_config.grafana.datasource.limit_cpu : "512m", "512m") + GRAFANA_DATASOURCE_REQUEST_MEMORY = try(var.observability_config.grafana.datasource.request_memory != null ? var.observability_config.grafana.datasource.request_memory : "256Mi", "256Mi") + GRAFANA_DATASOURCE_REQUEST_CPU = try(var.observability_config.grafana.datasource.request_cpu != null ? var.observability_config.grafana.datasource.request_cpu : "256m", "256m") + ENABLE_SSO = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? var.observability_config.grafana.configs.enable_sso : false) : false, false) + ALLOWED_DOMAINS = local.grafana_enable ? local.grafana_allowed_domains : "" + OAUTH_ID = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.aws_secretsmanager_secret_version.oauth_client_id[0].secret_string : null) : null, null) + OAUTH_SECRET = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.aws_secretsmanager_secret_version.oauth_client_secret[0].secret_string : null) : null, null) + } + ) : null } resource "random_password" "observability_admin" { @@ -15,45 +52,6 @@ resource "random_password" "observability_admin" { special = false } -data "template_file" "grafana_template" { - count = local.grafana_enable ? 
1 : 0 - template = file("./templates/grafana-values.yaml") - vars = { - NAMESPACE = "monitoring" - GRAFANA_HOST = local.grafana_host - GRAFANA_ENABLED = local.grafana_enable - GRAFANA_TLS_HOST = "*.${local.domain_name}" - GRAFANA_OBS_ADMIN_PASSWORD = try(local.grafana_enable ? try(random_password.observability_admin.0.result, "") : "", "") - CLUSTER_NAME = var.app_name - PERSISTENCE_TYPE_DB = try(var.observability_config.grafana.persistence.type == "db" ? true : false, false) - PERSISTENCE_TYPE_PVC = try(var.observability_config.grafana.persistence.type == "pvc" ? true : false, false) - PERSISTENCE_DISK_SIZE = try(var.observability_config.grafana.persistence.disk_size != null ? var.observability_config.grafana.persistence.disk_size : "10Gi", "10Gi") - GRAFANA_DB_NAME = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "grafana" : "", "") - GRAFANA_DB_TYPE = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.rds[0].db_type : "", "") - GRAFANA_DB_HOST = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.rds[0].db_url : "", "") - GRAFANA_DB_PASSWORD = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.rds[0].db_password : "", "") - GRAFANA_DB_USER = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.rds[0].db_admin_user : "", "") - GRAFANA_MIN_REPLICA = try(var.observability_config.grafana.min_replica != null ? var.observability_config.grafana.min_replica : 1, 1) - GRAFANA_MAX_REPLICA = try(var.observability_config.grafana.max_replica != null ? var.observability_config.grafana.max_replica : 10, 10) - GRAFANA_REQUEST_MEMORY = try(var.observability_config.grafana.request_memory != null ? var.observability_config.grafana.request_memory : "100Mi", "100Mi") - GRAFANA_REQUEST_CPU = try( var.observability_config.grafana.request_cpu != null ? 
var.observability_config.grafana.request_cpu : "100m", "100m") - GRAFANA_LIMIT_MEMORY = try(var.observability_config.grafana.limit_memory != null ? var.observability_config.grafana.limit_memory: "500Mi", "500Mi") - GRAFANA_LIMIT_CPU = try( var.observability_config.grafana.limit_cpu != null ? var.observability_config.grafana.limit_cpu : "500m", "500m") - GRAFANA_DASHBOARD_LIMIT_MEMORY = try(var.observability_config.grafana.dashboard.limit_memory != null ? var.observability_config.grafana.dashboard.limit_memory : "512Mi", "512Mi") - GRAFANA_DASHBOARD_LIMIT_CPU = try(var.observability_config.grafana.dashboard.limit_cpu != null ? var.observability_config.grafana.dashboard.limit_cpu : "512m", "512m") - GRAFANA_DASHBOARD_REQUEST_MEMORY = try(var.observability_config.grafana.dashboard.request_memory != null ? var.observability_config.grafana.dashboard.request_memory : "256Mi", "256Mi") - GRAFANA_DASHBOARD_REQUEST_CPU = try(var.observability_config.grafana.dashboard.request_cpu != null ? var.observability_config.grafana.dashboard.request_cpu : "256m", "256m") - GRAFANA_DATASOURCE_LIMIT_MEMORY = try(var.observability_config.grafana.datasource.limit_memory != null ? var.observability_config.grafana.datasource.limit_memory : "512Mi", "512Mi") - GRAFANA_DATASOURCE_LIMIT_CPU = try(var.observability_config.grafana.datasource.limit_cpu != null ? var.observability_config.grafana.datasource.limit_cpu : "512m", "512m") - GRAFANA_DATASOURCE_REQUEST_MEMORY = try(var.observability_config.grafana.datasource.request_memory != null ? var.observability_config.grafana.datasource.request_memory : "256Mi", "256Mi") - GRAFANA_DATASOURCE_REQUEST_CPU = try(var.observability_config.grafana.datasource.request_cpu != null ? var.observability_config.grafana.datasource.request_cpu : "256m", "256m") - ENABLE_SSO = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? 
var.observability_config.grafana.configs.enable_sso : false) :false, false) - ALLOWED_DOMAINS = local.grafana_enable ? local.grafana_allowed_domains : "" - OAUTH_ID = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.aws_secretsmanager_secret_version.oauth_client_id[0].secret_string : null) : null, null) - OAUTH_SECRET = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.aws_secretsmanager_secret_version.oauth_client_secret[0].secret_string : null) : null, null) - } -} - resource "helm_release" "grafana" { count = local.grafana_enable ? 1 : 0 chart = "grafana" @@ -65,7 +63,7 @@ resource "helm_release" "grafana" { repository = "https://grafana.github.io/helm-charts" values = [ - data.template_file.grafana_template[count.index].rendered + local.grafana_template ] depends_on = [helm_release.prometheus, kubernetes_storage_class.gp3_default] } @@ -175,9 +173,9 @@ data "aws_secretsmanager_secret_version" "oauth_client_secret" { } module "rds" { - source = "../../../sql/aws-rds" + source = "../../../sql/aws-rds" - count = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? 1 : 0, 0) + count = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? 
1 : 0, 0) cluster_name = local.cluster_name namespace = "monitoring" @@ -186,9 +184,9 @@ module "rds" { vpc_id = local.vpc_id ext_rds_sg_cidr_block = local.ext_rds_sg_cidr_block rds_name = "${local.cluster_name}-monitoring-sql-db" - read_replica = false - admin_user = "postgresadmin" - databases = ["grafana"] + read_replica = false + admin_user = "postgresadmin" + databases = ["grafana"] rds_type = "postgresql" allocated_storage = 10 instance_class = "db.t3.small" @@ -200,6 +198,5 @@ module "rds" { monitoring_interval = 0 log_min_duration_statement = -1 postgresql_engine_version = "13.7" - - tags = local.common_tags + tags = local.common_tags } \ No newline at end of file diff --git a/k8s/aws/eks/k8s-events.tf b/k8s/aws/eks/k8s-events.tf index 1ff60c42..ee1a3a5d 100644 --- a/k8s/aws/eks/k8s-events.tf +++ b/k8s/aws/eks/k8s-events.tf @@ -1,6 +1,7 @@ locals { # Kubernetes event exporter local configs - enable_k8s_event_exporter = try(var.observability_config.kubernetes_event_exporter.enable != null ? var.observability_config.kubernetes_event_exporter.enable : false, false) + enable_k8s_event_exporter = try(var.observability_config.kubernetes_event_exporter.enable != null ? + var.observability_config.kubernetes_event_exporter.enable : false, false) loki_receivers = try([ for receiver in var.observability_config.kubernetes_event_exporter.loki_receivers : { name = receiver.name @@ -22,32 +23,62 @@ locals { ], []) - observability_loki_recievers = local.enable_loki ? [{ - name = local.cluster_name - url = "http://loki-distributor.loki:3100/loki/api/v1/push" - header_key = "X-Scope-OrgID" - header_value = random_uuid.grafana_standard_datasource_header_value.result - cluster_id = "" - }] : [] + observability_loki_recievers = local.enable_loki ? 
[ + { + name = local.cluster_name + url = "http://loki-distributor.loki:3100/loki/api/v1/push" + header_key = "X-Scope-OrgID" + header_value = random_uuid.grafana_standard_datasource_header_value.result + cluster_id = "" + } + ] : [] - all_loki_receivers = concat(local.loki_receivers,local.observability_loki_recievers) -} + all_loki_receivers = concat(local.loki_receivers, local.observability_loki_recievers) -data "template_file" "k8s_event_exporter" { - count = local.enable_k8s_event_exporter || local.enable_loki ? 1 : 0 - - template = file("./templates/event-exporter-values.yaml") - vars = { - CLUSTER_NAME = local.cluster_name - LOG_LEVEL = try(var.observability_config.kubernetes_event_exporter.log_level != null ? var.observability_config.kubernetes_event_exporter.log_level : "error" , "error") - MAX_EVENT_AGE_SECONDS = try(var.observability_config.kubernetes_event_exporter.max_event_age_second != null ? var.observability_config.kubernetes_event_exporter.max_event_age_second : "150" , "150") - LOKI_RECEIVER_CONFIGS = jsonencode(local.all_loki_receivers) - WEBHOOK_RECEIVER_CONFIGS = jsonencode(local.webhook_receivers) - LIMIT_CPU = try(var.observability_config.kubernetes_event_exporter.resource.limit_cpu != null ? var.observability_config.kubernetes_event_exporter.resource.limit_cpu : "400m", "400m") - LIMIT_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.limit_memory != null ? var.observability_config.kubernetes_event_exporter.resource.limit_memory : "250Mi", "250Mi") - REQUEST_CPU = try(var.observability_config.kubernetes_event_exporter.resource.request_cpu != null ? var.observability_config.kubernetes_event_exporter.resource.request_cpu : "100m", "100m") - REQUEST_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.request_memory != null ? var.observability_config.kubernetes_event_exporter.resource.request_memory : "100Mi", "100Mi") - } + k8s_event_exporter = (local.enable_k8s_event_exporter || local.enable_loki) ? 
templatefile( + "${path.module}/templates/event-exporter-values.yaml", + { + CLUSTER_NAME = local.cluster_name + LOG_LEVEL = try( + var.observability_config.kubernetes_event_exporter.log_level != null + ? var.observability_config.kubernetes_event_exporter.log_level + : "error", + "error" + ) + MAX_EVENT_AGE_SECONDS = try( + var.observability_config.kubernetes_event_exporter.max_event_age_second != null + ? var.observability_config.kubernetes_event_exporter.max_event_age_second + : "150", + "150" + ) + LOKI_RECEIVER_CONFIGS = jsonencode(local.all_loki_receivers) + WEBHOOK_RECEIVER_CONFIGS = jsonencode(local.webhook_receivers) + LIMIT_CPU = try( + var.observability_config.kubernetes_event_exporter.resource.limit_cpu != null + ? var.observability_config.kubernetes_event_exporter.resource.limit_cpu + : "400m", + "400m" + ) + LIMIT_MEMORY = try( + var.observability_config.kubernetes_event_exporter.resource.limit_memory != null + ? var.observability_config.kubernetes_event_exporter.resource.limit_memory + : "250Mi", + "250Mi" + ) + REQUEST_CPU = try( + var.observability_config.kubernetes_event_exporter.resource.request_cpu != null + ? var.observability_config.kubernetes_event_exporter.resource.request_cpu + : "100m", + "100m" + ) + REQUEST_MEMORY = try( + var.observability_config.kubernetes_event_exporter.resource.request_memory != null + ? 
var.observability_config.kubernetes_event_exporter.resource.request_memory + : "100Mi", + "100Mi" + ) + } + ) : null } resource "helm_release" "kubernetes_event_exporter" { @@ -60,6 +91,6 @@ resource "helm_release" "kubernetes_event_exporter" { namespace = helm_release.prometheus[0].namespace values = [ - data.template_file.k8s_event_exporter[count.index].rendered + local.k8s_event_exporter ] } \ No newline at end of file diff --git a/k8s/aws/eks/kong-consumer.tf b/k8s/aws/eks/kong-consumer.tf index 028bc1fd..a613a5cf 100644 --- a/k8s/aws/eks/kong-consumer.tf +++ b/k8s/aws/eks/kong-consumer.tf @@ -115,7 +115,7 @@ # #resource "kubectl_manifest" "consumer" { # for_each = {for k in local.kong_consumer_list : k.name => k} -# yaml_body = data.template_file.consumer_template[each.key].rendered +# yaml_body = local.consumer_template[each.key] # depends_on = [kubernetes_secret.kong_acl_group] #} # @@ -133,6 +133,6 @@ # #resource "kubectl_manifest" "acl_allow_group" { # for_each = {for k in local.kong_acl_list : k.name => k} -# yaml_body = data.template_file.acl_template[each.key].rendered +# yaml_body = local.acl_template[each.key] # depends_on = [kubectl_manifest.consumer] #} diff --git a/k8s/aws/eks/main.tf b/k8s/aws/eks/main.tf index c261458a..f671dc31 100644 --- a/k8s/aws/eks/main.tf +++ b/k8s/aws/eks/main.tf @@ -64,8 +64,8 @@ module "eks" { resources = ["secrets"] } - // Cluster endpoint should not have public access - cluster_endpoint_private_access = false + // Cluster endpoint access + cluster_endpoint_private_access = true cluster_endpoint_public_access = true self_managed_node_group_defaults = { @@ -82,7 +82,7 @@ module "eks" { desired_size = var.node_config.min_count min_size = var.node_config.min_count max_size = var.node_config.max_count - bootstrap_extra_args = "--container-runtime containerd" + bootstrap_extra_args = "" # vpc_security_group_ids = var.internal_loadbalancer ? 
[aws_security_group.worker_group_mgmt.id] : [aws_security_group.external_worker_group_mgmt.id] # target_group_arns = var.public_ingress ? [aws_lb_target_group.cluster_tg.0.arn,aws_lb_target_group.kong_tg_admin.0.arn] : (var.public_app ? [aws_lb_target_group.cluster_alb_tg.0.arn] : [aws_lb_target_group.cluster_nlb_tg.0.arn]) # user_data_template_path = file("./templates/user-data.tpl") diff --git a/k8s/aws/eks/observability.tf b/k8s/aws/eks/observability.tf index e5379955..9c242cec 100644 --- a/k8s/aws/eks/observability.tf +++ b/k8s/aws/eks/observability.tf @@ -3,10 +3,11 @@ locals { enable_tempo = try(var.observability_config.tempo != null ? var.observability_config.tempo.enable : false, false) enable_cortex = try(var.observability_config.cortex != null ? var.observability_config.cortex.enable : false, false) enable_mimir = try(var.observability_config.mimir != null ? var.observability_config.mimir.enable : false, false) + enable_openobserve = try(var.observability_config.openobserve != null ? length(var.observability_config.openobserve) > 0 && anytrue([for instance in var.observability_config.openobserve : instance.enable]) : false, false) } module "observability" { - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 source = "../../../observability/aws" app_name = var.app_name @@ -21,11 +22,12 @@ module "observability" { tempo = var.observability_config.tempo cortex = var.observability_config.cortex mimir = var.observability_config.mimir + openobserve = try(var.observability_config.openobserve, []) depends_on = [helm_release.prometheus, helm_release.k8s_replicator] } resource "aws_iam_policy" "observability_s3_iam_policy" { - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 
1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 name = "observability-${local.environment}-policy" description = "IAM policy for Observability Cluster to access S3" @@ -78,29 +80,29 @@ resource "aws_iam_policy" "observability_s3_iam_policy" { } resource "aws_iam_user" "observability_s3_user" { - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 name = "${local.cluster_name}-s3-user" tags = local.common_tags } resource "aws_iam_user_policy_attachment" "observability_s3_attach" { - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 user = aws_iam_user.observability_s3_user.0.name policy_arn = aws_iam_policy.observability_s3_iam_policy.0.arn } resource "aws_iam_access_key" "observability_s3_user"{ - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 user = aws_iam_user.observability_s3_user.0.name } resource "aws_secretsmanager_secret" "observability_s3_user" { - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 name = "${local.cluster_name}-s3-user-secret-key" } resource "aws_secretsmanager_secret_version" "observability_s3_user"{ - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 
1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 secret_id = aws_secretsmanager_secret.observability_s3_user.0.id secret_string = jsonencode({ username = aws_iam_user.observability_s3_user.0.name, access_key = aws_iam_access_key.observability_s3_user.0.user, access_secret = aws_iam_access_key.observability_s3_user.0.secret }) diff --git a/k8s/aws/eks/outputs.tf b/k8s/aws/eks/outputs.tf index 0cc3907f..f2e82d18 100644 --- a/k8s/aws/eks/outputs.tf +++ b/k8s/aws/eks/outputs.tf @@ -158,4 +158,10 @@ output "grafana_datasources" { output "lbip" { value = data.kubernetes_service.ingress-controller.status.0.load_balancer.0.ingress.0.hostname +} + +output "openobserve_instances" { + description = "OpenObserve instances with URL, username, and password grouped together" + value = try(module.observability[0].openobserve_instances, {}) + sensitive = true } \ No newline at end of file diff --git a/k8s/aws/eks/prometheus.tf b/k8s/aws/eks/prometheus.tf index a68aa3ac..b06950b5 100644 --- a/k8s/aws/eks/prometheus.tf +++ b/k8s/aws/eks/prometheus.tf @@ -62,38 +62,77 @@ locals{ }] : [] remote_write_config = concat(local.remote_write_config_list, local.default_remote_write_config) -} - -data "template_file" "prom_template" { - count = local.prometheus_enable ? 1 : 0 - template = file("./templates/prometheus-values.yaml") - vars = { - PROMETHEUS_DISK_SIZE = try(var.observability_config.prometheus.persistence.disk_size != null ? var.observability_config.prometheus.persistence.disk_size : "50Gi", "50Gi") - PROMETHEUS_RETENTION_SIZE = try(var.observability_config.prometheus.persistence.retention_size != null ? var.observability_config.prometheus.persistence.retention_size : "45GB", "45GB") - PROMETHEUS_RETENTION_DURATION = try(var.observability_config.prometheus.persistence.retention_duration != null ? 
var.observability_config.prometheus.persistence.retention_duration : "10d", "10d") - REMOTE_WRITE_CONFIGS = jsonencode(local.remote_write_config) - CLUSTER_NAME = local.cluster_name - ALERTS_ENABLED = jsonencode(local.cluster_moogsoft_alerts) != "" || jsonencode(local.namespace_teams_webhook) != "" || jsonencode(local.cluster_teams_alerts) != "" || jsonencode(local.google_chat_alerts) != "" || jsonencode(local.cluster_slack_alerts) != "" || jsonencode(local.cluster_webhook_alerts) != "" ? true : false - MOOGSOFT_ALERTS_ENABLED = local.cluster_moogsoft_alerts == {} ? false : true - MS_TEAMS_ALERT_ENABLED = jsonencode(local.namespace_teams_webhook) == "" && jsonencode(local.cluster_teams_alerts) == "" ? false : true - MOOGSOFT_ENDPOINT_URL = jsonencode(local.cluster_moogsoft_alerts) - MOOGSOFT_ENDPOINT_API_KEY = var.moogsoft_endpoint_api_key - MOOGSOFT_USERNAME = var.moogsoft_username - teams_webhook_alerts = jsonencode(local.cluster_alerts) - cluster_teams_alerts = jsonencode(local.cluster_alerts_webhook) - cluster_moogsoft_alerts = jsonencode(local.cluster_moogsoft_alerts) - GOOGLE_CHAT_ALERTS_ENABLED = local.google_chat_alerts == "" ? false : true - GOOGLE_CHAT_CONFIGS = jsonencode(local.google_chat_alerts) - PAGER_DUTY_ALERTS_ENABLED = local.cluster_pagerduty_alerts == "" ? false : true - PAGER_DUTY_KEY = var.pagerduty_integration_key - PAGER_DUTY_ENDPOINT_URL = jsonencode(local.cluster_pagerduty_alerts) - GRAFANA_HOST = local.grafana_enable ? local.grafana_host : "" - SLACK_CHAT_ALERTS_ENABLED = local.cluster_slack_alerts == "" ? false : true - WEBHOOK_ALERTS_ENABLED = local.cluster_webhook_alerts == "" ? false : true - SLACK_CONFIGS = jsonencode(local.cluster_slack_alerts) - WEBHOOK_CONFIGS = jsonencode(local.cluster_webhook_alerts) - } + prom_template = local.prometheus_enable ? templatefile( + "${path.module}/templates/prometheus-values.yaml", + { + PROMETHEUS_DISK_SIZE = try( + var.observability_config.prometheus.persistence.disk_size != null + ? 
var.observability_config.prometheus.persistence.disk_size + : "50Gi", + "50Gi" + ) + PROMETHEUS_RETENTION_SIZE = try( + var.observability_config.prometheus.persistence.retention_size != null + ? var.observability_config.prometheus.persistence.retention_size + : "45GB", + "45GB" + ) + PROMETHEUS_RETENTION_DURATION = try( + var.observability_config.prometheus.persistence.retention_duration != null + ? var.observability_config.prometheus.persistence.retention_duration + : "10d", + "10d" + ) + REMOTE_WRITE_CONFIGS = jsonencode(local.remote_write_config) + CLUSTER_NAME = local.cluster_name + ALERTS_ENABLED = ( + jsonencode(local.cluster_moogsoft_alerts) != "" || + jsonencode(local.namespace_teams_webhook) != "" || + jsonencode(local.cluster_teams_alerts) != "" || + jsonencode(local.google_chat_alerts) != "" || + jsonencode(local.cluster_slack_alerts) != "" || + jsonencode(local.cluster_webhook_alerts) != "" + ) ? true : false + MOOGSOFT_ALERTS_ENABLED = local.cluster_moogsoft_alerts == {} ? false : true + MS_TEAMS_ALERT_ENABLED = jsonencode(local.namespace_teams_webhook) == "" && jsonencode(local.cluster_teams_alerts) == "" ? false : true + MOOGSOFT_ENDPOINT_URL = jsonencode(local.cluster_moogsoft_alerts) + MOOGSOFT_ENDPOINT_API_KEY = var.moogsoft_endpoint_api_key + MOOGSOFT_USERNAME = var.moogsoft_username + teams_webhook_alerts = jsonencode(local.cluster_alerts) + cluster_teams_alerts = jsonencode(local.cluster_alerts_webhook) + cluster_moogsoft_alerts = jsonencode(local.cluster_moogsoft_alerts) + GOOGLE_CHAT_ALERTS_ENABLED = local.google_chat_alerts == "" ? false : true + GOOGLE_CHAT_CONFIGS = jsonencode(local.google_chat_alerts) + PAGER_DUTY_ALERTS_ENABLED = local.cluster_pagerduty_alerts == "" ? false : true + PAGER_DUTY_KEY = var.pagerduty_integration_key + PAGER_DUTY_ENDPOINT_URL = jsonencode(local.cluster_pagerduty_alerts) + GRAFANA_HOST = local.grafana_enable ? local.grafana_host : "" + SLACK_CHAT_ALERTS_ENABLED = local.cluster_slack_alerts == "" ? 
false : true + WEBHOOK_ALERTS_ENABLED = local.cluster_webhook_alerts == "" ? false : true + SLACK_CONFIGS = jsonencode(local.cluster_slack_alerts) + WEBHOOK_CONFIGS = jsonencode(local.cluster_webhook_alerts) + } + ) : "" + + cluster_alert_config = templatefile( + "${path.module}/templates/cluster-level-alerts.yaml", + { + cluster_memory_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_underutilisation != null ? var.cluster_alert_thresholds.memory_underutilisation : 20) + cluster_cpu_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.cpu_underutilisation != null ? var.cluster_alert_thresholds.cpu_underutilisation : 20) + cluster_node_count_max_value = var.node_config.max_count + cluster_node_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.node_count != null ? var.cluster_alert_thresholds.node_count : 80) + cluster_pod_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.pod_count != null ? var.cluster_alert_thresholds.pod_count : 80) + cluster_total_cpu_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cpu_utilisation != null ? var.cluster_alert_thresholds.cpu_utilisation : 80) + cluster_total_memory_utilization_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_utilisation != null ? var.cluster_alert_thresholds.memory_utilisation : 20) + cluster_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.disk_utilization != null ? var.cluster_alert_thresholds.disk_utilization : 80) + cluster_name = local.cluster_name + cortex_enabled = try(var.observability_config.cortex == null ? false : var.observability_config.cortex.enable, false) + nginx_5xx_percentage_threshold = var.cluster_alert_thresholds == null ? 
5 : (var.cluster_alert_thresholds.nginx_5xx_percentage_threshold != null ? var.cluster_alert_thresholds.nginx_5xx_percentage_threshold : 5) + cortex_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cortex_disk_utilization_threshold != null ? var.cluster_alert_thresholds.cortex_disk_utilization_threshold : 80) + prometheus_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.prometheus_disk_utilization_threshold != null ? var.cluster_alert_thresholds.prometheus_disk_utilization_threshold : 80) + } + ) } resource "helm_release" "prometheus" { @@ -108,7 +147,7 @@ resource "helm_release" "prometheus" { repository = "https://prometheus-community.github.io/helm-charts" values = [ - data.template_file.prom_template[count.index].rendered + local.prom_template ] } @@ -128,27 +167,8 @@ resource "helm_release" "alerts_teams" { depends_on = [helm_release.prometheus] } -data "template_file" "cluster-alerts" { - template = file("./templates/cluster-level-alerts.yaml") - vars = { - cluster_memory_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_underutilisation != null ? var.cluster_alert_thresholds.memory_underutilisation : 20) - cluster_cpu_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.cpu_underutilisation != null ? var.cluster_alert_thresholds.cpu_underutilisation : 20) - cluster_node_count_max_value = var.node_config.max_count - cluster_node_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.node_count != null ? var.cluster_alert_thresholds.node_count : 80) - cluster_pod_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.pod_count != null ? 
var.cluster_alert_thresholds.pod_count: 80) - cluster_total_cpu_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cpu_utilisation != null ? var.cluster_alert_thresholds.cpu_utilisation: 80) - cluster_total_memory_utilization_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_utilisation != null ? var.cluster_alert_thresholds.memory_utilisation: 20) - cluster_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.disk_utilization != null ? var.cluster_alert_thresholds.disk_utilization: 80) - cluster_name = local.cluster_name - cortex_enabled = try(var.observability_config.cortex == null ? false : var.observability_config.cortex.enable, false) - nginx_5xx_percentage_threshold = var.cluster_alert_thresholds == null ? 5 : (var.cluster_alert_thresholds.nginx_5xx_percentage_threshold != null ? var.cluster_alert_thresholds.nginx_5xx_percentage_threshold: 5) - cortex_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cortex_disk_utilization_threshold != null ? var.cluster_alert_thresholds.cortex_disk_utilization_threshold : 80) - prometheus_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.prometheus_disk_utilization_threshold != null ? var.cluster_alert_thresholds.prometheus_disk_utilization_threshold : 80) - } -} - resource "kubectl_manifest" "cluster-alerts" { count = local.prometheus_enable ? 
1 : 0 - yaml_body = data.template_file.cluster-alerts.rendered + yaml_body = local.cluster_alert_config depends_on = [helm_release.prometheus] } diff --git a/k8s/aws/eks/reloader.tf b/k8s/aws/eks/reloader.tf index ed503948..403dc2a2 100644 --- a/k8s/aws/eks/reloader.tf +++ b/k8s/aws/eks/reloader.tf @@ -1,7 +1,8 @@ -data "template_file" "reloader_template" { - template = file("${path.module}/templates/reloader-values.yaml") - vars = { - } +locals { + reloader_template = templatefile( + "${path.module}/templates/reloader-values.yaml", + {} + ) } resource "helm_release" "reloader" { @@ -11,7 +12,7 @@ resource "helm_release" "reloader" { version = "1.0.60" values = [ - data.template_file.reloader_template.rendered + local.reloader_template ] depends_on = [ diff --git a/k8s/aws/eks/sg.tf b/k8s/aws/eks/sg.tf index 43edf1d1..8a085c20 100644 --- a/k8s/aws/eks/sg.tf +++ b/k8s/aws/eks/sg.tf @@ -32,13 +32,6 @@ resource "aws_security_group" "external_worker_group_mgmt" { cidr_blocks = ["0.0.0.0/0"] } - ingress { - from_port = 22 - to_port = 22 - protocol = "tcp" - cidr_blocks = ["0.0.0.0/0"] - } - egress { from_port = 0 to_port = 0 @@ -59,7 +52,7 @@ resource "aws_security_group" "all_worker_mgmt" { protocol = "tcp" cidr_blocks = [ - "0.0.0.0/0" + "10.0.0.0/8" ] } diff --git a/k8s/aws/eks/templates/user-data.tpl b/k8s/aws/eks/templates/user-data.tpl index 10781070..77e3a052 100644 --- a/k8s/aws/eks/templates/user-data.tpl +++ b/k8s/aws/eks/templates/user-data.tpl @@ -1,5 +1,5 @@ cd ~/ -api_key="2a374411747e464fda7380af2a20264e89ec57393a36cf30cc2f66125f4e5d7a" +api_key="${nessus_api_key}" sudo unzip /tmp/agentPackages.zip -d /tmp sudo chmod +x /tmp/install.sh sudo /tmp/install.sh -i diff --git a/k8s/aws/eks/vars.tf b/k8s/aws/eks/vars.tf index ca13721c..c3f60b6b 100644 --- a/k8s/aws/eks/vars.tf +++ b/k8s/aws/eks/vars.tf @@ -722,6 +722,20 @@ variable "observability_config" { metrics_ingestion_time_range_slack = optional(string) })) })) + openobserve = optional(list(object({ + 
enable = bool + name = string + replicaCount = optional(number, 2) + min_cpu = optional(string, "250m") + max_cpu = optional(string, "1") + min_memory = optional(string, "1Gi") + max_memory = optional(string, "2Gi") + enable_ingress = optional(bool, true) + env = optional(list(object({ + name = string + value = string + })), []) + })), []) }) default = null } diff --git a/k8s/aws/eks/velero.tf b/k8s/aws/eks/velero.tf index 99b3282a..8e429ad5 100644 --- a/k8s/aws/eks/velero.tf +++ b/k8s/aws/eks/velero.tf @@ -23,12 +23,12 @@ resource "aws_iam_user_policy" "velero" { "s3:AbortMultipartUpload", "s3:ListMultipartUploadParts" ], - Resource = "arn:aws:s3:::*/*" + Resource = "arn:aws:s3:::k8s-resource-backups/*" }, { Effect = "Allow", Action = ["s3:ListBucket"], - Resource = "arn:aws:s3:::*" + Resource = "arn:aws:s3:::k8s-resource-backups" }, { Effect = "Allow", @@ -52,19 +52,19 @@ resource "aws_iam_access_key" "velero" { user = aws_iam_user.velero[0].name } -data "template_file" "velero_values" { - count = var.velero_enabled ? 1 : 0 - - template = file("${path.module}/templates/velero-values.yaml") - - vars = { - access_key = aws_iam_access_key.velero[0].id - secret_access_key = aws_iam_access_key.velero[0].secret - bucket_name = "k8s-resource-backups" - region = var.app_region - } +locals { + velero_values = var.velero_enabled ? templatefile( + "${path.module}/templates/velero-values.yaml", + { + access_key = aws_iam_access_key.velero[0].id + secret_access_key = aws_iam_access_key.velero[0].secret + bucket_name = "k8s-resource-backups" + region = var.app_region + } + ) : null } + resource "helm_release" "velero" { count = var.velero_enabled ? 
1 : 0 name = "velero" @@ -75,7 +75,7 @@ resource "helm_release" "velero" { create_namespace = true depends_on = [module.eks] - values = [data.template_file.velero_values[0].rendered] + values = [local.velero_values] } resource "time_sleep" "wait_for_velero" { diff --git a/k8s/aws/namespace/issuer.tf b/k8s/aws/namespace/issuer.tf index 416f55a3..246e6c5e 100644 --- a/k8s/aws/namespace/issuer.tf +++ b/k8s/aws/namespace/issuer.tf @@ -1,15 +1,22 @@ -data "template_file" "issuer" { - template = file("./templates/issuer.yaml") - vars = { - namespace = kubernetes_namespace.app_environments.metadata[0].name - cert_issuer_url = try(var.cert_issuer_config.env == "stage" ? "https://acme-staging-v02.api.letsencrypt.org/directory" : "https://acme-v02.api.letsencrypt.org/directory","https://acme-staging-v02.api.letsencrypt.org/directory") - email = var.cert_issuer_config.email - } +locals { + issuer = templatefile( + "${path.module}/templates/issuer.yaml", + { + namespace = kubernetes_namespace.app_environments.metadata[0].name + cert_issuer_url = try( + var.cert_issuer_config.env == "stage" + ? "https://acme-staging-v02.api.letsencrypt.org/directory" + : "https://acme-v02.api.letsencrypt.org/directory", + "https://acme-staging-v02.api.letsencrypt.org/directory" + ) + email = var.cert_issuer_config.email + } + ) } resource "kubectl_manifest" "namespace_issuer" { count = length(local.service_custom_domain_list) != 0 ? 
1 : 0 - yaml_body = data.template_file.issuer.rendered + yaml_body = local.issuer } resource "kubernetes_secret_v1" "namespace-cert-replicator" { diff --git a/k8s/azure/aks/cert-manager.tf b/k8s/azure/aks/cert-manager.tf index 68fca2d9..6450462e 100644 --- a/k8s/azure/aks/cert-manager.tf +++ b/k8s/azure/aks/cert-manager.tf @@ -1,5 +1,34 @@ -data "template_file" "cert_manager_template" { - template = file("./templates/cert-manager-values.yaml") +locals { + cert_manager_template = templatefile( + "${path.module}/templates/cert-manager-values.yaml", + {} + ) + + cluster_wildcard_issuer = templatefile( + "${path.module}/templates/cluster-issuer.yaml", + { + DNS = var.accessibility.domain_name + cert_issuer_url = try( + var.cert_issuer_config.env == "stage" ? + "https://acme-staging-v02.api.letsencrypt.org/directory" : + "https://acme-v02.api.letsencrypt.org/directory", + "https://acme-staging-v02.api.letsencrypt.org/directory" + ) + location = var.app_region + RESOURCE_GROUP_NAME = var.resource_group_name + SUBSCRIPTION_ID = data.azurerm_subscription.current.subscription_id + CLIENT_ID = data.azurerm_kubernetes_cluster.cluster.kubelet_identity[0].client_id + email = var.cert_issuer_config.email + dns_zone_list = join(",", var.dns_zone_list) + } + ) + + cluster_wildcard_certificate = templatefile( + "${path.module}/templates/cluster-certificate.yaml", + { + dns = local.domain_name + } + ) } resource "azurerm_role_assignment" "cert-manager" { @@ -37,38 +66,17 @@ resource "helm_release" "cert_manager" { value = "true" } - values = [data.template_file.cert_manager_template.rendered] -} - -data "template_file" "cluster_wildcard_issuer" { - template = file("./templates/cluster-issuer.yaml") - vars = { - DNS = var.accessibility.domain_name - cert_issuer_url = try(var.cert_issuer_config.env == "stage" ? 
"https://acme-staging-v02.api.letsencrypt.org/directory" : "https://acme-v02.api.letsencrypt.org/directory","https://acme-staging-v02.api.letsencrypt.org/directory") - location = var.app_region - RESOURCE_GROUP_NAME = var.resource_group_name - SUBSCRIPTION_ID = data.azurerm_subscription.current.subscription_id - CLIENT_ID = data.azurerm_kubernetes_cluster.cluster.kubelet_identity[0].client_id - email = var.cert_issuer_config.email - dns_zone_list = join(",", var.dns_zone_list) - } - depends_on = [helm_release.cert_manager, kubernetes_namespace.monitoring] + values = [local.cert_manager_template] } resource "kubectl_manifest" "cluster_wildcard_issuer" { - yaml_body = data.template_file.cluster_wildcard_issuer.rendered -} - -data "template_file" "cluster_wildcard_certificate" { - template = file("./templates/cluster-certificate.yaml") - vars = { - dns = local.domain_name - } - depends_on = [kubectl_manifest.cluster_wildcard_issuer] + yaml_body = local.cluster_wildcard_issuer + depends_on = [helm_release.cert_manager] } resource "kubectl_manifest" "cluster_wildcard_certificate" { - yaml_body = data.template_file.cluster_wildcard_certificate.rendered + yaml_body = local.cluster_wildcard_certificate + depends_on = [kubectl_manifest.cluster_wildcard_issuer] } resource "kubernetes_secret_v1" "certificate_replicator" { diff --git a/k8s/azure/aks/fluentbit.tf b/k8s/azure/aks/fluentbit.tf index 671b5bec..0a9845c6 100644 --- a/k8s/azure/aks/fluentbit.tf +++ b/k8s/azure/aks/fluentbit.tf @@ -68,28 +68,26 @@ locals { } if length(local.fluent_bit_slack) > 0 ] -} - -data template_file "fluent-bit"{ - count = local.fluent_bit_enable ? 1 : 0 - template = file("./templates/fluent-bit-values.yaml") - vars = { - "CLUSTER_NAME" = local.cluster_name - "TAGS" = join(",", [for key, value in local.common_tags : "${key}=${value}"]) + fluent_bit_values = local.fluent_bit_enable ? 
templatefile( + "${path.module}/templates/fluent-bit-values.yaml", + { + CLUSTER_NAME = local.cluster_name + TAGS = join(",", [for key, value in local.common_tags : "${key}=${value}"]) - "HTTP_SERVER" = "On" - "HTTP_PORT" = "2020" + HTTP_SERVER = "On" + HTTP_PORT = "2020" - "READ_FROM_HEAD" = "Off" - "READ_FROM_TAIL" = "On" + READ_FROM_HEAD = "Off" + READ_FROM_TAIL = "On" - fluent_bit_loki_outputs = jsonencode(local.fluent_bit_loki_outputs) - fluent_bit_http_outputs = jsonencode(local.fluent_bit_http_outputs) - fluent_bit_splunk_outputs = jsonencode(local.fluent_bit_splunk_outputs) - fluent_bit_datadog_outputs = jsonencode(local.fluent_bit_datadog_outputs) - fluent_bit_newrelic_outputs = jsonencode(local.fluent_bit_newrelic_outputs) - fluent_bit_slack_outputs = jsonencode(local.fluent_bit_slack_outputs) - } + fluent_bit_loki_outputs = jsonencode(local.fluent_bit_loki_outputs) + fluent_bit_http_outputs = jsonencode(local.fluent_bit_http_outputs) + fluent_bit_splunk_outputs = jsonencode(local.fluent_bit_splunk_outputs) + fluent_bit_datadog_outputs = jsonencode(local.fluent_bit_datadog_outputs) + fluent_bit_newrelic_outputs = jsonencode(local.fluent_bit_newrelic_outputs) + fluent_bit_slack_outputs = jsonencode(local.fluent_bit_slack_outputs) + } + ) : null } resource "helm_release" "fluentbit-config" { @@ -101,7 +99,7 @@ resource "helm_release" "fluentbit-config" { namespace = kubernetes_namespace.monitoring.metadata.0.name values = [ - data.template_file.fluent-bit[0].rendered + local.fluent_bit_values ] depends_on = [ kubernetes_namespace.monitoring diff --git a/k8s/azure/aks/grafana-dashboard.tf b/k8s/azure/aks/grafana-dashboard.tf index 9f23d7ef..5fd789ec 100644 --- a/k8s/azure/aks/grafana-dashboard.tf +++ b/k8s/azure/aks/grafana-dashboard.tf @@ -32,7 +32,7 @@ locals { users_with_roles = flatten([ for role, emails in var.grafana_access: [ - for email in emails : { + for email in coalesce(try(emails, null), []) : { email = email role = local.role_map[role] } diff 
--git a/k8s/azure/aks/grafana.tf b/k8s/azure/aks/grafana.tf index 04aff88d..d95dcd1a 100644 --- a/k8s/azure/aks/grafana.tf +++ b/k8s/azure/aks/grafana.tf @@ -5,46 +5,44 @@ locals { grafana_allowed_domains = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.domains != null ? join(",", var.observability_config.grafana.configs.domains) : "") : "", "") grafana_enable = try(var.observability_config.grafana != null ? var.observability_config.grafana.enable : false, false) grafana_host = try(var.observability_config.grafana.url != null ? var.observability_config.grafana.url : (local.domain_name != "" ? "grafana.${local.domain_name}" : ""), "") - -} - -data "template_file" "grafana_template" { - count = local.grafana_enable ? 1 : 0 - template = file("./templates/grafana-values.yaml") - vars = { - NAMESPACE = "monitoring" - GRAFANA_HOST = local.grafana_host - GRAFANA_ENABLED = local.grafana_enable - GRAFANA_TLS_HOST = "*.${local.domain_name}" - GRAFANA_OBS_ADMIN_PASSWORD = try(local.grafana_enable ? try(random_password.observability_admin.0.result, "") : "", "") - CLUSTER_NAME = var.app_name - PERSISTENCE_TYPE_DB = try(var.observability_config.grafana.persistence.type == "db" ? true : false, false) - PERSISTENCE_TYPE_PVC = try(var.observability_config.grafana.persistence.type == "pvc" ? true : false, false) - PERSISTENCE_DISK_SIZE = try(var.observability_config.grafana.persistence.disk_size != null ? var.observability_config.grafana.persistence.disk_size : "10Gi", "10Gi") - GRAFANA_DB_NAME = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "grafana" : "", "") - GRAFANA_DB_TYPE = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "postgres" : "", "") - GRAFANA_DB_HOST = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? 
module.grafana_db[0].db_url : "", "") - GRAFANA_DB_PASSWORD = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.grafana_db[0].db_password : "", "") - GRAFANA_DB_USER = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.grafana_db[0].db_admin_user : "", "") - GRAFANA_MIN_REPLICA = try(var.observability_config.grafana.min_replica != null ? var.observability_config.grafana.min_replica : 1, 1) - GRAFANA_MAX_REPLICA = try(var.observability_config.grafana.max_replica != null ? var.observability_config.grafana.max_replica : 10, 10) - GRAFANA_REQUEST_MEMORY = try(var.observability_config.grafana.request_memory != null ? var.observability_config.grafana.request_memory : "100Mi", "100Mi") - GRAFANA_REQUEST_CPU = try( var.observability_config.grafana.request_cpu != null ? var.observability_config.grafana.request_cpu : "100m", "100m") - GRAFANA_LIMIT_MEMORY = try(var.observability_config.grafana.limit_memory != null ? var.observability_config.grafana.limit_memory: "500Mi", "500Mi") - GRAFANA_LIMIT_CPU = try( var.observability_config.grafana.limit_cpu != null ? var.observability_config.grafana.limit_cpu : "500m", "500m") - GRAFANA_DASHBOARD_LIMIT_MEMORY = try(var.observability_config.grafana.dashboard.limit_memory != null ? var.observability_config.grafana.dashboard.limit_memory : "512Mi", "512Mi") - GRAFANA_DASHBOARD_LIMIT_CPU = try(var.observability_config.grafana.dashboard.limit_cpu != null ? var.observability_config.grafana.dashboard.limit_cpu : "512m", "512m") - GRAFANA_DASHBOARD_REQUEST_MEMORY = try(var.observability_config.grafana.dashboard.request_memory != null ? var.observability_config.grafana.dashboard.request_memory : "256Mi", "256Mi") - GRAFANA_DASHBOARD_REQUEST_CPU = try(var.observability_config.grafana.dashboard.request_cpu != null ? 
var.observability_config.grafana.dashboard.request_cpu : "256m", "256m") - GRAFANA_DATASOURCE_LIMIT_MEMORY = try(var.observability_config.grafana.datasource.limit_memory != null ? var.observability_config.grafana.datasource.limit_memory : "512Mi", "512Mi") - GRAFANA_DATASOURCE_LIMIT_CPU = try(var.observability_config.grafana.datasource.limit_cpu != null ? var.observability_config.grafana.datasource.limit_cpu : "512m", "512m") - GRAFANA_DATASOURCE_REQUEST_MEMORY = try(var.observability_config.grafana.datasource.request_memory != null ? var.observability_config.grafana.datasource.request_memory : "256Mi", "256Mi") - GRAFANA_DATASOURCE_REQUEST_CPU = try(var.observability_config.grafana.datasource.request_cpu != null ? var.observability_config.grafana.datasource.request_cpu : "256m", "256m") - ENABLE_SSO = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? var.observability_config.grafana.configs.enable_sso : false) :false, false) - ALLOWED_DOMAINS = local.grafana_enable ? local.grafana_allowed_domains : "" - OAUTH_ID = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.azurerm_key_vault_secret.oauth_client_id[0].value : null) : null, null) - OAUTH_SECRET = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.azurerm_key_vault_secret.oauth_client_secret[0].value : null) : null, null) - } + grafana_template = local.grafana_enable ? templatefile( + "${path.module}/templates/grafana-values.yaml", + { + NAMESPACE = "monitoring" + GRAFANA_HOST = local.grafana_host + GRAFANA_ENABLED = local.grafana_enable + GRAFANA_TLS_HOST = "*.${local.domain_name}" + GRAFANA_OBS_ADMIN_PASSWORD = try(local.grafana_enable ? 
try(random_password.observability_admin.0.result, "") : "", "") + CLUSTER_NAME = var.app_name + PERSISTENCE_TYPE_DB = try(var.observability_config.grafana.persistence.type == "db" ? true : false, false) + PERSISTENCE_TYPE_PVC = try(var.observability_config.grafana.persistence.type == "pvc" ? true : false, false) + PERSISTENCE_DISK_SIZE = try(var.observability_config.grafana.persistence.disk_size != null ? var.observability_config.grafana.persistence.disk_size : "10Gi", "10Gi") + GRAFANA_DB_NAME = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "grafana" : "", "") + GRAFANA_DB_TYPE = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "postgres" : "", "") + GRAFANA_DB_HOST = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.grafana_db[0].db_url : "", "") + GRAFANA_DB_PASSWORD = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.grafana_db[0].db_password : "", "") + GRAFANA_DB_USER = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.grafana_db[0].db_admin_user : "", "") + GRAFANA_DB_SSL_MODE = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "require" : "disable", "disable") + GRAFANA_MIN_REPLICA = try(var.observability_config.grafana.min_replica != null ? var.observability_config.grafana.min_replica : 1, 1) + GRAFANA_MAX_REPLICA = try(var.observability_config.grafana.max_replica != null ? var.observability_config.grafana.max_replica : 10, 10) + GRAFANA_REQUEST_MEMORY = try(var.observability_config.grafana.request_memory != null ? var.observability_config.grafana.request_memory : "100Mi", "100Mi") + GRAFANA_REQUEST_CPU = try(var.observability_config.grafana.request_cpu != null ? 
var.observability_config.grafana.request_cpu : "100m", "100m") + GRAFANA_LIMIT_MEMORY = try(var.observability_config.grafana.limit_memory != null ? var.observability_config.grafana.limit_memory: "500Mi", "500Mi") + GRAFANA_LIMIT_CPU = try(var.observability_config.grafana.limit_cpu != null ? var.observability_config.grafana.limit_cpu : "500m", "500m") + GRAFANA_DASHBOARD_LIMIT_MEMORY = try(var.observability_config.grafana.dashboard.limit_memory != null ? var.observability_config.grafana.dashboard.limit_memory : "512Mi", "512Mi") + GRAFANA_DASHBOARD_LIMIT_CPU = try(var.observability_config.grafana.dashboard.limit_cpu != null ? var.observability_config.grafana.dashboard.limit_cpu : "512m", "512m") + GRAFANA_DASHBOARD_REQUEST_MEMORY = try(var.observability_config.grafana.dashboard.request_memory != null ? var.observability_config.grafana.dashboard.request_memory : "256Mi", "256Mi") + GRAFANA_DASHBOARD_REQUEST_CPU = try(var.observability_config.grafana.dashboard.request_cpu != null ? var.observability_config.grafana.dashboard.request_cpu : "256m", "256m") + GRAFANA_DATASOURCE_LIMIT_MEMORY = try(var.observability_config.grafana.datasource.limit_memory != null ? var.observability_config.grafana.datasource.limit_memory : "512Mi", "512Mi") + GRAFANA_DATASOURCE_LIMIT_CPU = try(var.observability_config.grafana.datasource.limit_cpu != null ? var.observability_config.grafana.datasource.limit_cpu : "512m", "512m") + GRAFANA_DATASOURCE_REQUEST_MEMORY = try(var.observability_config.grafana.datasource.request_memory != null ? var.observability_config.grafana.datasource.request_memory : "256Mi", "256Mi") + GRAFANA_DATASOURCE_REQUEST_CPU = try(var.observability_config.grafana.datasource.request_cpu != null ? var.observability_config.grafana.datasource.request_cpu : "256m", "256m") + ENABLE_SSO = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? 
var.observability_config.grafana.configs.enable_sso : false) : false, false) + ALLOWED_DOMAINS = local.grafana_enable ? local.grafana_allowed_domains : "" + OAUTH_ID = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.azurerm_key_vault_secret.oauth_client_id[0].value : null) : null, null) + OAUTH_SECRET = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.azurerm_key_vault_secret.oauth_client_secret[0].value : null) : null, null) + } + ) : null } resource "helm_release" "grafana" { @@ -58,7 +56,7 @@ resource "helm_release" "grafana" { repository = "https://grafana.github.io/helm-charts" values = [ - data.template_file.grafana_template[count.index].rendered + local.grafana_template ] depends_on = [helm_release.prometheus] } @@ -148,6 +146,7 @@ module "grafana_db" { sku_name = "GP_Standard_D2s_v3" storage_mb = 32768 key_vault_id = azurerm_key_vault.secrets.id + vpc = var.vpc tags = local.common_tags } diff --git a/k8s/azure/aks/k8s-events.tf b/k8s/azure/aks/k8s-events.tf index a51a089e..49677cdb 100644 --- a/k8s/azure/aks/k8s-events.tf +++ b/k8s/azure/aks/k8s-events.tf @@ -30,23 +30,21 @@ locals { }] : [] all_loki_receivers = concat(local.loki_receivers,local.observability_loki_recievers) -} - -data "template_file" "k8s_event_exporter" { - count = local.enable_k8s_event_exporter || local.enable_loki ? 1 : 0 - template = file("./templates/event-exporter-values.yaml") - vars = { - CLUSTER_NAME = local.cluster_name - LOG_LEVEL = try(var.observability_config.kubernetes_event_exporter.log_level != null ? var.observability_config.kubernetes_event_exporter.log_level : "error" , "error") - MAX_EVENT_AGE_SECONDS = try(var.observability_config.kubernetes_event_exporter.max_event_age_second != null ? 
var.observability_config.kubernetes_event_exporter.max_event_age_second : "150" , "150") - LOKI_RECEIVER_CONFIGS = jsonencode(local.all_loki_receivers) - WEBHOOK_RECEIVER_CONFIGS = jsonencode(local.webhook_receivers) - LIMIT_CPU = try(var.observability_config.kubernetes_event_exporter.resource.limit_cpu != null ? var.observability_config.kubernetes_event_exporter.resource.limit_cpu : "400m", "400m") - LIMIT_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.limit_memory != null ? var.observability_config.kubernetes_event_exporter.resource.limit_memory : "250Mi", "250Mi") - REQUEST_CPU = try(var.observability_config.kubernetes_event_exporter.resource.request_cpu != null ? var.observability_config.kubernetes_event_exporter.resource.request_cpu : "100m", "100m") - REQUEST_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.request_memory != null ? var.observability_config.kubernetes_event_exporter.resource.request_memory : "100Mi", "100Mi") - } + k8s_event_exporter = (local.enable_k8s_event_exporter || local.enable_loki) ? templatefile( + "${path.module}/templates/event-exporter-values.yaml", + { + CLUSTER_NAME = local.cluster_name + LOG_LEVEL = try(var.observability_config.kubernetes_event_exporter.log_level != null ? var.observability_config.kubernetes_event_exporter.log_level : "error", "error") + MAX_EVENT_AGE_SECONDS = try(var.observability_config.kubernetes_event_exporter.max_event_age_second != null ? var.observability_config.kubernetes_event_exporter.max_event_age_second : "150", "150") + LOKI_RECEIVER_CONFIGS = jsonencode(local.all_loki_receivers) + WEBHOOK_RECEIVER_CONFIGS = jsonencode(local.webhook_receivers) + LIMIT_CPU = try(var.observability_config.kubernetes_event_exporter.resource.limit_cpu != null ? var.observability_config.kubernetes_event_exporter.resource.limit_cpu : "400m", "400m") + LIMIT_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.limit_memory != null ? 
var.observability_config.kubernetes_event_exporter.resource.limit_memory : "250Mi", "250Mi") + REQUEST_CPU = try(var.observability_config.kubernetes_event_exporter.resource.request_cpu != null ? var.observability_config.kubernetes_event_exporter.resource.request_cpu : "100m", "100m") + REQUEST_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.request_memory != null ? var.observability_config.kubernetes_event_exporter.resource.request_memory : "100Mi", "100Mi") + } + ) : null } resource "helm_release" "kubernetes_event_exporter" { @@ -59,6 +57,6 @@ resource "helm_release" "kubernetes_event_exporter" { namespace = helm_release.prometheus[0].namespace values = [ - data.template_file.k8s_event_exporter[count.index].rendered + local.k8s_event_exporter ] } \ No newline at end of file diff --git a/k8s/azure/aks/main.tf b/k8s/azure/aks/main.tf index 54db2bfc..236129b8 100644 --- a/k8s/azure/aks/main.tf +++ b/k8s/azure/aks/main.tf @@ -1,5 +1,18 @@ data "azurerm_subscription" "current" {} +data "azurerm_virtual_network" "vnet" { + count = var.vpc != "" ? 1 : 0 + name = var.vpc + resource_group_name = var.resource_group_name +} + +data "azurerm_subnet" "aks_subnet" { + count = var.vpc != "" && var.subnet != "" ? 1 : 0 + name = var.subnet + resource_group_name = var.resource_group_name + virtual_network_name = data.azurerm_virtual_network.vnet[0].name +} + resource "random_password" "aks_sp_pwd" { length = 16 special = false @@ -11,6 +24,27 @@ locals { environment = var.app_env == "" ? 
element(local.cluster_name_parts, length(local.cluster_name_parts) - 1) : var.app_env node_port = 32443 # Node port which will be used by LB for exposure + # VNet configuration flags + vnet_enabled = var.vpc != "" && var.subnet != "" + + # Service CIDR configuration constants + service_cidr_prefix = 20 + dns_service_ip_fourth_octet = 10 + + # Calculate service CIDR from VNet data source to avoid conflicts + # Extract base IP from VNet address space (e.g., "10.1" from "10.1.0.0/16") + vnet_address_space = local.vnet_enabled ? try(data.azurerm_virtual_network.vnet[0].address_space[0], "") : "" + vnet_cidr_parts = local.vnet_enabled && local.vnet_address_space != "" ? split("/", local.vnet_address_space) : [] + vnet_base_ip_parts = length(local.vnet_cidr_parts) > 0 ? split(".", local.vnet_cidr_parts[0]) : [] + vnet_base_ip = length(local.vnet_base_ip_parts) >= 2 ? "${local.vnet_base_ip_parts[0]}.${local.vnet_base_ip_parts[1]}" : "" + + # Use high range in VNet for service CIDR to avoid subnet conflicts + # Example: For VNet 10.1.0.0/16, use service CIDR 10.1.240.0/20 + # This avoids typical subnet ranges like 10.1.1.0/24, 10.1.2.0/24, etc. + # The third octet is configurable via var.service_cidr_third_octet (default: 240) + service_cidr = local.vnet_enabled && local.vnet_base_ip != "" ? "${local.vnet_base_ip}.${var.service_cidr_third_octet}.0/${local.service_cidr_prefix}" : null + dns_service_ip = local.vnet_enabled && local.vnet_base_ip != "" ? "${local.vnet_base_ip}.${var.service_cidr_third_octet}.${local.dns_service_ip_fourth_octet}" : null + common_tags = merge(var.common_tags, tomap({ Project = local.cluster_name, @@ -67,6 +101,18 @@ module "aks" { oidc_issuer_enabled = true temporary_name_for_rotation = "${var.app_name}1" secret_rotation_enabled = true + + # VNet configuration - when VNet is provided, use Azure CNI with public node IPs + # Nodes will have public IPs for internet access and can connect to SQL/Redis via VNet + vnet_subnet_id = local.vnet_enabled ? 
data.azurerm_subnet.aks_subnet[0].id : null + network_plugin = local.vnet_enabled ? "azure" : "kubenet" + network_policy = local.vnet_enabled ? "azure" : null + enable_node_public_ip = local.vnet_enabled ? true : null + + # Service CIDR configuration - automatically calculated from VNet to avoid subnet conflicts + net_profile_service_cidr = local.service_cidr + net_profile_dns_service_ip = local.dns_service_ip + tags = merge(local.common_tags, tomap({ "Name" = local.cluster_name @@ -82,4 +128,4 @@ resource "null_resource" "aks_vmss_managed_identity" { command = "az vmss identity assign -g ${module.aks.node_resource_group} -n ${data.azurerm_resources.aks_vmscaleset_resource.resources[0].name}" } depends_on = [module.aks] -} \ No newline at end of file +} diff --git a/k8s/azure/aks/observability.tf b/k8s/azure/aks/observability.tf index bbb35749..b9871b91 100644 --- a/k8s/azure/aks/observability.tf +++ b/k8s/azure/aks/observability.tf @@ -3,6 +3,7 @@ locals { enable_tempo = try(var.observability_config.tempo != null ? var.observability_config.tempo.enable : false, false) enable_cortex = try(var.observability_config.cortex != null ? var.observability_config.cortex.enable : false, false) enable_mimir = try(var.observability_config.mimir != null ? var.observability_config.mimir.enable : false,false) + enable_openobserve = try(var.observability_config.openobserve != null ? length(var.observability_config.openobserve) > 0 && anytrue([for instance in var.observability_config.openobserve : instance.enable]) : false, false) storage_account = "${replace(local.cluster_name,"-","")}${random_string.storage_account_suffix.result}" } @@ -15,7 +16,7 @@ resource "random_string" "storage_account_suffix" { } resource "azurerm_storage_account" "aks_storage_account" { - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 
1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 name = local.storage_account resource_group_name = var.resource_group_name location = var.app_region @@ -24,7 +25,7 @@ resource "azurerm_storage_account" "aks_storage_account" { } resource "azurerm_key_vault_secret" "observability_az_user" { - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 name = "observability-${local.environment}-azure-user" value = azurerm_storage_account.aks_storage_account[0].primary_access_key key_vault_id = azurerm_key_vault.secrets.id @@ -32,7 +33,7 @@ resource "azurerm_key_vault_secret" "observability_az_user" { module "observability" { - count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir) ? 1: 0 + count = (local.enable_cortex || local.enable_loki || local.enable_tempo || local.enable_mimir || local.enable_openobserve) ? 1: 0 source = "../../../observability/azure" @@ -49,6 +50,7 @@ module "observability" { tempo = var.observability_config.tempo cortex = var.observability_config.cortex mimir = var.observability_config.mimir + openobserve = var.observability_config.openobserve depends_on = [helm_release.prometheus,azurerm_storage_account.aks_storage_account] } diff --git a/k8s/azure/aks/outputs.tf b/k8s/azure/aks/outputs.tf index 9419005b..a8d624a9 100644 --- a/k8s/azure/aks/outputs.tf +++ b/k8s/azure/aks/outputs.tf @@ -26,7 +26,7 @@ output "cluster_name" { } output "k8s_version" { - value = "1.31.10" + value = "1.33.0" } output "os" { @@ -141,4 +141,10 @@ output "grafana_user_credentials" { output "dns_zone_list" { description = "The list of Azure DNS zone names used in the ClusterIssuer solvers." 
value = var.dns_zone_list +} + +output "openobserve_instances" { + description = "OpenObserve instances with URL, username, and password grouped together" + value = try(module.observability[0].openobserve_instances, {}) + sensitive = true } \ No newline at end of file diff --git a/k8s/azure/aks/prometheus.tf b/k8s/azure/aks/prometheus.tf index d8e5b871..ed9eb426 100644 --- a/k8s/azure/aks/prometheus.tf +++ b/k8s/azure/aks/prometheus.tf @@ -66,40 +66,55 @@ locals{ }] : [] remote_write_config = concat(local.remote_write_config_list, local.default_remote_write_config) -} - - -data "template_file" "prom_template" { - count = local.prometheus_enable ? 1 : 0 - - template = file("./templates/prometheus-values.yaml") - vars = { - PROMETHEUS_DISK_SIZE = try(var.observability_config.prometheus.persistence.disk_size != null ? var.observability_config.prometheus.persistence.disk_size : "50Gi", "50Gi") - PROMETHEUS_RETENTION_SIZE = try(var.observability_config.prometheus.persistence.retention_size != null ? var.observability_config.prometheus.persistence.retention_size : "45GB", "45GB") - PROMETHEUS_RETENTION_DURATION = try(var.observability_config.prometheus.persistence.retention_duration != null ? var.observability_config.prometheus.persistence.retention_duration : "10d", "10d") - CLUSTER_NAME = var.app_name - DOMAIN_NAME = var.accessibility.domain_name - REMOTE_WRITE_CONFIGS = jsonencode(local.remote_write_config) - ALERTS_ENABLED = jsonencode(local.cluster_moogsoft_alerts) != "" || jsonencode(local.namespace_teams_webhook) != "" || jsonencode(local.cluster_teams_alerts) != "" || jsonencode(local.google_chat_alerts) != "" || jsonencode(local.cluster_slack_alerts) != "" || jsonencode(local.cluster_webhook_alerts) != "" ? true : false - MOOGSOFT_ALERTS_ENABLED = local.cluster_moogsoft_alerts == {} ? false : true - MS_TEAMS_ALERT_ENABLED = jsonencode(local.namespace_teams_webhook) == "" && jsonencode(local.cluster_teams_alerts) == "" ? 
false : true - MOOGSOFT_ENDPOINT_URL = jsonencode(local.cluster_moogsoft_alerts) - MOOGSOFT_ENDPOINT_API_KEY = var.moogsoft_endpoint_api_key - MOOGSOFT_USERNAME = var.moogsoft_username - teams_webhook_alerts = jsonencode(local.cluster_alerts) - cluster_teams_alerts = jsonencode(local.cluster_alerts_webhook) - cluster_moogsoft_alerts = jsonencode(local.cluster_moogsoft_alerts) - GOOGLE_CHAT_ALERTS_ENABLED = local.google_chat_alerts == "" ? false : true - GOOGLE_CHAT_CONFIGS = jsonencode(local.google_chat_alerts) - PAGER_DUTY_ALERTS_ENABLED = local.cluster_pagerduty_alerts == "" ? false : true - PAGER_DUTY_KEY = var.pagerduty_integration_key - PAGER_DUTY_ENDPOINT_URL = jsonencode(local.cluster_pagerduty_alerts) - GRAFANA_HOST = local.grafana_host - SLACK_CHAT_ALERTS_ENABLED = local.cluster_slack_alerts == "" ? false : true - WEBHOOK_ALERTS_ENABLED = local.cluster_webhook_alerts == "" ? false : true - SLACK_CONFIGS = jsonencode(local.cluster_slack_alerts) - WEBHOOK_CONFIGS = jsonencode(local.cluster_webhook_alerts) - } + prom_template = local.prometheus_enable ? templatefile( + "${path.module}/templates/prometheus-values.yaml", + { + PROMETHEUS_DISK_SIZE = try(var.observability_config.prometheus.persistence.disk_size != null ? var.observability_config.prometheus.persistence.disk_size : "50Gi", "50Gi") + PROMETHEUS_RETENTION_SIZE = try(var.observability_config.prometheus.persistence.retention_size != null ? var.observability_config.prometheus.persistence.retention_size : "45GB", "45GB") + PROMETHEUS_RETENTION_DURATION = try(var.observability_config.prometheus.persistence.retention_duration != null ? 
var.observability_config.prometheus.persistence.retention_duration : "10d", "10d") + CLUSTER_NAME = var.app_name + DOMAIN_NAME = var.accessibility.domain_name + REMOTE_WRITE_CONFIGS = jsonencode(local.remote_write_config) + ALERTS_ENABLED = jsonencode(local.cluster_moogsoft_alerts) != "" || jsonencode(local.namespace_teams_webhook) != "" || jsonencode(local.cluster_teams_alerts) != "" || jsonencode(local.google_chat_alerts) != "" || jsonencode(local.cluster_slack_alerts) != "" || jsonencode(local.cluster_webhook_alerts) != "" ? true : false + MOOGSOFT_ALERTS_ENABLED = local.cluster_moogsoft_alerts != {} ? true : false + MS_TEAMS_ALERT_ENABLED = jsonencode(local.namespace_teams_webhook) != "" || jsonencode(local.cluster_teams_alerts) != "" ? true : false + MOOGSOFT_ENDPOINT_URL = jsonencode(local.cluster_moogsoft_alerts) + MOOGSOFT_ENDPOINT_API_KEY = var.moogsoft_endpoint_api_key + MOOGSOFT_USERNAME = var.moogsoft_username + teams_webhook_alerts = jsonencode(local.cluster_alerts) + cluster_teams_alerts = jsonencode(local.cluster_alerts_webhook) + cluster_moogsoft_alerts = jsonencode(local.cluster_moogsoft_alerts) + GOOGLE_CHAT_ALERTS_ENABLED = local.google_chat_alerts != "" ? true : false + GOOGLE_CHAT_CONFIGS = jsonencode(local.google_chat_alerts) + PAGER_DUTY_ALERTS_ENABLED = local.cluster_pagerduty_alerts != "" ? true : false + PAGER_DUTY_KEY = var.pagerduty_integration_key + PAGER_DUTY_ENDPOINT_URL = jsonencode(local.cluster_pagerduty_alerts) + GRAFANA_HOST = local.grafana_host + SLACK_CHAT_ALERTS_ENABLED = local.cluster_slack_alerts != "" ? true : false + WEBHOOK_ALERTS_ENABLED = local.cluster_webhook_alerts != "" ? true : false + SLACK_CONFIGS = jsonencode(local.cluster_slack_alerts) + WEBHOOK_CONFIGS = jsonencode(local.cluster_webhook_alerts) + } + ) : "" + + cluster_alerts_config = templatefile( + "${path.module}/templates/cluster-level-alerts.yaml", + { + cluster_memory_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 
20 : (var.cluster_alert_thresholds.memory_underutilisation != null ? var.cluster_alert_thresholds.memory_underutilisation : 20) + cluster_cpu_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.cpu_underutilisation != null ? var.cluster_alert_thresholds.cpu_underutilisation : 20) + cluster_node_count_max_value = var.node_config.max_count + cluster_node_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.node_count != null ? var.cluster_alert_thresholds.node_count : 80) + cluster_pod_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.pod_count != null ? var.cluster_alert_thresholds.pod_count: 80) + cluster_total_cpu_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cpu_utilisation != null ? var.cluster_alert_thresholds.cpu_utilisation: 80) + cluster_total_memory_utilization_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_utilisation != null ? var.cluster_alert_thresholds.memory_utilisation: 20) + cluster_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.disk_utilization != null ? var.cluster_alert_thresholds.disk_utilization: 80) + cluster_name = local.cluster_name + cortex_enabled = try(var.observability_config.cortex == null ? false : var.observability_config.cortex.enable, false) + nginx_5xx_percentage_threshold = var.cluster_alert_thresholds == null ? 5 : (var.cluster_alert_thresholds.nginx_5xx_percentage_threshold != null ? var.cluster_alert_thresholds.nginx_5xx_percentage_threshold: 5) + cortex_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cortex_disk_utilization_threshold != null ? 
var.cluster_alert_thresholds.cortex_disk_utilization_threshold : 80) + prometheus_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.prometheus_disk_utilization_threshold != null ? var.cluster_alert_thresholds.prometheus_disk_utilization_threshold : 80) + } + ) } resource "helm_release" "prometheus" { @@ -114,7 +129,7 @@ resource "helm_release" "prometheus" { repository = "https://prometheus-community.github.io/helm-charts" values = [ - data.template_file.prom_template[count.index].rendered + local.prom_template ] depends_on = [azurerm_public_ip.app_public_ip] } @@ -135,28 +150,9 @@ resource "helm_release" "alerts_teams" { depends_on = [helm_release.prometheus] } -data "template_file" "cluster-alerts" { - template = file("./templates/cluster-level-alerts.yaml") - vars = { - cluster_memory_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_underutilisation != null ? var.cluster_alert_thresholds.memory_underutilisation : 20) - cluster_cpu_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.cpu_underutilisation != null ? var.cluster_alert_thresholds.cpu_underutilisation : 20) - cluster_node_count_max_value = var.node_config.max_count - cluster_node_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.node_count != null ? var.cluster_alert_thresholds.node_count : 80) - cluster_pod_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.pod_count != null ? var.cluster_alert_thresholds.pod_count: 80) - cluster_total_cpu_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cpu_utilisation != null ? var.cluster_alert_thresholds.cpu_utilisation: 80) - cluster_total_memory_utilization_threshold = var.cluster_alert_thresholds == null ? 
20 : (var.cluster_alert_thresholds.memory_utilisation != null ? var.cluster_alert_thresholds.memory_utilisation: 20) - cluster_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.disk_utilization != null ? var.cluster_alert_thresholds.disk_utilization: 80) - cluster_name = local.cluster_name - cortex_enabled = try(var.observability_config.cortex == null ? false : var.observability_config.cortex.enable, false) - nginx_5xx_percentage_threshold = var.cluster_alert_thresholds == null ? 5 : (var.cluster_alert_thresholds.nginx_5xx_percentage_threshold != null ? var.cluster_alert_thresholds.nginx_5xx_percentage_threshold: 5) - cortex_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cortex_disk_utilization_threshold != null ? var.cluster_alert_thresholds.cortex_disk_utilization_threshold : 80) - prometheus_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.prometheus_disk_utilization_threshold != null ? var.cluster_alert_thresholds.prometheus_disk_utilization_threshold : 80) - } -} - resource "kubectl_manifest" "cluster-alerts" { count = local.grafana_enable ? 
1 : 0 - yaml_body = data.template_file.cluster-alerts.rendered + yaml_body = local.cluster_alerts_config depends_on = [helm_release.prometheus] } diff --git a/k8s/azure/aks/reloader.tf b/k8s/azure/aks/reloader.tf index ed503948..222aef1e 100644 --- a/k8s/azure/aks/reloader.tf +++ b/k8s/azure/aks/reloader.tf @@ -1,7 +1,8 @@ -data "template_file" "reloader_template" { - template = file("${path.module}/templates/reloader-values.yaml") - vars = { - } +locals { + reloader_template = templatefile( + "${path.module}/templates/reloader-values.yaml", + {} # no vars + ) } resource "helm_release" "reloader" { @@ -11,7 +12,7 @@ resource "helm_release" "reloader" { version = "1.0.60" values = [ - data.template_file.reloader_template.rendered + local.reloader_template ] depends_on = [ diff --git a/k8s/azure/aks/secrets.tf b/k8s/azure/aks/secrets.tf index 3fae1e46..fc03d8e2 100644 --- a/k8s/azure/aks/secrets.tf +++ b/k8s/azure/aks/secrets.tf @@ -24,6 +24,7 @@ resource "azurerm_key_vault" "secrets" { tenant_id = data.azurerm_client_config.current.tenant_id sku_name = "standard" soft_delete_retention_days = 7 + purge_protection_enabled = true access_policy { tenant_id = data.azurerm_client_config.current.tenant_id diff --git a/k8s/azure/aks/templates/grafana-values.yaml b/k8s/azure/aks/templates/grafana-values.yaml index c1071c4b..324262b7 100644 --- a/k8s/azure/aks/templates/grafana-values.yaml +++ b/k8s/azure/aks/templates/grafana-values.yaml @@ -19,6 +19,7 @@ grafana.ini: host: ${GRAFANA_DB_HOST} user: ${GRAFANA_DB_USER} password: ${GRAFANA_DB_PASSWORD} + ssl_mode: ${GRAFANA_DB_SSL_MODE} %{~ endif ~} server: root_url: https://${GRAFANA_HOST} diff --git a/k8s/azure/aks/vars.tf b/k8s/azure/aks/vars.tf index 85c05392..a634e45a 100644 --- a/k8s/azure/aks/vars.tf +++ b/k8s/azure/aks/vars.tf @@ -44,7 +44,7 @@ variable "public_ingress" { variable "kubernetes_version" { description = "Kubernetes version of the AKS Cluster" type = string - default = "1.31.10" + default = "1.33.0" } 
variable "user_access" { @@ -580,6 +580,20 @@ variable "observability_config" { metrics_ingestion_time_range_slack = optional(string) })) })) + openobserve = optional(list(object({ + enable = bool + name = string + replicaCount = optional(number, 2) + min_cpu = optional(string, "250m") + max_cpu = optional(string, "1") + min_memory = optional(string, "1Gi") + max_memory = optional(string, "2Gi") + enable_ingress = optional(bool, true) + env = optional(list(object({ + name = string + value = string + })), []) + })), []) }) default = null } @@ -706,4 +720,22 @@ variable "dns_zone_list" { description = "List of Azure DNS zone names to be used in the ClusterIssuer solvers." type = list(string) default = [] -} \ No newline at end of file +} + +variable "vpc" { + description = "VNet name the apps are going to use. When provided along with subnet, resources will be deployed inside the VNet." + type = string + default = "" +} + +variable "subnet" { + description = "Subnet name the apps are going to use. Must be provided along with vpc for VNet integration." + type = string + default = "" +} + +variable "service_cidr_third_octet" { + description = "Third octet for Kubernetes service CIDR calculation (e.g., 240 for 10.1.240.0/20). Should be in high range to avoid conflicts with typical subnet ranges. Default: 240" + type = number + default = 240 +} diff --git a/k8s/azure/namespace/issuer.tf b/k8s/azure/namespace/issuer.tf index ec1d400e..e763c8f5 100644 --- a/k8s/azure/namespace/issuer.tf +++ b/k8s/azure/namespace/issuer.tf @@ -1,14 +1,21 @@ -data "template_file" "issuer" { - template = file("${path.module}/templates/issuer.yaml") - vars = { - namespace = kubernetes_namespace.app_environments.metadata[0].name - cert_issuer_url = try(var.cert_issuer_config.env == "stage" ? 
"https://acme-staging-v02.api.letsencrypt.org/directory" : "https://acme-v02.api.letsencrypt.org/directory","https://acme-staging-v02.api.letsencrypt.org/directory") - email = var.cert_issuer_config.email - } +locals { + issuer_template = templatefile( + "${path.module}/templates/issuer.yaml", + { + namespace = kubernetes_namespace.app_environments.metadata[0].name + cert_issuer_url = try( + var.cert_issuer_config.env == "stage" ? + "https://acme-staging-v02.api.letsencrypt.org/directory" : + "https://acme-v02.api.letsencrypt.org/directory", + "https://acme-staging-v02.api.letsencrypt.org/directory" + ) + email = var.cert_issuer_config.email + } + ) } resource "kubectl_manifest" "namespace_issuer" { - yaml_body = data.template_file.issuer.rendered + yaml_body = local.issuer_template } resource "kubernetes_secret_v1" "namespace-cert-replicator" { diff --git a/k8s/azure/namespace/sql.tf b/k8s/azure/namespace/sql.tf index 21380dd0..78e60b7d 100644 --- a/k8s/azure/namespace/sql.tf +++ b/k8s/azure/namespace/sql.tf @@ -42,6 +42,7 @@ module "mysql" { io_scaling_enabled = var.sql_db.iops_scaling != null ? var.sql_db.iops_scaling : false read_replica = var.sql_db.read_replica != null ? var.sql_db.read_replica : false key_vault_id = data.azurerm_key_vault.secrets.id + vpc = var.vpc tags = local.common_tags } @@ -81,6 +82,7 @@ module "postgresql" { read_replica = var.sql_db.read_replica != null ? var.sql_db.read_replica : false key_vault_id = data.azurerm_key_vault.secrets.id enable_ssl = var.sql_db.enable_ssl != null ? var.sql_db.enable_ssl : false + vpc = var.vpc tags = merge(local.common_tags, tomap({ @@ -128,6 +130,7 @@ module "mysql_v2" { read_replica = each.value.read_replica != null ? 
each.value.read_replica : false multi_ds = true key_vault_id = data.azurerm_key_vault.secrets.id + vpc = var.vpc tags = local.common_tags } @@ -174,6 +177,7 @@ module "postgres_v2" { key_vault_id = data.azurerm_key_vault.secrets.id multi_ds = true enable_ssl = each.value.enable_ssl != null ? each.value.enable_ssl : false + vpc = var.vpc tags = merge(local.common_tags, tomap({ diff --git a/k8s/azure/namespace/vars.tf b/k8s/azure/namespace/vars.tf index 51052b5b..5e8093ce 100644 --- a/k8s/azure/namespace/vars.tf +++ b/k8s/azure/namespace/vars.tf @@ -334,7 +334,7 @@ variable "shared_services" { } variable "vpc" { - description = "VPC the apps are going to use" + description = "VNet name the apps are going to use" type = string default = "" } diff --git a/k8s/gcp/gke/autoscale.tf b/k8s/gcp/gke/autoscale.tf index a0f9b757..65cc7151 100644 --- a/k8s/gcp/gke/autoscale.tf +++ b/k8s/gcp/gke/autoscale.tf @@ -1,11 +1,13 @@ -data "template_file" "autoscale_template" { - template = file("./templates/cluster-auto-scaler-values.yaml") - vars = { - CLUSTER_NAME = local.cluster_name - SERVICE_ACCOUNT = google_service_account.cluster_autoscaler.email - MIN_COUNT = var.node_config.min_count - MAX_COUNT = var.node_config.max_count - } +locals { + autoscale_template = templatefile( + "${path.module}/templates/cluster-auto-scaler-values.yaml", + { + CLUSTER_NAME = local.cluster_name + SERVICE_ACCOUNT = google_service_account.cluster_autoscaler.email + MIN_COUNT = var.node_config.min_count + MAX_COUNT = var.node_config.max_count + } + ) } resource "helm_release" "auto_scaler" { @@ -15,7 +17,7 @@ resource "helm_release" "auto_scaler" { namespace = "kube-system" version = "9.28.0" - values = [data.template_file.autoscale_template.rendered] + values = [local.autoscale_template] } resource "google_service_account" "cluster_autoscaler" { diff --git a/k8s/gcp/gke/cert-manager.tf b/k8s/gcp/gke/cert-manager.tf index 5051dd47..457cd592 100644 --- a/k8s/gcp/gke/cert-manager.tf +++ 
b/k8s/gcp/gke/cert-manager.tf @@ -1,3 +1,35 @@ +locals { + cert_manager_template = templatefile( + "${path.module}/templates/cert-manager-values.yaml", + { + CLUSTER_NAME = local.cluster_name + SERVICE_ACCOUNT = google_service_account.wildcard_dns_solver.email + } + ) + + cluster_wildcard_issuer = templatefile( + "${path.module}/templates/cluster-issuer.yaml", + { + email = var.cert_issuer_config.email + provider = var.provider_id + dns = local.domain_name + cert_issuer_url = try( + var.cert_issuer_config.env == "stage" + ? "https://acme-staging-v02.api.letsencrypt.org/directory" + : "https://acme-v02.api.letsencrypt.org/directory", + "https://acme-staging-v02.api.letsencrypt.org/directory" + ) + } + ) + + cluster_wildcard_certificate = templatefile( + "${path.module}/templates/cluster-certificate.yaml", + { + dns = local.domain_name + } + ) +} + resource "google_service_account" "wildcard_dns_solver" { account_id = "${local.cluster_name}-wildcard" display_name = "${local.cluster_name} wildcard dns01 solver" @@ -37,13 +69,6 @@ resource "google_project_iam_member" "wildcard_dns_solver_iam" { member = "serviceAccount:${google_service_account.wildcard_dns_solver.email}" } -data "template_file" "cert_manager_template" { - template = file("./templates/cert-manager-values.yaml") - vars = { - CLUSTER_NAME = local.cluster_name - SERVICE_ACCOUNT = google_service_account.wildcard_dns_solver.email - } -} resource "helm_release" "cert-manager" { name = "cert-manager" @@ -58,35 +83,25 @@ resource "helm_release" "cert-manager" { value = "true" } - values = [data.template_file.cert_manager_template.rendered] -} - - -data "template_file" "cluster_wildcard_issuer" { - template = file("./templates/cluster-issuer.yaml") - vars = { - email = var.cert_issuer_config.email - provider = var.provider_id - dns = local.domain_name - cert_issuer_url = try(var.cert_issuer_config.env == "stage" ? 
"https://acme-staging-v02.api.letsencrypt.org/directory" : "https://acme-v02.api.letsencrypt.org/directory","https://acme-staging-v02.api.letsencrypt.org/directory") - } - depends_on = [helm_release.cert-manager,kubernetes_namespace.monitoring] + values = [local.cert_manager_template] } resource "kubectl_manifest" "cluster_wildcard_issuer" { - yaml_body = data.template_file.cluster_wildcard_issuer.rendered -} + yaml_body = local.cluster_wildcard_issuer + wait = true -data "template_file" "cluster_wildcard_certificate" { - template = file("./templates/cluster-certificate.yaml") - vars = { - dns = local.domain_name - } - depends_on = [kubectl_manifest.cluster_wildcard_issuer] + depends_on = [ + helm_release.cert-manager + ] } resource "kubectl_manifest" "cluster_wildcard_certificate" { - yaml_body = data.template_file.cluster_wildcard_certificate.rendered + yaml_body = local.cluster_wildcard_certificate + wait = true + + depends_on = [ + kubectl_manifest.cluster_wildcard_issuer + ] } resource "kubernetes_secret_v1" "certificate_replicator" { diff --git a/k8s/gcp/gke/fluentbit.tf b/k8s/gcp/gke/fluentbit.tf index af3db6d4..14eb47e9 100644 --- a/k8s/gcp/gke/fluentbit.tf +++ b/k8s/gcp/gke/fluentbit.tf @@ -68,42 +68,42 @@ locals { } if length(local.fluent_bit_slack) > 0 ] -} - -data template_file "fluent-bit"{ - count = local.fluent_bit_enable ? 1 : 0 - template = file("./templates/fluent-bit-values.yaml") - vars = { - "CLUSTER_NAME" = local.cluster_name - "SERVICE_ACCOUNT" = "serviceAccount:${data.google_project.this.number}-compute@developer.gserviceaccount.com" - "GCP_REGION" = var.app_region - "TAGS" = join(",", [for key, value in local.common_tags : "${key}=${value}"]) + fluent_bit = local.fluent_bit_enable ? 
templatefile( + "${path.module}/templates/fluent-bit-values.yaml", + { + CLUSTER_NAME = local.cluster_name + SERVICE_ACCOUNT = "serviceAccount:${data.google_project.this.number}-compute@developer.gserviceaccount.com" + GCP_REGION = var.app_region + TAGS = join(",", [for key, value in local.common_tags : "${key}=${value}"]) - "HTTP_SERVER" = "On" - "HTTP_PORT" = "2020" + HTTP_SERVER = "On" + HTTP_PORT = "2020" - "READ_FROM_HEAD" = "Off" - "READ_FROM_TAIL" = "On" + READ_FROM_HEAD = "Off" + READ_FROM_TAIL = "On" - fluent_bit_loki_outputs = jsonencode(local.fluent_bit_loki_outputs) - fluent_bit_http_outputs = jsonencode(local.fluent_bit_http_outputs) - fluent_bit_splunk_outputs = jsonencode(local.fluent_bit_splunk_outputs) - fluent_bit_datadog_outputs = jsonencode(local.fluent_bit_datadog_outputs) - fluent_bit_newrelic_outputs = jsonencode(local.fluent_bit_newrelic_outputs) - fluent_bit_slack_outputs = jsonencode(local.fluent_bit_slack_outputs) - } + fluent_bit_loki_outputs = jsonencode(local.fluent_bit_loki_outputs) + fluent_bit_http_outputs = jsonencode(local.fluent_bit_http_outputs) + fluent_bit_splunk_outputs = jsonencode(local.fluent_bit_splunk_outputs) + fluent_bit_datadog_outputs = jsonencode(local.fluent_bit_datadog_outputs) + fluent_bit_newrelic_outputs= jsonencode(local.fluent_bit_newrelic_outputs) + fluent_bit_slack_outputs = jsonencode(local.fluent_bit_slack_outputs) + } + ) : null } + + resource "helm_release" "fluentbit-config" { count = local.fluent_bit_enable ? 
1 : 0 repository = "https://fluent.github.io/helm-charts" chart = "fluent-bit" name = "fluent-bit" - version = "0.35.0" + version = "0.54.0" namespace = kubernetes_namespace.monitoring.metadata.0.name values = [ - data.template_file.fluent-bit[0].rendered + local.fluent_bit ] depends_on = [ kubernetes_namespace.monitoring diff --git a/k8s/gcp/gke/grafana.tf b/k8s/gcp/gke/grafana.tf index 1aee2e28..8bb3c181 100644 --- a/k8s/gcp/gke/grafana.tf +++ b/k8s/gcp/gke/grafana.tf @@ -13,6 +13,41 @@ locals { private_key_with_suffix = try(split(local.private_key_start,base64decode(google_service_account_key.cloud_monitoring_svc_acc[0].private_key) )[1],"") private_key = split(local.private_key_end,local.private_key_with_suffix )[0] + grafana_template = local.grafana_enable ? templatefile( + "${path.module}/templates/grafana-values.yaml", + { + NAMESPACE = "monitoring" + GRAFANA_TLS_HOST = "*.${local.domain_name}" + GRAFANA_HOST = local.grafana_host + GRAFANA_ENABLED = local.grafana_enable + GRAFANA_OBS_ADMIN_PASSWORD = try(local.grafana_enable ? try(random_password.observability_admin[0].result, "") : "", "") + PERSISTENCE_TYPE_DB = try(var.observability_config.grafana.persistence.type == "db" ? true : false, false) + PERSISTENCE_TYPE_PVC = try(var.observability_config.grafana.persistence.type == "pvc" ? true : false, false) + PERSISTENCE_DISK_SIZE = try(var.observability_config.grafana.persistence.disk_size != null ? var.observability_config.grafana.persistence.disk_size : "10Gi", "10Gi") + GRAFANA_DB_NAME = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "grafana" : "", "") + GRAFANA_DB_TYPE = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.sql_db[0].db_type : "", "") + GRAFANA_DB_HOST = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? 
module.sql_db[0].db_instance_ip : "", "") + GRAFANA_DB_PASSWORD = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.sql_db[0].db_password : "", "") + GRAFANA_DB_USER = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.sql_db[0].db_admin_user : "", "") + GRAFANA_MIN_REPLICA = try(var.observability_config.grafana.min_replica != null ? var.observability_config.grafana.min_replica : 1, 1) + GRAFANA_MAX_REPLICA = try(var.observability_config.grafana.max_replica != null ? var.observability_config.grafana.max_replica : 10, 10) + GRAFANA_REQUEST_MEMORY = try(var.observability_config.grafana.request_memory != null ? var.observability_config.grafana.request_memory : "100Mi", "100Mi") + GRAFANA_REQUEST_CPU = try(var.observability_config.grafana.request_cpu != null ? var.observability_config.grafana.request_cpu : "100m", "100m") + GRAFANA_DASHBOARD_LIMIT_MEMORY = try(var.observability_config.grafana.dashboard.limit_memory != null ? var.observability_config.grafana.dashboard.limit_memory : "512Mi", "512Mi") + GRAFANA_DASHBOARD_LIMIT_CPU = try(var.observability_config.grafana.dashboard.limit_cpu != null ? var.observability_config.grafana.dashboard.limit_cpu : "512m", "512m") + GRAFANA_DASHBOARD_REQUEST_MEMORY = try(var.observability_config.grafana.dashboard.request_memory != null ? var.observability_config.grafana.dashboard.request_memory : "256Mi", "256Mi") + GRAFANA_DASHBOARD_REQUEST_CPU = try(var.observability_config.grafana.dashboard.request_cpu != null ? var.observability_config.grafana.dashboard.request_cpu : "256m", "256m") + GRAFANA_DATASOURCE_LIMIT_MEMORY = try(var.observability_config.grafana.datasource.limit_memory != null ? var.observability_config.grafana.datasource.limit_memory : "512Mi", "512Mi") + GRAFANA_DATASOURCE_LIMIT_CPU = try(var.observability_config.grafana.datasource.limit_cpu != null ? 
var.observability_config.grafana.datasource.limit_cpu : "512m", "512m") + GRAFANA_DATASOURCE_REQUEST_MEMORY = try(var.observability_config.grafana.datasource.request_memory != null ? var.observability_config.grafana.datasource.request_memory : "256Mi", "256Mi") + GRAFANA_DATASOURCE_REQUEST_CPU = try(var.observability_config.grafana.datasource.request_cpu != null ? var.observability_config.grafana.datasource.request_cpu : "256m", "256m") + ENABLE_SSO = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? var.observability_config.grafana.configs.enable_sso : false) : false, false) + ALLOWED_DOMAINS = local.grafana_enable ? local.grafana_allowed_domains : "" + OAUTH_ID = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.google_secret_manager_secret_version.oauth_client_id[0].secret_data : null) : null, null) + OAUTH_SECRET = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.google_secret_manager_secret_version.oauth_client_secret[0].secret_data : null) : null, null) + USE_MONITORING_NODE_POOL = try(local.enable_monitoring_node_pool == true ? local.enable_monitoring_node_pool : false , false) + } + ) : "" } resource "random_password" "observability_admin" { @@ -21,43 +56,6 @@ resource "random_password" "observability_admin" { special = false } -data "template_file" "grafana_template" { - count = local.grafana_enable ? 1 : 0 - template = file("${path.module}/templates/grafana-values.yaml") - vars = { - NAMESPACE = "monitoring" - GRAFANA_TLS_HOST = "*.${local.domain_name}" - GRAFANA_HOST = local.grafana_host - GRAFANA_ENABLED = local.grafana_enable - GRAFANA_OBS_ADMIN_PASSWORD = try(local.grafana_enable ? try(random_password.observability_admin.0.result, "") : "", "") - PERSISTENCE_TYPE_DB = try(var.observability_config.grafana.persistence.type == "db" ? 
true : false, false) - PERSISTENCE_TYPE_PVC = try(var.observability_config.grafana.persistence.type == "pvc" ? true : false, false) - PERSISTENCE_DISK_SIZE = try(var.observability_config.grafana.persistence.disk_size != null ? var.observability_config.grafana.persistence.disk_size : "10Gi", "10Gi") - GRAFANA_DB_NAME = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "grafana" : "", "") - GRAFANA_DB_TYPE = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.sql_db[0].db_type : "", "") - GRAFANA_DB_HOST = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.sql_db[0].db_instance_ip : "", "") - GRAFANA_DB_PASSWORD = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.sql_db[0].db_password : "", "") - GRAFANA_DB_USER = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? module.sql_db[0].db_admin_user : "", "") - GRAFANA_MIN_REPLICA = try(var.observability_config.grafana.min_replica != null ? var.observability_config.grafana.min_replica : 1, 1) - GRAFANA_MAX_REPLICA = try(var.observability_config.grafana.max_replica != null ? var.observability_config.grafana.max_replica : 10, 10) - GRAFANA_REQUEST_MEMORY = try(var.observability_config.grafana.request_memory != null ? var.observability_config.grafana.request_memory : "100Mi", "100Mi") - GRAFANA_REQUEST_CPU = try(var.observability_config.grafana.request_cpu != null ? var.observability_config.grafana.request_cpu : "100m", "100m") - GRAFANA_DASHBOARD_LIMIT_MEMORY = try(var.observability_config.grafana.dashboard.limit_memory != null ? var.observability_config.grafana.dashboard.limit_memory : "512Mi", "512Mi") - GRAFANA_DASHBOARD_LIMIT_CPU = try(var.observability_config.grafana.dashboard.limit_cpu != null ? 
var.observability_config.grafana.dashboard.limit_cpu : "512m", "512m") - GRAFANA_DASHBOARD_REQUEST_MEMORY = try(var.observability_config.grafana.dashboard.request_memory != null ? var.observability_config.grafana.dashboard.request_memory : "256Mi", "256Mi") - GRAFANA_DASHBOARD_REQUEST_CPU = try(var.observability_config.grafana.dashboard.request_cpu != null ? var.observability_config.grafana.dashboard.request_cpu : "256m", "256m") - GRAFANA_DATASOURCE_LIMIT_MEMORY = try(var.observability_config.grafana.datasource.limit_memory != null ? var.observability_config.grafana.datasource.limit_memory : "512Mi", "512Mi") - GRAFANA_DATASOURCE_LIMIT_CPU = try(var.observability_config.grafana.datasource.limit_cpu != null ? var.observability_config.grafana.datasource.limit_cpu : "512m", "512m") - GRAFANA_DATASOURCE_REQUEST_MEMORY = try(var.observability_config.grafana.datasource.request_memory != null ? var.observability_config.grafana.datasource.request_memory : "256Mi", "256Mi") - GRAFANA_DATASOURCE_REQUEST_CPU = try(var.observability_config.grafana.datasource.request_cpu != null ? var.observability_config.grafana.datasource.request_cpu : "256m", "256m") - ENABLE_SSO = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? var.observability_config.grafana.configs.enable_sso : false) :false, false) - ALLOWED_DOMAINS = local.grafana_enable ? local.grafana_allowed_domains : "" - OAUTH_ID = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.google_secret_manager_secret_version.oauth_client_id[0].secret_data : null) : null, null) - OAUTH_SECRET = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.google_secret_manager_secret_version.oauth_client_secret[0].secret_data : null) : null, null) - USE_MONITORING_NODE_POOL = try(local.enable_monitoring_node_pool == true ? 
local.enable_monitoring_node_pool : false , false) - } -} - resource "helm_release" "grafana" { count = local.grafana_enable ? 1 : 0 chart = "grafana" @@ -69,7 +67,7 @@ resource "helm_release" "grafana" { repository = "https://grafana.github.io/helm-charts" values = [ - data.template_file.grafana_template[0].rendered + local.grafana_template ] depends_on = [helm_release.prometheus] } diff --git a/k8s/gcp/gke/k8s-events.tf b/k8s/gcp/gke/k8s-events.tf index cc2c29cf..b280fe3f 100644 --- a/k8s/gcp/gke/k8s-events.tf +++ b/k8s/gcp/gke/k8s-events.tf @@ -31,23 +31,20 @@ locals { all_loki_receivers = concat(local.loki_receivers,local.observability_loki_recievers) -} - -data "template_file" "k8s_event_exporter" { - count = local.enable_k8s_event_exporter || local.enable_loki ? 1 : 0 - - template = file("./templates/event-exporter-values.yaml") - vars = { - CLUSTER_NAME = local.cluster_name - LOG_LEVEL = try(var.observability_config.kubernetes_event_exporter.log_level != null ? var.observability_config.kubernetes_event_exporter.log_level : "error" , "error") - MAX_EVENT_AGE_SECONDS = try(var.observability_config.kubernetes_event_exporter.max_event_age_second != null ? var.observability_config.kubernetes_event_exporter.max_event_age_second : "150" , "150") - LOKI_RECEIVER_CONFIGS = jsonencode(local.all_loki_receivers) - WEBHOOK_RECEIVER_CONFIGS = jsonencode(local.webhook_receivers) - LIMIT_CPU = try(var.observability_config.kubernetes_event_exporter.resource.limit_cpu != null ? var.observability_config.kubernetes_event_exporter.resource.limit_cpu : "400m", "400m") - LIMIT_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.limit_memory != null ? var.observability_config.kubernetes_event_exporter.resource.limit_memory : "250Mi", "250Mi") - REQUEST_CPU = try(var.observability_config.kubernetes_event_exporter.resource.request_cpu != null ? 
var.observability_config.kubernetes_event_exporter.resource.request_cpu : "100m", "100m") - REQUEST_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.request_memory != null ? var.observability_config.kubernetes_event_exporter.resource.request_memory : "100Mi", "100Mi") - } + k8s_event_exporter_template = (local.enable_k8s_event_exporter || local.enable_loki) ? templatefile( + "${path.module}/templates/event-exporter-values.yaml", + { + CLUSTER_NAME = local.cluster_name + LOG_LEVEL = coalesce(try(var.observability_config.kubernetes_event_exporter.log_level, null), "error") + MAX_EVENT_AGE_SECONDS = coalesce(try(var.observability_config.kubernetes_event_exporter.max_event_age_second, null), "150") + LOKI_RECEIVER_CONFIGS = jsonencode(local.all_loki_receivers) + WEBHOOK_RECEIVER_CONFIGS = jsonencode(local.webhook_receivers) + LIMIT_CPU = coalesce(try(var.observability_config.kubernetes_event_exporter.resource.limit_cpu, null), "400m") + LIMIT_MEMORY = coalesce(try(var.observability_config.kubernetes_event_exporter.resource.limit_memory, null), "250Mi") + REQUEST_CPU = coalesce(try(var.observability_config.kubernetes_event_exporter.resource.request_cpu, null), "100m") + REQUEST_MEMORY = coalesce(try(var.observability_config.kubernetes_event_exporter.resource.request_memory, null), "100Mi") + } + ) : "" } resource "helm_release" "kubernetes_event_exporter" { @@ -60,6 +57,6 @@ resource "helm_release" "kubernetes_event_exporter" { namespace = helm_release.prometheus[0].namespace values = [ - data.template_file.k8s_event_exporter[count.index].rendered + local.k8s_event_exporter_template ] } \ No newline at end of file diff --git a/k8s/gcp/gke/karpenter.tf b/k8s/gcp/gke/karpenter.tf index 7b8a451e..d3af000f 100644 --- a/k8s/gcp/gke/karpenter.tf +++ b/k8s/gcp/gke/karpenter.tf @@ -80,15 +80,16 @@ resource "kubernetes_secret" "gcp-credentials" { } # helm chart values -data "template_file" "karpenter_template" { - count = var.karpenter_configs.enable ? 
1 : 0 - template = file("./templates/karpenter-values.yaml") - vars = { - PROJECT_ID = var.provider_id - REGION = var.app_region - CLUSTER_NAME = local.cluster_name - SECRET_NAME = kubernetes_secret.gcp-credentials[0].metadata[0].name - } +locals { + karpenter_template = var.karpenter_configs.enable ? templatefile( + "${path.module}/templates/karpenter-values.yaml", + { + PROJECT_ID = var.provider_id + REGION = var.app_region + CLUSTER_NAME = local.cluster_name + SECRET_NAME = kubernetes_secret.gcp-credentials[0].metadata[0].name + } + ) : "" } # helm chart install @@ -100,7 +101,7 @@ resource "helm_release" "karpenter" { namespace = "karpenter" version = "0.0.3" - values = [data.template_file.karpenter_template[0].rendered] + values = [local.karpenter_template] } # available zones in region diff --git a/k8s/gcp/gke/outputs.tf b/k8s/gcp/gke/outputs.tf index 1b74a398..03772395 100644 --- a/k8s/gcp/gke/outputs.tf +++ b/k8s/gcp/gke/outputs.tf @@ -89,6 +89,12 @@ output "cortex_host_url" { value = try(module.observability[0].cortex_host_url,"") } +output "openobserve_instances" { + description = "OpenObserve instances with URL, username, and password grouped together" + value = try(module.observability[0].openobserve_instances, {}) + sensitive = true +} + output "grafana_password" { sensitive = true value = try(random_password.observability_admin[0].result,"") diff --git a/k8s/gcp/gke/prometheus.tf b/k8s/gcp/gke/prometheus.tf index 2f75eee5..650938d0 100644 --- a/k8s/gcp/gke/prometheus.tf +++ b/k8s/gcp/gke/prometheus.tf @@ -60,39 +60,56 @@ locals{ }] : [] remote_write_config = concat(local.remote_write_config_list, local.default_remote_write_config) -} - -data "template_file" "prom_template" { - count = local.prometheus_enable ? 1 : 0 - template = file("./templates/prometheus-values.yaml") - vars = { - PROMETHEUS_DISK_SIZE = try(var.observability_config.prometheus.persistence.disk_size != null ? 
var.observability_config.prometheus.persistence.disk_size : "50Gi", "50Gi") - PROMETHEUS_RETENTION_SIZE = try(var.observability_config.prometheus.persistence.retention_size != null ? var.observability_config.prometheus.persistence.retention_size : "20GB", "20GB") - PROMETHEUS_RETENTION_DURATION = try(var.observability_config.prometheus.persistence.retention_duration != null ? var.observability_config.prometheus.persistence.retention_duration : "7d", "7d") - CLUSTER_NAME = local.cluster_name - REMOTE_WRITE_CONFIGS = jsonencode(local.remote_write_config) - ALERTS_ENABLED = jsonencode(local.cluster_moogsoft_alerts) != "" || jsonencode(local.namespace_teams_webhook) != "" || jsonencode(local.cluster_teams_alerts) != "" || jsonencode(local.google_chat_alerts) != "" || jsonencode(local.cluster_slack_alerts) != "" || jsonencode(local.cluster_webhook_alerts) != "" ? true : false - MOOGSOFT_ALERTS_ENABLED = local.cluster_moogsoft_alerts == {} ? false : true - MS_TEAMS_ALERT_ENABLED = jsonencode(local.namespace_teams_webhook) == "" && jsonencode(local.cluster_teams_alerts) == "" ? false : true - MOOGSOFT_ENDPOINT_URL = jsonencode(local.cluster_moogsoft_alerts) - MOOGSOFT_ENDPOINT_API_KEY = var.moogsoft_endpoint_api_key - MOOGSOFT_USERNAME = var.moogsoft_username - teams_webhook_alerts = jsonencode(local.cluster_alerts) - cluster_moogsoft_alerts = jsonencode(local.cluster_moogsoft_alerts) - cluster_teams_alerts = jsonencode(local.cluster_alerts_webhook) - GOOGLE_CHAT_ALERTS_ENABLED = local.google_chat_alerts == "" ? false : true - SLACK_CHAT_ALERTS_ENABLED = local.cluster_slack_alerts == "" ? false : true - WEBHOOK_ALERTS_ENABLED = local.cluster_webhook_alerts == "" ? false : true - GOOGLE_CHAT_CONFIGS = jsonencode(local.google_chat_alerts) - SLACK_CONFIGS = jsonencode(local.cluster_slack_alerts) - WEBHOOK_CONFIGS = jsonencode(local.cluster_webhook_alerts) - PAGER_DUTY_ALERTS_ENABLED = local.cluster_pagerduty_alerts == "" ? 
false : true - PAGER_DUTY_KEY = var.pagerduty_integration_key - PAGER_DUTY_ENDPOINT_URL = jsonencode(local.cluster_pagerduty_alerts) - GRAFANA_HOST = local.grafana_enable ? local.grafana_host : "" - USE_MONITORING_NODE_POOL = try(local.enable_monitoring_node_pool != null ? local.enable_monitoring_node_pool : false, false) - } + prom_template = local.prometheus_enable ? templatefile( + "${path.module}/templates/prometheus-values.yaml", + { + PROMETHEUS_DISK_SIZE = try(var.observability_config.prometheus.persistence.disk_size != null ? var.observability_config.prometheus.persistence.disk_size : "50Gi", "50Gi") + PROMETHEUS_RETENTION_SIZE = try(var.observability_config.prometheus.persistence.retention_size != null ? var.observability_config.prometheus.persistence.retention_size : "20GB", "20GB") + PROMETHEUS_RETENTION_DURATION = try(var.observability_config.prometheus.persistence.retention_duration != null ? var.observability_config.prometheus.persistence.retention_duration : "7d", "7d") + CLUSTER_NAME = local.cluster_name + REMOTE_WRITE_CONFIGS = jsonencode(local.remote_write_config) + ALERTS_ENABLED = jsonencode(local.cluster_moogsoft_alerts) != "" || jsonencode(local.namespace_teams_webhook) != "" || jsonencode(local.cluster_teams_alerts) != "" || jsonencode(local.google_chat_alerts) != "" || jsonencode(local.cluster_slack_alerts) != "" || jsonencode(local.cluster_webhook_alerts) != "" ? 
true : false + MOOGSOFT_ALERTS_ENABLED = local.cluster_moogsoft_alerts != {} + MS_TEAMS_ALERT_ENABLED = jsonencode(local.namespace_teams_webhook) != "" || jsonencode(local.cluster_teams_alerts) != "" + MOOGSOFT_ENDPOINT_URL = jsonencode(local.cluster_moogsoft_alerts) + MOOGSOFT_ENDPOINT_API_KEY = var.moogsoft_endpoint_api_key + MOOGSOFT_USERNAME = var.moogsoft_username + teams_webhook_alerts = jsonencode(local.cluster_alerts) + cluster_moogsoft_alerts = jsonencode(local.cluster_moogsoft_alerts) + cluster_teams_alerts = jsonencode(local.cluster_alerts_webhook) + GOOGLE_CHAT_ALERTS_ENABLED = local.google_chat_alerts != "" + SLACK_CHAT_ALERTS_ENABLED = local.cluster_slack_alerts != "" + WEBHOOK_ALERTS_ENABLED = local.cluster_webhook_alerts != "" + GOOGLE_CHAT_CONFIGS = jsonencode(local.google_chat_alerts) + SLACK_CONFIGS = jsonencode(local.cluster_slack_alerts) + WEBHOOK_CONFIGS = jsonencode(local.cluster_webhook_alerts) + PAGER_DUTY_ALERTS_ENABLED = local.cluster_pagerduty_alerts != "" + PAGER_DUTY_KEY = var.pagerduty_integration_key + PAGER_DUTY_ENDPOINT_URL = jsonencode(local.cluster_pagerduty_alerts) + GRAFANA_HOST = local.grafana_enable ? local.grafana_host : "" + USE_MONITORING_NODE_POOL = try(local.enable_monitoring_node_pool != null ? local.enable_monitoring_node_pool : false, false) + } + ) : "" + + cluster_alerts_template = templatefile( + "${path.module}/templates/cluster-level-alerts.yaml", + { + cluster_memory_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_underutilisation != null ? var.cluster_alert_thresholds.memory_underutilisation : 20) + cluster_cpu_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.cpu_underutilisation != null ? var.cluster_alert_thresholds.cpu_underutilisation : 20) + cluster_node_count_max_value = local.enable_monitoring_node_pool ? 
var.monitoring_node_config.max_count : var.node_config.max_count + cluster_node_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.node_count != null ? var.cluster_alert_thresholds.node_count : 80) + cluster_pod_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.pod_count != null ? var.cluster_alert_thresholds.pod_count: 80) + cluster_total_cpu_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cpu_utilisation != null ? var.cluster_alert_thresholds.cpu_utilisation: 80) + cluster_total_memory_utilization_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_utilisation != null ? var.cluster_alert_thresholds.memory_utilisation: 20) + cluster_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.disk_utilization != null ? var.cluster_alert_thresholds.disk_utilization: 80) + cluster_name = local.cluster_name + cortex_enabled = try(var.observability_config.cortex == null ? false : var.observability_config.cortex.enable, false) + nginx_5xx_percentage_threshold = var.cluster_alert_thresholds == null ? 5 : (var.cluster_alert_thresholds.nginx_5xx_percentage_threshold != null ? var.cluster_alert_thresholds.nginx_5xx_percentage_threshold: 5) + cortex_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cortex_disk_utilization_threshold != null ? var.cluster_alert_thresholds.cortex_disk_utilization_threshold : 80) + prometheus_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.prometheus_disk_utilization_threshold != null ? 
var.cluster_alert_thresholds.prometheus_disk_utilization_threshold : 80) + } + ) } @@ -110,7 +127,7 @@ resource "helm_release" "prometheus" { repository = "https://prometheus-community.github.io/helm-charts" values = [ - data.template_file.prom_template[count.index].rendered + local.prom_template ] } @@ -128,27 +145,8 @@ resource "helm_release" "alerts_teams" { ] } -data "template_file" "cluster-alerts" { - template = file("./templates/cluster-level-alerts.yaml") - vars = { - cluster_memory_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_underutilisation != null ? var.cluster_alert_thresholds.memory_underutilisation : 20) - cluster_cpu_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.cpu_underutilisation != null ? var.cluster_alert_thresholds.cpu_underutilisation : 20) - cluster_node_count_max_value = local.enable_monitoring_node_pool ? var.monitoring_node_config.max_count : var.node_config.max_count - cluster_node_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.node_count != null ? var.cluster_alert_thresholds.node_count : 80) - cluster_pod_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.pod_count != null ? var.cluster_alert_thresholds.pod_count: 80) - cluster_total_cpu_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cpu_utilisation != null ? var.cluster_alert_thresholds.cpu_utilisation: 80) - cluster_total_memory_utilization_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_utilisation != null ? var.cluster_alert_thresholds.memory_utilisation: 20) - cluster_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.disk_utilization != null ? 
var.cluster_alert_thresholds.disk_utilization: 80) - cluster_name = local.cluster_name - cortex_enabled = try(var.observability_config.cortex == null ? false : var.observability_config.cortex.enable, false) - nginx_5xx_percentage_threshold = var.cluster_alert_thresholds == null ? 5 : (var.cluster_alert_thresholds.nginx_5xx_percentage_threshold != null ? var.cluster_alert_thresholds.nginx_5xx_percentage_threshold: 5) - cortex_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cortex_disk_utilization_threshold != null ? var.cluster_alert_thresholds.cortex_disk_utilization_threshold : 80) - prometheus_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.prometheus_disk_utilization_threshold != null ? var.cluster_alert_thresholds.prometheus_disk_utilization_threshold : 80) - } -} - resource "kubectl_manifest" "cluster-alerts" { count = local.prometheus_enable ? 1 : 0 - yaml_body = data.template_file.cluster-alerts.rendered + yaml_body = local.cluster_alerts_template depends_on = [helm_release.prometheus] } \ No newline at end of file diff --git a/k8s/gcp/gke/reloader.tf b/k8s/gcp/gke/reloader.tf index ed503948..eb8de25e 100644 --- a/k8s/gcp/gke/reloader.tf +++ b/k8s/gcp/gke/reloader.tf @@ -1,9 +1,8 @@ -data "template_file" "reloader_template" { - template = file("${path.module}/templates/reloader-values.yaml") - vars = { - } +locals { + reloader_template = templatefile("${path.module}/templates/reloader-values.yaml", {}) } + resource "helm_release" "reloader" { name = "reloader" repository = "https://stakater.github.io/stakater-charts" @@ -11,7 +10,7 @@ resource "helm_release" "reloader" { version = "1.0.60" values = [ - data.template_file.reloader_template.rendered + local.reloader_template ] depends_on = [ diff --git a/k8s/gcp/gke/templates/gcp-secrets-driver.yaml b/k8s/gcp/gke/templates/gcp-secrets-driver.yaml index 515be673..8a99831e 100644 --- 
a/k8s/gcp/gke/templates/gcp-secrets-driver.yaml +++ b/k8s/gcp/gke/templates/gcp-secrets-driver.yaml @@ -70,6 +70,22 @@ spec: env: - name: TARGET_DIR value: "/etc/kubernetes/secrets-store-csi-providers" + livenessProbe: + httpGet: + path: /live + port: 8095 + initialDelaySeconds: 5 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /live + port: 8095 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 volumeMounts: - mountPath: "/etc/kubernetes/secrets-store-csi-providers" name: providervol diff --git a/k8s/gcp/gke/vars.tf b/k8s/gcp/gke/vars.tf index 5a792635..da8c87b9 100644 --- a/k8s/gcp/gke/vars.tf +++ b/k8s/gcp/gke/vars.tf @@ -634,7 +634,7 @@ variable "observability_config" { max_cpu = optional(string, "1") min_memory = optional(string, "512Mi") max_memory = optional(string, "1Gi") - enable_ingress = optional(bool, false) + enable_ingress = optional(bool, true) env = optional(list(object({ name = string value = string diff --git a/k8s/gcp/namespace/issuer.tf b/k8s/gcp/namespace/issuer.tf index 61738bd4..14f44a96 100644 --- a/k8s/gcp/namespace/issuer.tf +++ b/k8s/gcp/namespace/issuer.tf @@ -1,14 +1,19 @@ -data "template_file" "issuer" { - template = file("./templates/issuer.yaml") - vars = { - namespace = kubernetes_namespace.app_environments.metadata[0].name +locals { + issuer_template = templatefile("${path.module}/templates/issuer.yaml", { + namespace = kubernetes_namespace.app_environments.metadata[0].name email = var.cert_issuer_config.email - cert_issuer_url = try(var.cert_issuer_config.env == "stage" ? "https://acme-staging-v02.api.letsencrypt.org/directory" : "https://acme-v02.api.letsencrypt.org/directory","https://acme-staging-v02.api.letsencrypt.org/directory") - } + cert_issuer_url = try( + var.cert_issuer_config.env == "stage" ? 
+ "https://acme-staging-v02.api.letsencrypt.org/directory" : + "https://acme-v02.api.letsencrypt.org/directory", + "https://acme-staging-v02.api.letsencrypt.org/directory" + ) + }) } + resource "kubectl_manifest" "namespace_issuer" { - yaml_body = data.template_file.issuer.rendered + yaml_body = local.issuer_template } resource "kubernetes_secret_v1" "namespace-cert-replicator" { diff --git a/k8s/oci/namespace/issuer.tf b/k8s/oci/namespace/issuer.tf index 61738bd4..54ca6288 100644 --- a/k8s/oci/namespace/issuer.tf +++ b/k8s/oci/namespace/issuer.tf @@ -1,14 +1,19 @@ -data "template_file" "issuer" { - template = file("./templates/issuer.yaml") - vars = { - namespace = kubernetes_namespace.app_environments.metadata[0].name +locals { + issuer_yaml = templatefile("${path.module}/templates/issuer.yaml", { + namespace = kubernetes_namespace.app_environments.metadata[0].name email = var.cert_issuer_config.email - cert_issuer_url = try(var.cert_issuer_config.env == "stage" ? "https://acme-staging-v02.api.letsencrypt.org/directory" : "https://acme-v02.api.letsencrypt.org/directory","https://acme-staging-v02.api.letsencrypt.org/directory") - } + cert_issuer_url = try( + var.cert_issuer_config.env == "stage" ? 
+ "https://acme-staging-v02.api.letsencrypt.org/directory" : + "https://acme-v02.api.letsencrypt.org/directory", + "https://acme-staging-v02.api.letsencrypt.org/directory" + ) + }) } + resource "kubectl_manifest" "namespace_issuer" { - yaml_body = data.template_file.issuer.rendered + yaml_body = local.issuer_yaml } resource "kubernetes_secret_v1" "namespace-cert-replicator" { diff --git a/k8s/oci/oke/autoscale.tf b/k8s/oci/oke/autoscale.tf index 69dc2961..2515ece2 100644 --- a/k8s/oci/oke/autoscale.tf +++ b/k8s/oci/oke/autoscale.tf @@ -5,12 +5,11 @@ resource "null_resource" "wait_for_cluster" { depends_on = [module.oke] } -data "template_file" "autoscale_template" { - template = file("./templates/cluster-auto-scaler-values.yaml") - vars = { - CLUSTER_NAME = local.cluster_name - REGION = var.app_region - } +locals { + autoscale_yaml = templatefile("${path.module}/templates/cluster-auto-scaler-values.yaml", { + CLUSTER_NAME = local.cluster_name + REGION = var.app_region + }) } resource "helm_release" "auto_scaler" { @@ -20,7 +19,7 @@ resource "helm_release" "auto_scaler" { namespace = "kube-system" version = "9.28.0" - values = [data.template_file.autoscale_template.rendered] + values = [local.autoscale_yaml] depends_on = [null_resource.wait_for_cluster] } \ No newline at end of file diff --git a/k8s/oci/oke/cert-manager.tf b/k8s/oci/oke/cert-manager.tf index e29a09b7..158a93fe 100644 --- a/k8s/oci/oke/cert-manager.tf +++ b/k8s/oci/oke/cert-manager.tf @@ -2,6 +2,38 @@ locals { private_key_content = < 0 ] -} - -data template_file "fluent-bit"{ - count = local.fluent_bit_enable ? 
1 : 0 - template = file("./templates/fluent-bit-values.yaml") - vars = { - "CLUSTER_NAME" = local.cluster_name - "TAGS" = join(",", [for key, value in local.common_tags : "${key}=${value}"]) - - "HTTP_SERVER" = "On" - "HTTP_PORT" = "2020" - - "READ_FROM_HEAD" = "Off" - "READ_FROM_TAIL" = "On" - - fluent_bit_loki_outputs = jsonencode(local.fluent_bit_loki_outputs) - fluent_bit_http_outputs = jsonencode(local.fluent_bit_http_outputs) + fluent_bit_yaml = local.fluent_bit_enable ? templatefile("${path.module}/templates/fluent-bit-values.yaml", { + CLUSTER_NAME = local.cluster_name + TAGS = join(",", [for key, value in local.common_tags : "${key}=${value}"]) + HTTP_SERVER = "On" + HTTP_PORT = "2020" + READ_FROM_HEAD = "Off" + READ_FROM_TAIL = "On" + fluent_bit_loki_outputs = jsonencode(local.fluent_bit_loki_outputs) + fluent_bit_http_outputs = jsonencode(local.fluent_bit_http_outputs) fluent_bit_splunk_outputs = jsonencode(local.fluent_bit_splunk_outputs) fluent_bit_datadog_outputs = jsonencode(local.fluent_bit_datadog_outputs) fluent_bit_newrelic_outputs = jsonencode(local.fluent_bit_newrelic_outputs) fluent_bit_slack_outputs = jsonencode(local.fluent_bit_slack_outputs) - } + }) : null } + resource "helm_release" "fluentbit-config" { count = local.fluent_bit_enable ? 1 : 0 repository = "https://fluent.github.io/helm-charts" @@ -101,7 +94,7 @@ resource "helm_release" "fluentbit-config" { namespace = kubernetes_namespace.monitoring.metadata.0.name values = [ - data.template_file.fluent-bit[0].rendered + local.fluent_bit_yaml ] depends_on = [ kubernetes_namespace.monitoring diff --git a/k8s/oci/oke/grafana.tf b/k8s/oci/oke/grafana.tf index 84eee49f..66f4f3ff 100644 --- a/k8s/oci/oke/grafana.tf +++ b/k8s/oci/oke/grafana.tf @@ -6,51 +6,47 @@ locals { prometheus_enable = try(var.observability_config.prometheus != null ? var.observability_config.prometheus.enable : true, true) grafana_enable = try(var.observability_config.grafana != null ? 
var.observability_config.grafana.enable : false, false) grafana_host = try(var.observability_config.grafana.url != null ? var.observability_config.grafana.url : (local.domain_name != "" && !var.public_ingress ? "grafana.${local.domain_name}" : ""), "") -} - -resource "random_password" "observability_admin" { - count = local.grafana_enable ? 1 : 0 - length = 16 - special = false -} - -data "template_file" "grafana_template" { - count = local.grafana_enable ? 1 : 0 - template = file("./templates/grafana-values.yaml") - vars = { + grafana_values = local.grafana_enable ? templatefile("${path.module}/templates/grafana-values.yaml", { NAMESPACE = "monitoring" GRAFANA_HOST = local.grafana_host GRAFANA_ENABLED = local.grafana_enable GRAFANA_TLS_HOST = "*.${local.domain_name}" - GRAFANA_OBS_ADMIN_PASSWORD = try(local.grafana_enable ? try(random_password.observability_admin.0.result, "") : "", "") + GRAFANA_OBS_ADMIN_PASSWORD = try(random_password.observability_admin[0].result, "") CLUSTER_NAME = var.app_name PERSISTENCE_TYPE_DB = try(var.observability_config.grafana.persistence.type == "db" ? true : false, false) PERSISTENCE_TYPE_PVC = try(var.observability_config.grafana.persistence.type == "pvc" ? true : false, false) - PERSISTENCE_DISK_SIZE = try(var.observability_config.grafana.persistence.disk_size != null ? var.observability_config.grafana.persistence.disk_size : "10Gi", "10Gi") - GRAFANA_DB_NAME = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "grafana" : "", "") - GRAFANA_DB_TYPE = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "postgres" : "", "") - GRAFANA_DB_HOST = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "module.grafana_db[0].db_url" : "", "") - GRAFANA_DB_PASSWORD = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? 
"module.grafana_db[0].db_password" : "", "") - GRAFANA_DB_USER = try(local.grafana_enable && var.observability_config.grafana.persistence.type == "db" ? "module.grafana_db[0].db_admin_user" : "", "") - GRAFANA_MIN_REPLICA = try(var.observability_config.grafana.min_replica != null ? var.observability_config.grafana.min_replica : 1, 1) - GRAFANA_MAX_REPLICA = try(var.observability_config.grafana.max_replica != null ? var.observability_config.grafana.max_replica : 10, 10) - GRAFANA_REQUEST_MEMORY = try(var.observability_config.grafana.request_memory != null ? var.observability_config.grafana.request_memory : "100Mi", "100Mi") - GRAFANA_REQUEST_CPU = try( var.observability_config.grafana.request_cpu != null ? var.observability_config.grafana.request_cpu : "100m", "100m") - GRAFANA_LIMIT_MEMORY = try(var.observability_config.grafana.limit_memory != null ? var.observability_config.grafana.limit_memory: "500Mi", "500Mi") - GRAFANA_LIMIT_CPU = try( var.observability_config.grafana.limit_cpu != null ? var.observability_config.grafana.limit_cpu : "500m", "500m") - GRAFANA_DASHBOARD_LIMIT_MEMORY = try(var.observability_config.grafana.dashboard.limit_memory != null ? var.observability_config.grafana.dashboard.limit_memory : "512Mi", "512Mi") - GRAFANA_DASHBOARD_LIMIT_CPU = try(var.observability_config.grafana.dashboard.limit_cpu != null ? var.observability_config.grafana.dashboard.limit_cpu : "512m", "512m") - GRAFANA_DASHBOARD_REQUEST_MEMORY = try(var.observability_config.grafana.dashboard.request_memory != null ? var.observability_config.grafana.dashboard.request_memory : "256Mi", "256Mi") - GRAFANA_DASHBOARD_REQUEST_CPU = try(var.observability_config.grafana.dashboard.request_cpu != null ? var.observability_config.grafana.dashboard.request_cpu : "256m", "256m") - GRAFANA_DATASOURCE_LIMIT_MEMORY = try(var.observability_config.grafana.datasource.limit_memory != null ? 
var.observability_config.grafana.datasource.limit_memory : "512Mi", "512Mi") - GRAFANA_DATASOURCE_LIMIT_CPU = try(var.observability_config.grafana.datasource.limit_cpu != null ? var.observability_config.grafana.datasource.limit_cpu : "512m", "512m") - GRAFANA_DATASOURCE_REQUEST_MEMORY = try(var.observability_config.grafana.datasource.request_memory != null ? var.observability_config.grafana.datasource.request_memory : "256Mi", "256Mi") - GRAFANA_DATASOURCE_REQUEST_CPU = try(var.observability_config.grafana.datasource.request_cpu != null ? var.observability_config.grafana.datasource.request_cpu : "256m", "256m") - ENABLE_SSO = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? var.observability_config.grafana.configs.enable_sso : false) :false, false) - ALLOWED_DOMAINS = local.grafana_enable ? local.grafana_allowed_domains : "" - # OAUTH_ID = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.oci_secrets_secretbundle.oauth_client_id[0].secret_bundle_content[0].content : null) : null, null) - # OAUTH_SECRET = try(var.observability_config.grafana.configs != null ? (var.observability_config.grafana.configs.enable_sso != null ? data.oci_secrets_secretbundle.oauth_client_secret[0].secret_bundle_content[0].content : null) : null, null) - } + PERSISTENCE_DISK_SIZE = try(var.observability_config.grafana.persistence.disk_size, "10Gi") + GRAFANA_DB_NAME = try(var.observability_config.grafana.persistence.type == "db" ? "grafana" : "", "") + GRAFANA_DB_TYPE = try(var.observability_config.grafana.persistence.type == "db" ? "postgres" : "", "") + GRAFANA_DB_HOST = try(var.observability_config.grafana.persistence.type == "db" ? module.grafana_db[0].db_url : "", "") + GRAFANA_DB_PASSWORD = try(var.observability_config.grafana.persistence.type == "db" ? 
module.grafana_db[0].db_password : "", "") + GRAFANA_DB_USER = try(var.observability_config.grafana.persistence.type == "db" ? module.grafana_db[0].db_admin_user : "", "") + GRAFANA_MIN_REPLICA = try(var.observability_config.grafana.min_replica, 1) + GRAFANA_MAX_REPLICA = try(var.observability_config.grafana.max_replica, 10) + GRAFANA_REQUEST_MEMORY = try(var.observability_config.grafana.request_memory, "100Mi") + GRAFANA_REQUEST_CPU = try(var.observability_config.grafana.request_cpu, "100m") + GRAFANA_LIMIT_MEMORY = try(var.observability_config.grafana.limit_memory, "500Mi") + GRAFANA_LIMIT_CPU = try(var.observability_config.grafana.limit_cpu, "500m") + GRAFANA_DASHBOARD_LIMIT_MEMORY = try(var.observability_config.grafana.dashboard.limit_memory, "512Mi") + GRAFANA_DASHBOARD_LIMIT_CPU = try(var.observability_config.grafana.dashboard.limit_cpu, "512m") + GRAFANA_DASHBOARD_REQUEST_MEMORY = try(var.observability_config.grafana.dashboard.request_memory, "256Mi") + GRAFANA_DASHBOARD_REQUEST_CPU = try(var.observability_config.grafana.dashboard.request_cpu, "256m") + GRAFANA_DATASOURCE_LIMIT_MEMORY = try(var.observability_config.grafana.datasource.limit_memory, "512Mi") + GRAFANA_DATASOURCE_LIMIT_CPU = try(var.observability_config.grafana.datasource.limit_cpu, "512m") + GRAFANA_DATASOURCE_REQUEST_MEMORY = try(var.observability_config.grafana.datasource.request_memory, "256Mi") + GRAFANA_DATASOURCE_REQUEST_CPU = try(var.observability_config.grafana.datasource.request_cpu, "256m") + ENABLE_SSO = try(var.observability_config.grafana.configs.enable_sso, false) + ALLOWED_DOMAINS = local.grafana_allowed_domains + # Uncomment these when you have OCI secrets configured + # OAUTH_ID = try(data.oci_secrets_secretbundle.oauth_client_id[0].secret_bundle_content[0].content, null) + # OAUTH_SECRET = try(data.oci_secrets_secretbundle.oauth_client_secret[0].secret_bundle_content[0].content, null) + }) : null +} + +resource "random_password" "observability_admin" { + count = 
local.grafana_enable ? 1 : 0 + length = 16 + special = false } resource "helm_release" "grafana" { @@ -64,7 +60,7 @@ resource "helm_release" "grafana" { repository = "https://grafana.github.io/helm-charts" values = [ - data.template_file.grafana_template[count.index].rendered + local.grafana_values ] depends_on = [ helm_release.prometheus ] diff --git a/k8s/oci/oke/k8s-events.tf b/k8s/oci/oke/k8s-events.tf index a51a089e..88fec992 100644 --- a/k8s/oci/oke/k8s-events.tf +++ b/k8s/oci/oke/k8s-events.tf @@ -30,23 +30,18 @@ locals { }] : [] all_loki_receivers = concat(local.loki_receivers,local.observability_loki_recievers) -} - -data "template_file" "k8s_event_exporter" { - count = local.enable_k8s_event_exporter || local.enable_loki ? 1 : 0 - template = file("./templates/event-exporter-values.yaml") - vars = { + k8s_event_exporter_values = (local.enable_k8s_event_exporter || local.enable_loki) ? templatefile("${path.module}/templates/event-exporter-values.yaml", { CLUSTER_NAME = local.cluster_name - LOG_LEVEL = try(var.observability_config.kubernetes_event_exporter.log_level != null ? var.observability_config.kubernetes_event_exporter.log_level : "error" , "error") - MAX_EVENT_AGE_SECONDS = try(var.observability_config.kubernetes_event_exporter.max_event_age_second != null ? var.observability_config.kubernetes_event_exporter.max_event_age_second : "150" , "150") + LOG_LEVEL = try(coalesce(var.observability_config.kubernetes_event_exporter.log_level, "error"), "error") + MAX_EVENT_AGE_SECONDS = try(coalesce(var.observability_config.kubernetes_event_exporter.max_event_age_second, "150"), "150") LOKI_RECEIVER_CONFIGS = jsonencode(local.all_loki_receivers) WEBHOOK_RECEIVER_CONFIGS = jsonencode(local.webhook_receivers) - LIMIT_CPU = try(var.observability_config.kubernetes_event_exporter.resource.limit_cpu != null ? 
var.observability_config.kubernetes_event_exporter.resource.limit_cpu : "400m", "400m") - LIMIT_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.limit_memory != null ? var.observability_config.kubernetes_event_exporter.resource.limit_memory : "250Mi", "250Mi") - REQUEST_CPU = try(var.observability_config.kubernetes_event_exporter.resource.request_cpu != null ? var.observability_config.kubernetes_event_exporter.resource.request_cpu : "100m", "100m") - REQUEST_MEMORY = try(var.observability_config.kubernetes_event_exporter.resource.request_memory != null ? var.observability_config.kubernetes_event_exporter.resource.request_memory : "100Mi", "100Mi") - } + LIMIT_CPU = try(coalesce(var.observability_config.kubernetes_event_exporter.resource.limit_cpu, "400m"), "400m") + LIMIT_MEMORY = try(coalesce(var.observability_config.kubernetes_event_exporter.resource.limit_memory, "250Mi"), "250Mi") + REQUEST_CPU = try(coalesce(var.observability_config.kubernetes_event_exporter.resource.request_cpu, "100m"), "100m") + REQUEST_MEMORY = try(coalesce(var.observability_config.kubernetes_event_exporter.resource.request_memory, "100Mi"), "100Mi") + }) : null } resource "helm_release" "kubernetes_event_exporter" { @@ -59,6 +54,6 @@ resource "helm_release" "kubernetes_event_exporter" { namespace = helm_release.prometheus[0].namespace values = [ - data.template_file.k8s_event_exporter[count.index].rendered + local.k8s_event_exporter_values ] } \ No newline at end of file diff --git a/k8s/oci/oke/prometheus.tf b/k8s/oci/oke/prometheus.tf index 47676011..c3780bc2 100644 --- a/k8s/oci/oke/prometheus.tf +++ b/k8s/oci/oke/prometheus.tf @@ -62,41 +62,67 @@ locals{ }] : [] remote_write_config = concat(local.remote_write_config_list, local.default_remote_write_config) + prom_values = local.prometheus_enable ? 
templatefile("${path.module}/templates/prometheus-values.yaml", { + PROMETHEUS_DISK_SIZE = try(var.observability_config.prometheus.persistence.disk_size, "50Gi") + PROMETHEUS_RETENTION_SIZE = try(var.observability_config.prometheus.persistence.retention_size, "20GB") + PROMETHEUS_RETENTION_DURATION = try(var.observability_config.prometheus.persistence.retention_duration, "7d") + CLUSTER_NAME = local.cluster_name + REMOTE_WRITE_CONFIGS = jsonencode(local.remote_write_config) + + ALERTS_ENABLED = ( + jsonencode(local.cluster_moogsoft_alerts) != "" || + jsonencode(local.namespace_teams_webhook) != "" || + jsonencode(local.cluster_teams_alerts) != "" || + jsonencode(local.google_chat_alerts) != "" || + jsonencode(local.cluster_slack_alerts) != "" || + jsonencode(local.cluster_webhook_alerts) != "" + ) ? true : false + + MOOGSOFT_ALERTS_ENABLED = local.cluster_moogsoft_alerts != {} ? true : false + MS_TEAMS_ALERT_ENABLED = (jsonencode(local.namespace_teams_webhook) != "" || jsonencode(local.cluster_teams_alerts) != "") ? true : false + MOOGSOFT_ENDPOINT_URL = jsonencode(local.cluster_moogsoft_alerts) + MOOGSOFT_ENDPOINT_API_KEY = var.moogsoft_endpoint_api_key + MOOGSOFT_USERNAME = var.moogsoft_username + + teams_webhook_alerts = jsonencode(local.cluster_alerts) + cluster_moogsoft_alerts = jsonencode(local.cluster_moogsoft_alerts) + cluster_teams_alerts = jsonencode(local.cluster_alerts_webhook) + + GOOGLE_CHAT_ALERTS_ENABLED = local.google_chat_alerts != "" ? true : false + SLACK_CHAT_ALERTS_ENABLED = local.cluster_slack_alerts != "" ? true : false + WEBHOOK_ALERTS_ENABLED = local.cluster_webhook_alerts != "" ? true : false + + GOOGLE_CHAT_CONFIGS = jsonencode(local.google_chat_alerts) + SLACK_CONFIGS = jsonencode(local.cluster_slack_alerts) + WEBHOOK_CONFIGS = jsonencode(local.cluster_webhook_alerts) + + PAGER_DUTY_ALERTS_ENABLED = local.cluster_pagerduty_alerts != "" ? 
true : false + PAGER_DUTY_KEY = var.pagerduty_integration_key + PAGER_DUTY_ENDPOINT_URL = jsonencode(local.cluster_pagerduty_alerts) + + GRAFANA_HOST = local.grafana_enable ? local.grafana_host : "" + }) : null + + cluster_alerts = templatefile( + "${path.module}/templates/cluster-level-alerts.yaml", + { + cluster_memory_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : try(var.cluster_alert_thresholds.memory_underutilisation, 20) + cluster_cpu_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : try(var.cluster_alert_thresholds.cpu_underutilisation, 20) + cluster_node_count_max_value = var.node_config.size + cluster_node_count_threshold = var.cluster_alert_thresholds == null ? 80 : try(var.cluster_alert_thresholds.node_count, 80) + cluster_pod_count_threshold = var.cluster_alert_thresholds == null ? 80 : try(var.cluster_alert_thresholds.pod_count, 80) + cluster_total_cpu_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : try(var.cluster_alert_thresholds.cpu_utilisation, 80) + cluster_total_memory_utilization_threshold = var.cluster_alert_thresholds == null ? 20 : try(var.cluster_alert_thresholds.memory_utilisation, 20) + cluster_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : try(var.cluster_alert_thresholds.disk_utilization, 80) + cluster_name = local.cluster_name + cortex_enabled = try(var.observability_config.cortex.enable, false) + nginx_5xx_percentage_threshold = var.cluster_alert_thresholds == null ? 5 : try(var.cluster_alert_thresholds.nginx_5xx_percentage_threshold, 5) + cortex_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : try(var.cluster_alert_thresholds.cortex_disk_utilization_threshold, 80) + prometheus_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 
80 : try(var.cluster_alert_thresholds.prometheus_disk_utilization_threshold, 80) + } + ) } -data "template_file" "prom_template" { - count = local.prometheus_enable ? 1 : 0 - - template = file("./templates/prometheus-values.yaml") - vars = { - PROMETHEUS_DISK_SIZE = try(var.observability_config.prometheus.persistence.disk_size != null ? var.observability_config.prometheus.persistence.disk_size : "50Gi", "50Gi") - PROMETHEUS_RETENTION_SIZE = try(var.observability_config.prometheus.persistence.retention_size != null ? var.observability_config.prometheus.persistence.retention_size : "20GB", "20GB") - PROMETHEUS_RETENTION_DURATION = try(var.observability_config.prometheus.persistence.retention_duration != null ? var.observability_config.prometheus.persistence.retention_duration : "7d", "7d") - CLUSTER_NAME = local.cluster_name - REMOTE_WRITE_CONFIGS = jsonencode(local.remote_write_config) - ALERTS_ENABLED = jsonencode(local.cluster_moogsoft_alerts) != "" || jsonencode(local.namespace_teams_webhook) != "" || jsonencode(local.cluster_teams_alerts) != "" || jsonencode(local.google_chat_alerts) != "" || jsonencode(local.cluster_slack_alerts) != "" || jsonencode(local.cluster_webhook_alerts) != "" ? true : false - MOOGSOFT_ALERTS_ENABLED = local.cluster_moogsoft_alerts == {} ? false : true - MS_TEAMS_ALERT_ENABLED = jsonencode(local.namespace_teams_webhook) == "" && jsonencode(local.cluster_teams_alerts) == "" ? false : true - MOOGSOFT_ENDPOINT_URL = jsonencode(local.cluster_moogsoft_alerts) - MOOGSOFT_ENDPOINT_API_KEY = var.moogsoft_endpoint_api_key - MOOGSOFT_USERNAME = var.moogsoft_username - teams_webhook_alerts = jsonencode(local.cluster_alerts) - cluster_moogsoft_alerts = jsonencode(local.cluster_moogsoft_alerts) - cluster_teams_alerts = jsonencode(local.cluster_alerts_webhook) - GOOGLE_CHAT_ALERTS_ENABLED = local.google_chat_alerts == "" ? false : true - SLACK_CHAT_ALERTS_ENABLED = local.cluster_slack_alerts == "" ? 
false : true - WEBHOOK_ALERTS_ENABLED = local.cluster_webhook_alerts == "" ? false : true - GOOGLE_CHAT_CONFIGS = jsonencode(local.google_chat_alerts) - SLACK_CONFIGS = jsonencode(local.cluster_slack_alerts) - WEBHOOK_CONFIGS = jsonencode(local.cluster_webhook_alerts) - PAGER_DUTY_ALERTS_ENABLED = local.cluster_pagerduty_alerts == "" ? false : true - PAGER_DUTY_KEY = var.pagerduty_integration_key - PAGER_DUTY_ENDPOINT_URL = jsonencode(local.cluster_pagerduty_alerts) - GRAFANA_HOST = local.grafana_enable ? local.grafana_host : "" - } -} - - resource "helm_release" "prometheus" { count = local.prometheus_enable ? 1 : 0 @@ -111,7 +137,7 @@ resource "helm_release" "prometheus" { repository = "https://prometheus-community.github.io/helm-charts" values = [ - data.template_file.prom_template[count.index].rendered + local.prom_values ] } @@ -129,27 +155,8 @@ resource "helm_release" "alerts_teams" { ] } -data "template_file" "cluster-alerts" { - template = file("./templates/cluster-level-alerts.yaml") - vars = { - cluster_memory_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_underutilisation != null ? var.cluster_alert_thresholds.memory_underutilisation : 20) - cluster_cpu_usage_request_underutilisation_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.cpu_underutilisation != null ? var.cluster_alert_thresholds.cpu_underutilisation : 20) - cluster_node_count_max_value = var.node_config.size - cluster_node_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.node_count != null ? var.cluster_alert_thresholds.node_count : 80) - cluster_pod_count_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.pod_count != null ? var.cluster_alert_thresholds.pod_count: 80) - cluster_total_cpu_utilization_threshold = var.cluster_alert_thresholds == null ? 
80 : (var.cluster_alert_thresholds.cpu_utilisation != null ? var.cluster_alert_thresholds.cpu_utilisation: 80) - cluster_total_memory_utilization_threshold = var.cluster_alert_thresholds == null ? 20 : (var.cluster_alert_thresholds.memory_utilisation != null ? var.cluster_alert_thresholds.memory_utilisation: 20) - cluster_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.disk_utilization != null ? var.cluster_alert_thresholds.disk_utilization: 80) - cluster_name = local.cluster_name - cortex_enabled = try(var.observability_config.cortex == null ? false : var.observability_config.cortex.enable, false) - nginx_5xx_percentage_threshold = var.cluster_alert_thresholds == null ? 5 : (var.cluster_alert_thresholds.nginx_5xx_percentage_threshold != null ? var.cluster_alert_thresholds.nginx_5xx_percentage_threshold: 5) - cortex_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.cortex_disk_utilization_threshold != null ? var.cluster_alert_thresholds.cortex_disk_utilization_threshold : 80) - prometheus_disk_utilization_threshold = var.cluster_alert_thresholds == null ? 80 : (var.cluster_alert_thresholds.prometheus_disk_utilization_threshold != null ? var.cluster_alert_thresholds.prometheus_disk_utilization_threshold : 80) - } -} - resource "kubectl_manifest" "cluster-alerts" { count = local.prometheus_enable ? 
1 : 0 - yaml_body = data.template_file.cluster-alerts.rendered + yaml_body = local.cluster_alerts depends_on = [helm_release.prometheus] } \ No newline at end of file diff --git a/k8s/oci/oke/reloader.tf b/k8s/oci/oke/reloader.tf index ed503948..6f20a70a 100644 --- a/k8s/oci/oke/reloader.tf +++ b/k8s/oci/oke/reloader.tf @@ -1,7 +1,5 @@ -data "template_file" "reloader_template" { - template = file("${path.module}/templates/reloader-values.yaml") - vars = { - } +locals { + reloader_values = templatefile("${path.module}/templates/reloader-values.yaml", {}) } resource "helm_release" "reloader" { @@ -11,7 +9,7 @@ resource "helm_release" "reloader" { version = "1.0.60" values = [ - data.template_file.reloader_template.rendered + local.reloader_values ] depends_on = [ diff --git a/object-storage/aws/main.tf b/object-storage/aws/main.tf index 1945827f..548daaea 100644 --- a/object-storage/aws/main.tf +++ b/object-storage/aws/main.tf @@ -1,7 +1,11 @@ resource "aws_s3_bucket" "s3_bucket" { for_each = toset(var.bucket_names) bucket = each.key - force_destroy = true + force_destroy = false + + lifecycle { + prevent_destroy = true + } } resource "aws_s3_bucket_versioning" "s3_versioning" { @@ -11,4 +15,28 @@ resource "aws_s3_bucket_versioning" "s3_versioning" { versioning_configuration { status = var.enable_versioning ? 
"Enabled" : "Suspended" } -} \ No newline at end of file +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "s3_encryption" { + for_each = aws_s3_bucket.s3_bucket + + bucket = each.value.id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + } + bucket_key_enabled = true + } +} + +resource "aws_s3_bucket_public_access_block" "s3_public_access_block" { + for_each = aws_s3_bucket.s3_bucket + + bucket = each.value.id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} diff --git a/observability/alerts/cortex.tf b/observability/alerts/cortex.tf index a6dc8ebd..9cc98b7d 100644 --- a/observability/alerts/cortex.tf +++ b/observability/alerts/cortex.tf @@ -1,16 +1,17 @@ -data "template_file" "cortex_alerts" { - count = local.configure_cortex_alerts ? 1 : 0 - template = file("${path.module}/templates/cortex-alerts.yaml") - vars = { - cluster_name = var.cluster_name - distributor_replica_threshold = var.cortex.alerts == null ? 1 : (var.cortex.alerts.distributor_replica == null ? 1 : var.cortex.alerts.distributor_replica) - ingester_replica_threshold = var.cortex.alerts == null ? 2 : (var.cortex.alerts.ingester_replica == null ? 2 : var.cortex.alerts.ingester_replica) - querier_replica_threshold = var.cortex.alerts == null ? 3 : (var.cortex.alerts.querier_replica == null ? 3 : var.cortex.alerts.querier_replica) - queryfrontend_replica_threshold = var.cortex.alerts == null ? 1 : (var.cortex.alerts.query_frontend_replica == null ? 1 : var.cortex.alerts.query_frontend_replica) - } +locals { + cortex_alerts = local.configure_cortex_alerts ? 
templatefile( + "${path.module}/templates/cortex-alerts.yaml", + { + cluster_name = var.cluster_name + distributor_replica_threshold = try(var.cortex.alerts.distributor_replica, 1) + ingester_replica_threshold = try(var.cortex.alerts.ingester_replica, 2) + querier_replica_threshold = try(var.cortex.alerts.querier_replica, 3) + queryfrontend_replica_threshold = try(var.cortex.alerts.query_frontend_replica, 1) + } + ) : null } resource "kubectl_manifest" "cortex_alerts" { count = local.configure_cortex_alerts ? 1 : 0 - yaml_body = data.template_file.cortex_alerts[0].rendered + yaml_body = local.cortex_alerts } \ No newline at end of file diff --git a/observability/alerts/loki.tf b/observability/alerts/loki.tf index 3fa9995f..1dcdec3b 100644 --- a/observability/alerts/loki.tf +++ b/observability/alerts/loki.tf @@ -1,22 +1,23 @@ -data "template_file" "loki_alerts" { - count = local.configure_loki_alerts ? 1 : 0 - template = file("${path.module}/templates/loki-alerts.yaml") - vars = { - cluster_name = var.cluster_name - distributor_lines_received_threshold = var.loki.alerts == null ? 10000 : (var.loki.alerts.distributor_lines_received == null ? 10000: var.loki.alerts.distributor_lines_received) - distributor_bytes_received_threshold = var.loki.alerts == null ? 0 : (var.loki.alerts.distributor_bytes_received == null ? 0: var.loki.alerts.distributor_bytes_received) - distributor_appended_failures_threshold = var.loki.alerts == null ? 10000 : (var.loki.alerts.distributor_appended_failures == null ? 10000 : var.loki.alerts.distributor_appended_failures) - request_errors_threshold = var.loki.alerts == null ? 10 : (var.loki.alerts.request_errors == null ? 10 : var.loki.alerts.request_errors) - panics_threshold = var.loki.alerts == null ? 0 : (var.loki.alerts.panics == null ? 0 : var.loki.alerts.panics) - request_latency_threshold = var.loki.alerts == null ? 1 : (var.loki.alerts.request_latency == null ? 
1 : var.loki.alerts.request_latency) - distributor_replica_threshold = var.loki.alerts == null ? 1 : (var.loki.alerts.distributor_replica == null ? 1 : var.loki.alerts.distributor_replica) - ingester_replica_threshold = var.loki.alerts == null ? 1 : (var.loki.alerts.ingester_replica == null ? 1 : var.loki.alerts.ingester_replica) - querier_replica_threshold = var.loki.alerts == null ? 4 : (var.loki.alerts.querier_replica == null ? 4 : var.loki.alerts.querier_replica) - queryfrontend_replica_threshold = var.loki.alerts == null ? 1 : (var.loki.alerts.query_frontend_replica == null ? 1 : var.loki.alerts.query_frontend_replica) - } +locals { + loki_alerts = local.configure_loki_alerts ? templatefile( + "${path.module}/templates/loki-alerts.yaml", + { + cluster_name = var.cluster_name + distributor_lines_received_threshold = try(var.loki.alerts.distributor_lines_received, 10000) + distributor_bytes_received_threshold = try(var.loki.alerts.distributor_bytes_received, 0) + distributor_appended_failures_threshold = try(var.loki.alerts.distributor_appended_failures, 10000) + request_errors_threshold = try(var.loki.alerts.request_errors, 10) + panics_threshold = try(var.loki.alerts.panics, 0) + request_latency_threshold = try(var.loki.alerts.request_latency, 1) + distributor_replica_threshold = try(var.loki.alerts.distributor_replica, 1) + ingester_replica_threshold = try(var.loki.alerts.ingester_replica, 1) + querier_replica_threshold = try(var.loki.alerts.querier_replica, 4) + queryfrontend_replica_threshold = try(var.loki.alerts.query_frontend_replica, 1) + } + ) : null } resource "kubectl_manifest" "loki_alerts" { count = local.configure_loki_alerts ? 
1 : 0 - yaml_body = data.template_file.loki_alerts[0].rendered + yaml_body = local.loki_alerts } \ No newline at end of file diff --git a/observability/alerts/mimir.tf b/observability/alerts/mimir.tf index 8ffccd83..52126535 100644 --- a/observability/alerts/mimir.tf +++ b/observability/alerts/mimir.tf @@ -1,16 +1,17 @@ -data "template_file" "mimir_alerts" { - count = local.configure_mimir_alerts ? 1 : 0 - template = file("${path.module}/templates/mimir-alerts.yaml") - vars = { - cluster_name = var.cluster_name - distributor_replica_threshold = var.mimir.alerts == null ? 1 : (var.mimir.alerts.distributor_replica == null ? 1 : var.mimir.alerts.distributor_replica) - ingester_replica_threshold = var.mimir.alerts == null ? 2 : (var.mimir.alerts.ingester_replica == null ? 2 : var.mimir.alerts.ingester_replica) - querier_replica_threshold = var.mimir.alerts == null ? 3 : (var.mimir.alerts.querier_replica == null ? 3 : var.mimir.alerts.querier_replica) - queryfrontend_replica_threshold = var.mimir.alerts == null ? 1 : (var.mimir.alerts.query_frontend_replica == null ? 1 : var.mimir.alerts.query_frontend_replica) - } +locals { + mimir_alerts = local.configure_mimir_alerts ? templatefile( + "${path.module}/templates/mimir-alerts.yaml", + { + cluster_name = var.cluster_name + distributor_replica_threshold = try(var.mimir.alerts.distributor_replica, 1) + ingester_replica_threshold = try(var.mimir.alerts.ingester_replica, 2) + querier_replica_threshold = try(var.mimir.alerts.querier_replica, 3) + queryfrontend_replica_threshold = try(var.mimir.alerts.query_frontend_replica, 1) + } + ) : null } resource "kubectl_manifest" "mimir_alerts" { count = local.configure_mimir_alerts ? 
1 : 0 - yaml_body = data.template_file.mimir_alerts[0].rendered + yaml_body = local.mimir_alerts } \ No newline at end of file diff --git a/observability/alerts/tempo.tf b/observability/alerts/tempo.tf index bd9b515e..11689abd 100644 --- a/observability/alerts/tempo.tf +++ b/observability/alerts/tempo.tf @@ -1,24 +1,25 @@ -data "template_file" "tempo_alerts" { - count = local.configure_tempo_alerts ? 1 : 0 - template = file("${path.module}/templates/tempo-alerts.yaml") - vars = { - cluster_name = var.cluster_name - ingester_bytes_received_threshold = var.tempo.alerts == null ? 0 : (var.tempo.alerts.ingester_bytes_received == null ? 0: var.tempo.alerts.ingester_bytes_received) - distributor_ingester_appends_threshold = var.tempo.alerts == null ? 0 : (var.tempo.alerts.distributor_ingester_appends == null ? 0 : var.tempo.alerts.distributor_ingester_appends) - distributor_ingester_append_failures_threshold = var.tempo.alerts == null ? 2000 : (var.tempo.alerts.distributor_ingester_append_failures == null ? 2000 : var.tempo.alerts.distributor_ingester_append_failures) - ingester_live_traces_threshold = var.tempo.alerts == null ? 20000 : (var.tempo.alerts.ingester_live_traces == null ? 20000 : var.tempo.alerts.ingester_live_traces) - distributor_spans_received_threshold = var.tempo.alerts == null ? 0 : (var.tempo.alerts.distributor_spans_received == null ? 0 : var.tempo.alerts.distributor_spans_received) - distributor_bytes_received_threshold = var.tempo.alerts == null ? 0 : (var.tempo.alerts.distributor_bytes_received == null ? 0 : var.tempo.alerts.distributor_bytes_received) - ingester_blocks_flushed_threshold = var.tempo.alerts == null ? 0 : (var.tempo.alerts.ingester_blocks_flushed == null ? 0 : var.tempo.alerts.ingester_blocks_flushed) - tempodb_blocklist_threshold = var.tempo.alerts == null ? 2000 : (var.tempo.alerts.tempodb_blocklist == null ? 2000 : var.tempo.alerts.tempodb_blocklist) - distributor_replica_threshold = var.tempo.alerts == null ? 
1 : (var.tempo.alerts.distributor_replica == null ? 1 : var.tempo.alerts.distributor_replica) - ingester_replica_threshold = var.tempo.alerts == null ? 1 : (var.tempo.alerts.ingester_replica == null ? 1 : var.tempo.alerts.ingester_replica) - querier_replica_threshold = var.tempo.alerts == null ? 1 : (var.tempo.alerts.querier_replica == null ? 1 : var.tempo.alerts.querier_replica) - queryfrontend_replica_threshold = var.tempo.alerts == null ? 1 : (var.tempo.alerts.query_frontend_replica == null ? 1 : var.tempo.alerts.query_frontend_replica) - } +locals { + tempo_alerts = local.configure_tempo_alerts ? templatefile( + "${path.module}/templates/tempo-alerts.yaml", + { + cluster_name = var.cluster_name + ingester_bytes_received_threshold = try(var.tempo.alerts.ingester_bytes_received, 0) + distributor_ingester_appends_threshold = try(var.tempo.alerts.distributor_ingester_appends, 0) + distributor_ingester_append_failures_threshold = try(var.tempo.alerts.distributor_ingester_append_failures, 2000) + ingester_live_traces_threshold = try(var.tempo.alerts.ingester_live_traces, 20000) + distributor_spans_received_threshold = try(var.tempo.alerts.distributor_spans_received, 0) + distributor_bytes_received_threshold = try(var.tempo.alerts.distributor_bytes_received, 0) + ingester_blocks_flushed_threshold = try(var.tempo.alerts.ingester_blocks_flushed, 0) + tempodb_blocklist_threshold = try(var.tempo.alerts.tempodb_blocklist, 2000) + distributor_replica_threshold = try(var.tempo.alerts.distributor_replica, 1) + ingester_replica_threshold = try(var.tempo.alerts.ingester_replica, 1) + querier_replica_threshold = try(var.tempo.alerts.querier_replica, 1) + queryfrontend_replica_threshold = try(var.tempo.alerts.query_frontend_replica, 1) + } + ) : null } resource "kubectl_manifest" "tempo_alerts" { count = local.configure_tempo_alerts ? 
1 : 0 - yaml_body = data.template_file.tempo_alerts[0].rendered + yaml_body = local.tempo_alerts } \ No newline at end of file diff --git a/observability/aws/cortex.tf b/observability/aws/cortex.tf index ff0c1bac..b6d29da4 100644 --- a/observability/aws/cortex.tf +++ b/observability/aws/cortex.tf @@ -1,7 +1,132 @@ +locals { + cortex_template = local.enable_cortex ? templatefile( + "${path.module}/templates/cortex-values.yaml", + { + BUCKET_NAME = aws_s3_bucket.cortex_data[0].id + AWS_SECRET = var.access_secret + AWS_KEY = var.access_key + cluster_name = local.cluster_name + app_region = var.app_region + + limits_ingestion_rate = try(var.cortex.limits.ingestion_rate, "250000") + limits_ingestion_burst_size = try(var.cortex.limits.ingestion_burst_size, "500000") + limits_max_series_per_metric = try(var.cortex.limits.max_series_per_metric, "0") + limits_max_series_per_user = try(var.cortex.limits.max_series_per_user, "0") + limits_max_fetched_chunks_per_query = try(var.cortex.limits.max_fetched_chunks_per_query, "3000000") + query_range_memcached_client_timeout = try(var.cortex.query_range.memcached_client_timeout, "30s") + + compactor_enable = try(var.cortex.compactor.enable, "true") + compactor_replicas = try(var.cortex.compactor.replicas, "1") + compactor_persistence_volume_enable = try(var.cortex.compactor.persistence_volume.enable, "true") + compactor_persistence_volume_size = try(var.cortex.compactor.persistence_volume.size, "20Gi") + compactor_min_cpu = try(var.cortex.compactor.min_cpu, "null") + compactor_min_memory = try(var.cortex.compactor.min_memory, "null") + compactor_max_cpu = try(var.cortex.compactor.max_cpu, "null") + compactor_max_memory = try(var.cortex.compactor.max_memory, "null") + + ingester_replicas = try(var.cortex.ingester.replicas, "1") + ingester_persistence_volume_size = try(var.cortex.ingester.persistence_volume.size, "20Gi") + ingester_min_memory = try(var.cortex.ingester.min_memory, "null") + ingester_min_cpu = 
try(var.cortex.ingester.min_cpu, "null") + ingester_max_memory = try(var.cortex.ingester.max_memory, "null") + ingester_max_cpu = try(var.cortex.ingester.max_cpu, "null") + ingester_autoscaling = try(var.cortex.ingester.autoscaling, "true") + ingester_max_replicas = try(var.cortex.ingester.max_replicas, "100") + ingester_min_replicas = try(var.cortex.ingester.min_replicas, "2") + ingester_memory_utilization = try(var.cortex.ingester.memory_utilization, "") + + querier_replicas = try(var.cortex.querier.replicas, "1") + querier_min_memory = try(var.cortex.querier.min_memory, "null") + querier_min_cpu = try(var.cortex.querier.min_cpu, "null") + querier_max_memory = try(var.cortex.querier.max_memory, "null") + querier_max_cpu = try(var.cortex.querier.max_cpu, "null") + querier_autoscaling = try(var.cortex.querier.autoscaling, "true") + querier_max_replicas = try(var.cortex.querier.max_replicas, "20") + querier_min_replicas = try(var.cortex.querier.min_replicas, "2") + querier_memory_utilization = try(var.cortex.querier.memory_utilization, "") + querier_cpu_utilization = try(var.cortex.querier.cpu_utilization, "") + + query_frontend_replicas = try(var.cortex.query_frontend.replicas, "4") + query_frontend_enable = try(var.cortex.query_frontend.enable, "true") + + store_gateway_replication_factor = try(var.cortex.store_gateway.replication_factor, "3") + store_gateway_replicas = try(var.cortex.store_gateway.replicas, "1") + store_gateway_persistence_volume_size = try(var.cortex.store_gateway.persistence_volume.size, "500Gi") + store_gateway_min_memory = try(var.cortex.store_gateway.min_memory, "null") + store_gateway_min_cpu = try(var.cortex.store_gateway.min_cpu, "null") + store_gateway_max_memory = try(var.cortex.store_gateway.max_memory, "null") + store_gateway_max_cpu = try(var.cortex.store_gateway.max_cpu, "null") + + memcached_frontend_enable = try(var.cortex.memcached_frontend.enable, "true") + memcached_frontend_min_memory = 
try(var.cortex.memcached_frontend.min_memory, "null") + memcached_frontend_min_cpu = try(var.cortex.memcached_frontend.min_cpu, "null") + memcached_frontend_max_memory = try(var.cortex.memcached_frontend.max_memory, "null") + memcached_frontend_max_cpu = try(var.cortex.memcached_frontend.max_cpu, "null") + + memcached_blocks_index_enable = try(var.cortex.memcached_blocks_index.enable, "true") + memcached_blocks_index_min_cpu = try(var.cortex.memcached_blocks_index.min_cpu, "null") + memcached_blocks_index_min_memory = try(var.cortex.memcached_blocks_index.min_memory, "null") + memcached_blocks_index_max_cpu = try(var.cortex.memcached_blocks_index.max_cpu, "null") + memcached_blocks_index_max_memory = try(var.cortex.memcached_blocks_index.max_memory, "null") + + memcached_blocks_enable = try(var.cortex.memcached_blocks.enable, "true") + memcached_blocks_min_memory = try(var.cortex.memcached_blocks.min_memory, "null") + memcached_blocks_min_cpu = try(var.cortex.memcached_blocks.min_cpu, "null") + memcached_blocks_max_memory = try(var.cortex.memcached_blocks.max_memory, "null") + memcached_blocks_max_cpu = try(var.cortex.memcached_blocks.max_cpu, "null") + + memcached_blocks_metadata_enable = try(var.cortex.memcached_blocks_metadata.enable, "true") + memcached_blocks_metadata_min_memory = try(var.cortex.memcached_blocks_metadata.min_memory, "null") + memcached_blocks_metadata_min_cpu = try(var.cortex.memcached_blocks_metadata.min_cpu, "null") + memcached_blocks_metadata_max_memory = try(var.cortex.memcached_blocks_metadata.max_memory, "null") + memcached_blocks_metadata_max_cpu = try(var.cortex.memcached_blocks_metadata.max_cpu, "null") + + distributor_replicas = try(var.cortex.distributor.replicas, "1") + distributor_min_memory = try(var.cortex.distributor.min_memory, "null") + distributor_min_cpu = try(var.cortex.distributor.min_cpu, "null") + distributor_max_memory = try(var.cortex.distributor.max_memory, "null") + distributor_max_cpu = 
try(var.cortex.distributor.max_cpu, "null") + distributor_autoscaling = try(var.cortex.distributor.autoscaling, "true") + distributor_max_replicas = try(var.cortex.distributor.max_replicas, "30") + distributor_min_replicas = try(var.cortex.distributor.min_replicas, "2") + distributor_memory_utilization = try(var.cortex.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.cortex.distributor.cpu_utilization, "") + } + ) : "" +} + resource "aws_s3_bucket" "cortex_data" { count = local.enable_cortex ? 1 : 0 bucket = "${local.cluster_name}-cortex-data-${var.observability_suffix}" - force_destroy = "true" + force_destroy = false + + lifecycle { + prevent_destroy = true + } +} + +resource "aws_s3_bucket_public_access_block" "cortex_public_access_block" { + count = local.enable_cortex ? 1 : 0 + bucket = aws_s3_bucket.cortex_data[0].id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + + + +resource "aws_s3_bucket_server_side_encryption_configuration" "cortex_data_encryption" { + count = local.enable_cortex ? 1 : 0 + bucket = aws_s3_bucket.cortex_data[0].id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + } + bucket_key_enabled = true + } } resource "kubernetes_secret" "cortex-aws-credentials" { @@ -22,91 +147,6 @@ resource "kubernetes_secret" "cortex-aws-credentials" { } -data "template_file" "cortex_template"{ - count = local.enable_cortex ? 1 : 0 - template = file("${path.module}/templates/cortex-values.yaml") - vars = { - "BUCKET_NAME" = aws_s3_bucket.cortex_data[0].id - "AWS_SECRET" = var.access_secret - "AWS_KEY" = var.access_key - cluster_name = local.cluster_name - app_region = var.app_region - limits_ingestion_rate = try(var.cortex.limits.ingestion_rate != null ? var.cortex.limits.ingestion_rate : "250000", "250000") - limits_ingestion_burst_size = try(var.cortex.limits.ingestion_burst_size != null ? 
var.cortex.limits.ingestion_burst_size : "500000", "500000") - limits_max_series_per_metric = try(var.cortex.limits.max_series_per_metric != null ? var.cortex.limits.max_series_per_metric : "0", "0") - limits_max_series_per_user = try(var.cortex.limits.max_series_per_user != null ? var.cortex.limits.max_series_per_user : "0", "0") - limits_max_fetched_chunks_per_query = try(var.cortex.limits.max_fetched_chunks_per_query != null ? var.cortex.limits.max_fetched_chunks_per_query : "3000000", "3000000") - query_range_memcached_client_timeout = try(var.cortex.query_range.memcached_client_timeout != null ? var.cortex.query_range.memcached_client_timeout : "30s", "30s") - compactor_enable = try(var.cortex.compactor.enable != null ? var.cortex.compactor.enable : "true", "true") - compactor_replicas = try(var.cortex.compactor.replicas != null ? var.cortex.compactor.replicas : "1", "1") - compactor_persistence_volume_enable = try(var.cortex.compactor.persistence_volume.enable != null ? var.cortex.compactor.persistence_volume.enable : "true", "true") - compactor_persistence_volume_size = try(var.cortex.compactor.persistence_volume.size != null ? var.cortex.compactor.persistence_volume.size : "20Gi", "20Gi") - compactor_min_cpu = try(var.cortex.compactor.min_cpu != null ? var.cortex.compactor.min_cpu : "null", "null") - compactor_min_memory = try(var.cortex.compactor.min_memory != null ? var.cortex.compactor.min_memory : "null", "null") - compactor_max_cpu = try(var.cortex.compactor.max_cpu != null ? var.cortex.compactor.max_cpu : "null", "null") - compactor_max_memory = try(var.cortex.compactor.max_memory != null ? var.cortex.compactor.max_memory : "null", "null") - ingester_replicas = try(var.cortex.ingester.replicas != null ? var.cortex.ingester.replicas : "1", "1") - ingester_persistence_volume_size = try(var.cortex.ingester.persistence_volume.size != null ? 
var.cortex.ingester.persistence_volume.size : "20Gi", "20Gi") - ingester_min_memory = try(var.cortex.ingester.min_memory != null ? var.cortex.ingester.min_memory : "null", "null") - ingester_min_cpu = try(var.cortex.ingester.min_cpu != null ? var.cortex.ingester.min_cpu : "null", "null") - ingester_max_memory = try(var.cortex.ingester.max_memory != null ? var.cortex.ingester.max_memory : "null", "null") - ingester_max_cpu = try(var.cortex.ingester.max_cpu != null ? var.cortex.ingester.max_cpu : "null", "null") - ingester_autoscaling = try(var.cortex.ingester.autoscaling != null ? var.cortex.ingester.autoscaling : "true", "true") - ingester_max_replicas = try(var.cortex.ingester.max_replicas != null ? var.cortex.ingester.max_replicas : "100", "100") - ingester_min_replicas = try(var.cortex.ingester.min_replicas != null ? var.cortex.ingester.min_replicas : "2", "2") - ingester_memory_utilization = try(var.cortex.ingester.memory_utilization != null ? var.cortex.ingester.memory_utilization : "", "") - querier_replicas = try(var.cortex.querier.replicas != null ? var.cortex.querier.replicas : "1", "1") - querier_min_memory = try(var.cortex.querier.min_memory != null ? var.cortex.querier.min_memory : "null", "null") - querier_min_cpu = try(var.cortex.querier.min_cpu != null ? var.cortex.querier.min_cpu : "null", "null") - querier_max_memory = try(var.cortex.querier.max_memory != null ? var.cortex.querier.max_memory : "null", "null") - querier_max_cpu = try(var.cortex.querier.max_cpu != null ? var.cortex.querier.max_cpu : "null", "null") - querier_autoscaling = try(var.cortex.querier.autoscaling != null ? var.cortex.querier.autoscaling : "true", "true") - querier_max_replicas = try(var.cortex.querier.max_replicas != null ? var.cortex.querier.max_replicas : "20", "20") - querier_min_replicas = try(var.cortex.querier.min_replicas != null ? var.cortex.querier.min_replicas : "2", "2") - querier_memory_utilization = try(var.cortex.querier.memory_utilization != null ? 
var.cortex.querier.memory_utilization : "", "") - querier_cpu_utilization = try(var.cortex.querier.cpu_utilization != null ? var.cortex.querier.cpu_utilization : "", "") - query_frontend_replicas = try(var.cortex.query_frontend.replicas != null ? var.cortex.query_frontend.replicas : "4", "4") - query_frontend_enable = try(var.cortex.query_frontend.enable != null ? var.cortex.query_frontend.enable : "true", "true") - store_gateway_replication_factor = try(var.cortex.store_gateway.replication_factor != null ? var.cortex.store_gateway.replication_factor : "3", "3") - store_gateway_replicas = try(var.cortex.store_gateway.replicas != null ? var.cortex.store_gateway.replicas : "1", "1") - store_gateway_persistence_volume_size = try(var.cortex.store_gateway.persistence_volume.size != null ? var.cortex.store_gateway.persistence_volume.size : "500Gi", "500Gi") - store_gateway_min_memory = try(var.cortex.store_gateway.min_memory != null ? var.cortex.store_gateway.min_memory : "null", "null") - store_gateway_min_cpu = try(var.cortex.store_gateway.min_cpu != null ? var.cortex.store_gateway.min_cpu : "null", "null") - store_gateway_max_memory = try(var.cortex.store_gateway.max_memory != null ? var.cortex.store_gateway.max_memory : "null", "null") - store_gateway_max_cpu = try(var.cortex.store_gateway.max_cpu != null ? var.cortex.store_gateway.max_cpu : "null", "null") - memcached_frontend_enable = try(var.cortex.memcached_frontend.enable != null ? var.cortex.memcached_frontend.enable : "true", "true") - memcached_frontend_min_memory = try(var.cortex.memcached_frontend.min_memory != null ? var.cortex.memcached_frontend.min_memory : "null", "null") - memcached_frontend_min_cpu = try(var.cortex.memcached_frontend.min_cpu != null ? var.cortex.memcached_frontend.min_cpu : "null", "null") - memcached_frontend_max_memory = try(var.cortex.memcached_frontend.max_memory != null ? 
var.cortex.memcached_frontend.max_memory : "null", "null") - memcached_frontend_max_cpu = try(var.cortex.memcached_frontend.max_cpu != null ? var.cortex.memcached_frontend.max_cpu : "null", "null") - memcached_blocks_index_enable = try(var.cortex.memcached_blocks_index.enable != null ? var.cortex.memcached_blocks_index.enable : "true", "true") - memcached_blocks_index_min_cpu = try(var.cortex.memcached_blocks_index.min_cpu != null ? var.cortex.memcached_blocks_index.min_cpu : "null", "null") - memcached_blocks_index_min_memory = try(var.cortex.memcached_blocks_index.min_memory != null ? var.cortex.memcached_blocks_index.min_memory : "null", "null") - memcached_blocks_index_max_cpu = try(var.cortex.memcached_blocks_index.max_cpu != null ? var.cortex.memcached_blocks_index.max_cpu : "null", "null") - memcached_blocks_index_max_memory = try(var.cortex.memcached_blocks_index.max_memory != null ? var.cortex.memcached_blocks_index.max_memory : "null", "null") - memcached_blocks_enable = try(var.cortex.memcached_blocks.enable != null ? var.cortex.memcached_blocks.enable : "true", "true") - memcached_blocks_min_memory = try(var.cortex.memcached_blocks.min_memory != null ? var.cortex.memcached_blocks.min_memory : "null", "null") - memcached_blocks_min_cpu = try(var.cortex.memcached_blocks.min_cpu != null ? var.cortex.memcached_blocks.min_cpu : "null", "null") - memcached_blocks_max_memory = try(var.cortex.memcached_blocks.max_memory != null ? var.cortex.memcached_blocks.max_memory : "null", "null") - memcached_blocks_max_cpu = try(var.cortex.memcached_blocks.max_cpu != null ? var.cortex.memcached_blocks.max_cpu : "null", "null") - memcached_blocks_metadata_enable = try(var.cortex.memcached_blocks_metadata.enable != null ? var.cortex.memcached_blocks_metadata.enable : "true", "true") - memcached_blocks_metadata_min_memory = try(var.cortex.memcached_blocks_metadata.min_memory != null ? 
var.cortex.memcached_blocks_metadata.min_memory : "null", "null") - memcached_blocks_metadata_min_cpu = try(var.cortex.memcached_blocks_metadata.min_cpu != null ? var.cortex.memcached_blocks_metadata.min_cpu : "null", "null") - memcached_blocks_metadata_max_memory = try(var.cortex.memcached_blocks_metadata.max_memory != null ? var.cortex.memcached_blocks_metadata.max_memory : "null", "null") - memcached_blocks_metadata_max_cpu = try(var.cortex.memcached_blocks_metadata.max_cpu != null ? var.cortex.memcached_blocks_metadata.max_cpu : "null", "null") - distributor_replicas = try(var.cortex.distributor.replicas != null ? var.cortex.distributor.replicas : "1", "1") - distributor_min_memory = try(var.cortex.distributor.min_memory != null ? var.cortex.distributor.min_memory : "null", "null") - distributor_min_cpu = try(var.cortex.distributor.min_cpu != null ? var.cortex.distributor.min_cpu : "null", "null") - distributor_max_memory = try(var.cortex.distributor.max_memory != null ? var.cortex.distributor.max_memory : "null", "null") - distributor_max_cpu = try(var.cortex.distributor.max_cpu != null ? var.cortex.distributor.max_cpu : "null", "null") - distributor_autoscaling = try(var.cortex.distributor.autoscaling != null ? var.cortex.distributor.autoscaling : "true", "true") - distributor_max_replicas = try(var.cortex.distributor.max_replicas != null ? var.cortex.distributor.max_replicas : "30", "30") - distributor_min_replicas = try(var.cortex.distributor.min_replicas != null ? var.cortex.distributor.min_replicas : "2", "2") - distributor_memory_utilization = try(var.cortex.distributor.memory_utilization != null ? var.cortex.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.cortex.distributor.cpu_utilization != null ? var.cortex.distributor.cpu_utilization : "", "") - } -} - resource "helm_release" "cortex" { count = local.enable_cortex ? 
1 : 0 name = "cortex" @@ -116,6 +156,6 @@ resource "helm_release" "cortex" { version = "1.7.0" values = [ - data.template_file.cortex_template[0].rendered + local.cortex_template ] } diff --git a/observability/aws/loki.tf b/observability/aws/loki.tf index 2df42640..07859e38 100644 --- a/observability/aws/loki.tf +++ b/observability/aws/loki.tf @@ -1,58 +1,89 @@ +locals { + loki_template = local.enable_loki ? templatefile( + "${path.module}/templates/loki-values.yaml", + { + BUCKET_NAME = aws_s3_bucket.loki_data[0].id + AWS_SECRET = local.access_secret + AWS_KEY = local.access_key + app_region = var.app_region + + ingester_replicas = try(var.loki.ingester.replicas, "1") + ingester_max_memory = try(var.loki.ingester.max_memory, "null") + ingester_min_memory = try(var.loki.ingester.min_memory, "1Gi") + ingester_max_cpu = try(var.loki.ingester.max_cpu, "null") + ingester_min_cpu = try(var.loki.ingester.min_cpu, "null") + ingester_autoscaling = try(var.loki.ingester.autoscaling, "true") + ingester_max_replicas = try(var.loki.ingester.max_replicas, "30") + ingester_min_replicas = try(var.loki.ingester.min_replicas, "2") + ingester_cpu_utilization = try(var.loki.ingester.cpu_utilization, "") + ingester_memory_utilization = try(var.loki.ingester.memory_utilization, "") + + distributor_replicas = try(var.loki.distributor.replicas, "1") + distributor_max_memory = try(var.loki.distributor.max_memory, "1Gi") + distributor_min_memory = try(var.loki.distributor.min_memory, "512Mi") + distributor_max_cpu = try(var.loki.distributor.max_cpu, "1") + distributor_min_cpu = try(var.loki.distributor.min_cpu, "250m") + distributor_autoscaling = try(var.loki.distributor.autoscaling, "true") + distributor_max_replicas = try(var.loki.distributor.max_replicas, "30") + distributor_min_replicas = try(var.loki.distributor.min_replicas, "2") + distributor_memory_utilization = try(var.loki.distributor.memory_utilization, "") + distributor_cpu_utilization = 
try(var.loki.distributor.cpu_utilization, "") + + querier_replicas = try(var.loki.querier.replicas, "4") + querier_max_unavailable = try(var.loki.querier.max_unavailable, "1") + querier_min_memory = try(var.loki.querier.min_memory, "500Mi") + querier_min_cpu = try(var.loki.querier.min_cpu, "100m") + querier_max_memory = try(var.loki.querier.max_memory, "null") + querier_max_cpu = try(var.loki.querier.max_cpu, "null") + querier_autoscaling = try(var.loki.querier.autoscaling, "true") + querier_max_replicas = try(var.loki.querier.max_replicas, "6") + querier_min_replicas = try(var.loki.querier.min_replicas, "2") + querier_memory_utilization = try(var.loki.querier.memory_utilization, "") + querier_cpu_utilization = try(var.loki.querier.cpu_utilization, "") + + queryFrontend_replicas = try(var.loki.queryFrontend.replicas, "1") + queryFrontend_min_memory = try(var.loki.queryFrontend.min_memory, "250Mi") + queryFrontend_max_memory = try(var.loki.queryFrontend.max_memory, "null") + queryFrontend_min_cpu = try(var.loki.queryFrontend.min_cpu, "null") + queryFrontend_max_cpu = try(var.loki.queryFrontend.max_cpu, "null") + queryFrontend_autoscaling = try(var.loki.queryFrontend.autoscaling, "true") + queryFrontend_max_replicas = try(var.loki.queryFrontend.max_replicas, "6") + queryFrontend_min_replicas = try(var.loki.queryFrontend.min_replicas, "1") + queryFrontend_memory_utilization = try(var.loki.queryFrontend.memory_utilization, "") + queryFrontend_cpu_utilization = try(var.loki.queryFrontend.cpu_utilization, "") + } + ) : "" +} + resource "aws_s3_bucket" "loki_data" { count = local.enable_loki ? 1 : 0 bucket = "${local.cluster_name}-loki-data-${var.observability_suffix}" - force_destroy = "true" + force_destroy = false + + lifecycle { + prevent_destroy = true + } +} + +resource "aws_s3_bucket_public_access_block" "loki_public_access_block" { + count = local.enable_loki ? 
1 : 0 + bucket = aws_s3_bucket.loki_data[0].id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true } -data "template_file" "loki_template" { - count = local.enable_loki ? 1 : 0 - template = file("${path.module}/templates/loki-values.yaml") - vars = { - BUCKET_NAME = aws_s3_bucket.loki_data[0].id - AWS_SECRET = local.access_secret - AWS_KEY = local.access_key - app_region = var.app_region - ingester_replicas = try(var.loki.ingester.replicas != null ? var.loki.ingester.replicas : "1", "1") - ingester_max_memory = try(var.loki.ingester.max_memory != null ? var.loki.ingester.max_memory : "null", "null") - ingester_min_memory = try(var.loki.ingester.min_memory != null ? var.loki.ingester.min_memory : "1Gi", "1Gi") - ingester_max_cpu = try(var.loki.ingester.max_cpu != null ? var.loki.ingester.max_cpu : "null", "null") - ingester_min_cpu = try(var.loki.ingester.min_cpu != null ? var.loki.ingester.min_cpu : "null", "null") - ingester_autoscaling = try(var.loki.ingester.autoscaling != null ? var.loki.ingester.autoscaling : "true", "true") - ingester_max_replicas = try(var.loki.ingester.max_replicas != null ? var.loki.ingester.max_replicas : "30", "30") - ingester_min_replicas = try(var.loki.ingester.min_replicas != null ? var.loki.ingester.min_replicas : "2", "2") - ingester_cpu_utilization = try(var.loki.ingester.cpu_utilization != null ? var.loki.ingester.cpu_utilization : "", "") - ingester_memory_utilization = try(var.loki.ingester.memory_utilization != null ? var.loki.ingester.memory_utilization : "", "") - distributor_replicas = try(var.loki.distributor.replicas != null ? var.loki.distributor.replicas : "1", "1") - distributor_max_memory = try(var.loki.distributor.max_memory != null ? var.loki.distributor.max_memory : "1Gi", "1Gi") - distributor_min_memory = try(var.loki.distributor.min_memory != null ? 
var.loki.distributor.min_memory : "512Mi", "512Mi") - distributor_max_cpu = try(var.loki.distributor.max_cpu != null ? var.loki.distributor.max_cpu : "1", "1") - distributor_min_cpu = try(var.loki.distributor.min_cpu != null ? var.loki.distributor.min_cpu : "250m", "250m") - distributor_autoscaling = try(var.loki.distributor.autoscaling != null ? var.loki.distributor.autoscaling : "true", "true") - distributor_max_replicas = try(var.loki.distributor.max_replicas != null ? var.loki.distributor.max_replicas : "30", "30") - distributor_min_replicas = try(var.loki.distributor.min_replicas != null ? var.loki.distributor.min_replicas : "2", "2") - distributor_memory_utilization = try(var.loki.distributor.memory_utilization != null ? var.loki.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.loki.distributor.cpu_utilization != null ? var.loki.distributor.cpu_utilization : "", "") - querier_replicas = try(var.loki.querier.replicas != null ? var.loki.querier.replicas : "4", "4") - querier_max_unavailable = try(var.loki.querier.max_unavailable != null ? var.loki.querier.max_unavailable : "1", "1") - querier_min_memory = try(var.loki.querier.min_memory != null ? var.loki.querier.min_memory : "500Mi", "500Mi") - querier_min_cpu = try(var.loki.querier.min_cpu != null ? var.loki.querier.min_cpu : "100m", "100m") - querier_max_memory = try(var.loki.querier.max_memory != null ? var.loki.querier.max_memory : "null", "null") - querier_max_cpu = try(var.loki.querier.max_cpu != null ? var.loki.querier.max_cpu : "null", "null") - querier_autoscaling = try(var.loki.querier.autoscaling != null ? var.loki.querier.autoscaling : "true", "true") - querier_max_replicas = try(var.loki.querier.max_replicas != null ? var.loki.querier.max_replicas : "6", "6") - querier_min_replicas = try(var.loki.querier.min_replicas != null ? var.loki.querier.min_replicas : "2", "2") - querier_memory_utilization = try(var.loki.querier.memory_utilization != null ? 
var.loki.querier.memory_utilization : "", "") - querier_cpu_utilization = try(var.loki.querier.cpu_utilization != null ? var.loki.querier.cpu_utilization : "", "") - queryFrontend_replicas = try(var.loki.queryFrontend.replicas != null ? var.loki.queryFrontend.replicas : "1", "1") - queryFrontend_min_memory = try(var.loki.queryFrontend.min_memory != null ? var.loki.queryFrontend.min_memory : "250Mi", "250Mi") - queryFrontend_max_memory = try(var.loki.query_frontend.max_memory != null ? var.loki.query_frontend.max_memory : "null", "null") - queryFrontend_min_cpu = try(var.loki.query_frontend.min_cpu != null ? var.loki.query_frontend.min_cpu : "null", "null") - queryFrontend_max_cpu = try(var.loki.query_frontend.max_cpu != null ? var.loki.query_frontend.max_cpu : "null", "null") - queryFrontend_autoscaling = try(var.loki.queryFrontend.autoscaling != null ? var.loki.queryFrontend.autoscaling : "true", "true") - queryFrontend_max_replicas = try(var.loki.queryFrontend.max_replicas != null ? var.loki.queryFrontend.max_replicas : "6", "6") - queryFrontend_min_replicas = try(var.loki.queryFrontend.min_replicas != null ? var.loki.queryFrontend.min_replicas : "1", "1") - queryFrontend_memory_utilization= try(var.loki.queryFrontend.memory_utilization != null ? var.loki.queryFrontend.memory_utilization : "", "") - queryFrontend_cpu_utilization = try(var.loki.queryFrontend.cpu_utilization != null ? var.loki.queryFrontend.cpu_utilization : "", "") +resource "aws_s3_bucket_server_side_encryption_configuration" "loki_data_encryption" { + count = local.enable_loki ? 
1 : 0 + bucket = aws_s3_bucket.loki_data[0].id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + } + bucket_key_enabled = true } } @@ -65,6 +96,6 @@ resource "helm_release" "loki" { version = "0.68.0" values = [ - data.template_file.loki_template[0].rendered + local.loki_template ] } \ No newline at end of file diff --git a/observability/aws/main.tf b/observability/aws/main.tf index a6c30a83..07e9a284 100644 --- a/observability/aws/main.tf +++ b/observability/aws/main.tf @@ -9,11 +9,13 @@ locals { enable_tempo = try(var.tempo != null ? var.tempo.enable : false, false) enable_cortex = try(var.cortex != null ? var.cortex.enable : false, false) enable_mimir = try(var.mimir != null ? var.mimir.enable : false,false) + enable_openobserve = length(var.openobserve) > 0 && anytrue([for instance in var.openobserve : instance.enable]) enable_ingress_loki = local.enable_loki ? (var.loki.enable_ingress != null ? var.loki.enable_ingress : false ) : false enable_ingress_tempo = local.enable_tempo ? (var.tempo.enable_ingress != null ? var.tempo.enable_ingress : false ) : false enable_ingress_mimir = local.enable_mimir ? (var.mimir.enable_ingress != null ? var.mimir.enable_ingress : false ) : false enable_ingress_cortex = local.enable_cortex ? (var.cortex.enable_ingress != null ? var.cortex.enable_ingress : false ) : false + enable_ingress_openobserve = local.enable_openobserve ? anytrue([for instance in var.openobserve : instance.enable && try(instance.enable_ingress, true)]) : false app_namespaces = { loki = local.enable_loki ? { @@ -32,6 +34,10 @@ locals { services = ["mimir-distributor:8080"] ingress = local.enable_ingress_mimir } : null + openobserve = local.enable_openobserve ? 
{ + services = [for instance in var.openobserve : "${instance.name}-openobserve-standalone:5080" if instance.enable && instance.enable_ingress != false] + ingress = local.enable_ingress_openobserve + } : null } filtered_namespace = { diff --git a/observability/aws/mimir.tf b/observability/aws/mimir.tf index 7b82ebba..21421de0 100644 --- a/observability/aws/mimir.tf +++ b/observability/aws/mimir.tf @@ -1,7 +1,84 @@ +locals { + mimir_template = local.enable_mimir ? templatefile( + "${path.module}/templates/mimir-values.yaml", + { + BUCKET_NAME = aws_s3_bucket.mimir_data[0].id + cluster_name = local.cluster_name + app_region = var.app_region + AWS_SECRET = var.access_secret + AWS_KEY = var.access_key + limits_ingestion_rate = try(var.mimir.limits.ingestion_rate, "250000") + limits_ingestion_burst_size = try(var.mimir.limits.ingestion_burst_size, "500000") + limits_max_fetched_chunks_per_query = try(var.mimir.limits.max_fetched_chunks_per_query, "3000000") + limits_max_cache_freshness = try(var.mimir.limits.max_cache_freshness, "24h") + limits_max_outstanding_requests_per_tenant = try(var.mimir.limits.max_outstanding_requests_per_tenant, "1000") + compactor_replicas = try(var.mimir.compactor.replicas, "1") + compactor_persistence_volume_enable = try(var.mimir.compactor.persistence_volume.enable, "true") + compactor_persistence_volume_size = try(var.mimir.compactor.persistence_volume.size, "20Gi") + compactor_min_cpu = try(var.mimir.compactor.min_cpu, "null") + compactor_min_memory = try(var.mimir.compactor.min_memory, "null") + compactor_max_cpu = try(var.mimir.compactor.max_cpu, "null") + compactor_max_memory = try(var.mimir.compactor.max_memory, "null") + ingester_replicas = try(var.mimir.ingester.replicas, "2") + ingester_persistence_volume_size = try(var.mimir.ingester.persistence_volume.size, "20Gi") + ingester_min_memory = try(var.mimir.ingester.min_memory, "null") + ingester_min_cpu = try(var.mimir.ingester.min_cpu, "null") + ingester_max_memory = 
try(var.mimir.ingester.max_memory, "null") + ingester_max_cpu = try(var.mimir.ingester.max_cpu, "null") + querier_replicas = try(var.mimir.querier.replicas, "3") + querier_min_memory = try(var.mimir.querier.min_memory, "null") + querier_min_cpu = try(var.mimir.querier.min_cpu, "null") + querier_max_memory = try(var.mimir.querier.max_memory, "null") + querier_max_cpu = try(var.mimir.querier.max_cpu, "null") + query_frontend_replicas = try(var.mimir.query_frontend.replicas, "1") + store_gateway_replication_factor = try(var.mimir.store_gateway.replication_factor, "3") + store_gateway_replicas = try(var.mimir.store_gateway.replicas, "1") + store_gateway_persistence_volume_size = try(var.mimir.store_gateway.persistence_volume.size, "500Gi") + store_gateway_min_memory = try(var.mimir.store_gateway.min_memory, "null") + store_gateway_min_cpu = try(var.mimir.store_gateway.min_cpu, "null") + store_gateway_max_memory = try(var.mimir.store_gateway.max_memory, "null") + store_gateway_max_cpu = try(var.mimir.store_gateway.max_cpu, "null") + distributor_replicas = try(var.mimir.distributor.replicas, "1") + distributor_min_memory = try(var.mimir.distributor.min_memory, "null") + distributor_min_cpu = try(var.mimir.distributor.min_cpu, "null") + distributor_max_memory = try(var.mimir.distributor.max_memory, "null") + distributor_max_cpu = try(var.mimir.distributor.max_cpu, "null") + mimir_basic_auth_username = random_password.mimir_basic_auth_username[0].result + mimir_basic_auth_password = random_password.mimir_basic_auth_password[0].result + } + ) : null +} + resource "aws_s3_bucket" "mimir_data" { count = local.enable_mimir ? 1 : 0 bucket = "${local.cluster_name}-mimir-data-${var.observability_suffix}" - force_destroy = "true" + force_destroy = false + + lifecycle { + prevent_destroy = true + } +} + +resource "aws_s3_bucket_public_access_block" "mimir_public_access_block" { + count = local.enable_mimir ? 
1 : 0 + bucket = aws_s3_bucket.mimir_data[0].id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "mimir_data_encryption" { + count = local.enable_mimir ? 1 : 0 + bucket = aws_s3_bucket.mimir_data[0].id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + } + bucket_key_enabled = true + } } resource "random_password" "mimir_basic_auth_username" { @@ -56,56 +133,6 @@ resource "kubernetes_secret" "mimir-aws-credentials" { } -data "template_file" "mimir_template" { - count = local.enable_mimir ? 1 : 0 - template = file("${path.module}/templates/mimir-values.yaml") - vars = { - BUCKET_NAME = aws_s3_bucket.mimir_data[0].id - cluster_name = local.cluster_name - app_region = var.app_region - "AWS_SECRET" = var.access_secret - "AWS_KEY" = var.access_key - limits_ingestion_rate = try(var.mimir.limits.ingestion_rate != null ? var.mimir.limits.ingestion_rate : "250000", "250000") - limits_ingestion_burst_size = try(var.mimir.limits.ingestion_burst_size != null ? var.mimir.limits.ingestion_burst_size : "500000", "500000") - limits_max_fetched_chunks_per_query = try(var.mimir.limits.max_fetched_chunks_per_query != null ? var.mimir.limits.max_fetched_chunks_per_query : "3000000", "3000000") - limits_max_cache_freshness = try(var.mimir.limits.max_cache_freshness != null ? var.mimir.limits.max_cache_freshness : "24h", "24h") - limits_max_outstanding_requests_per_tenant = try(var.mimir.limits.max_outstanding_requests_per_tenant != null ? var.mimir.limits.max_outstanding_requests_per_tenant : "1000", "1000") - compactor_replicas = try(var.mimir.compactor.replicas != null ? var.mimir.compactor.replicas : "1", "1") - compactor_persistence_volume_enable = try(var.mimir.compactor.persistence_volume.enable != null ? 
var.mimir.compactor.persistence_volume.enable : "true", "true") - compactor_persistence_volume_size = try(var.mimir.compactor.persistence_volume.size != null ? var.mimir.compactor.persistence_volume.size : "20Gi", "20Gi") - compactor_min_cpu = try(var.mimir.compactor.min_cpu != null ? var.mimir.compactor.min_cpu : "null", "null") - compactor_min_memory = try(var.mimir.compactor.min_memory != null ? var.mimir.compactor.min_memory : "null", "null") - compactor_max_cpu = try(var.mimir.compactor.max_cpu != null ? var.mimir.compactor.max_cpu : "null", "null") - compactor_max_memory = try(var.mimir.compactor.max_memory != null ? var.mimir.compactor.max_memory : "null", "null") - ingester_replicas = try(var.mimir.ingester.replicas != null ? var.mimir.ingester.replicas : "2", "2") - ingester_persistence_volume_size = try(var.mimir.ingester.persistence_volume.size != null ? var.mimir.ingester.persistence_volume.size : "20Gi", "20Gi") - ingester_min_memory = try(var.mimir.ingester.min_memory != null ? var.mimir.ingester.min_memory : "null", "null") - ingester_min_cpu = try(var.mimir.ingester.min_cpu != null ? var.mimir.ingester.min_cpu : "null", "null") - ingester_max_memory = try(var.mimir.ingester.max_memory != null ? var.mimir.ingester.max_memory : "null", "null") - ingester_max_cpu = try(var.mimir.ingester.max_cpu != null ? var.mimir.ingester.max_cpu : "null", "null") - querier_replicas = try(var.mimir.querier.replicas != null ? var.mimir.querier.replicas : "3", "3") - querier_min_memory = try(var.mimir.querier.min_memory != null ? var.mimir.querier.min_memory : "null", "null") - querier_min_cpu = try(var.mimir.querier.min_cpu != null ? var.mimir.querier.min_cpu : "null", "null") - querier_max_memory = try(var.mimir.querier.max_memory != null ? var.mimir.querier.max_memory : "null", "null") - querier_max_cpu = try(var.mimir.querier.max_cpu != null ? var.mimir.querier.max_cpu : "null", "null") - query_frontend_replicas = try(var.mimir.query_frontend.replicas != null ? 
var.mimir.query_frontend.replicas : "1", "1") - store_gateway_replication_factor = try(var.mimir.store_gateway.replication_factor != null ? var.mimir.store_gateway.replication_factor : "3", "3") - store_gateway_replicas = try(var.mimir.store_gateway.replicas != null ? var.mimir.store_gateway.replicas : "1", "1") - store_gateway_persistence_volume_size = try(var.mimir.store_gateway.persistence_volume.size != null ? var.mimir.store_gateway.persistence_volume.size : "500Gi", "500Gi") - store_gateway_min_memory = try(var.mimir.store_gateway.min_memory != null ? var.mimir.store_gateway.min_memory : "null", "null") - store_gateway_min_cpu = try(var.mimir.store_gateway.min_cpu != null ? var.mimir.store_gateway.min_cpu : "null", "null") - store_gateway_max_memory = try(var.mimir.store_gateway.max_memory != null ? var.mimir.store_gateway.max_memory : "null", "null") - store_gateway_max_cpu = try(var.mimir.store_gateway.max_cpu != null ? var.mimir.store_gateway.max_cpu : "null", "null") - distributor_replicas = try(var.mimir.distributor.replicas != null ? var.mimir.distributor.replicas : "1", "1") - distributor_min_memory = try(var.mimir.distributor.min_memory != null ? var.mimir.distributor.min_memory : "null", "null") - distributor_min_cpu = try(var.mimir.distributor.min_cpu != null ? var.mimir.distributor.min_cpu : "null", "null") - distributor_max_memory = try(var.mimir.distributor.max_memory != null ? var.mimir.distributor.max_memory : "null", "null") - distributor_max_cpu = try(var.mimir.distributor.max_cpu != null ? var.mimir.distributor.max_cpu : "null", "null") - mimir_basic_auth_username = random_password.mimir_basic_auth_username[0].result - mimir_basic_auth_password = random_password.mimir_basic_auth_password[0].result - } -} - resource "helm_release" "mimir" { count = local.enable_mimir ? 
1 : 0 name = "mimir" @@ -114,7 +141,7 @@ resource "helm_release" "mimir" { namespace = kubernetes_namespace.app_environments["mimir"].metadata[0].name version = "5.1.3" values = [ - data.template_file.mimir_template[0].rendered + local.mimir_template ] depends_on = [ diff --git a/observability/aws/openobserve.tf b/observability/aws/openobserve.tf new file mode 100644 index 00000000..35c8881c --- /dev/null +++ b/observability/aws/openobserve.tf @@ -0,0 +1,96 @@ +# Create S3 bucket for OpenObserve data storage (auto-generated bucket names) +resource "aws_s3_bucket" "openobserve_data" { + for_each = local.enable_openobserve ? { for instance in var.openobserve : instance.name => instance if instance.enable } : {} + + bucket = "${local.cluster_name}-openobserve-${each.value.name}-${var.observability_suffix}" + force_destroy = false + + lifecycle { + prevent_destroy = true + } +} + +resource "aws_s3_bucket_public_access_block" "openobserve_public_access_block" { + for_each = aws_s3_bucket.openobserve_data + + bucket = each.value.id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "openobserve_data_encryption" { + for_each = aws_s3_bucket.openobserve_data + bucket = each.value.id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + } + bucket_key_enabled = true + } +} + +# Generate random password for OpenObserve +resource "random_password" "openobserve_password" { + for_each = local.enable_openobserve ? { for instance in var.openobserve : instance.name => instance if instance.enable } : {} + + length = 16 + special = true + upper = true + lower = true + numeric = true +} + +# Create template for OpenObserve values +locals { + openobserve_templates = local.enable_openobserve ? 
{ + for instance in var.openobserve : + instance.name => templatefile( + "${path.module}/templates/openobserve-values.yaml", + { + replica_count = try(instance.replicaCount, 2) + cpu_request = try(instance.min_cpu, "250m") + memory_request = try(instance.min_memory, "1Gi") + cpu_limit = try(instance.max_cpu, "1") + memory_limit = try(instance.max_memory, "2Gi") + storage_provider = "s3" + storage_region = var.app_region + storage_bucket_name = aws_s3_bucket.openobserve_data[instance.name].id + aws_access_key = var.access_key + aws_secret_key = var.access_secret + root_user_email = "admin@zop.dev" + root_user_password = random_password.openobserve_password[instance.name].result + + additional_env_vars = length(try(instance.env, [])) > 0 ? join("\n", + [ + for env in instance.env : + " - name: ${env.name}\n value: \"${env.value}\"" + ]) : "" + } + ) + if instance.enable + } : {} +} + +# Deploy OpenObserve using Helm +resource "helm_release" "openobserve" { + for_each = local.enable_openobserve ? { for instance in var.openobserve : instance.name => instance if instance.enable } : {} + + name = each.value.name + repository = "https://helm.zop.dev" + chart = "openobserve-standalone" + version = "v1.0.0" + namespace = kubernetes_namespace.app_environments["openobserve"].metadata[0].name + + values = [ + local.openobserve_templates[each.key] + ] + + depends_on = [ + aws_s3_bucket.openobserve_data, + kubernetes_namespace.app_environments, + ] +} diff --git a/observability/aws/outputs.tf b/observability/aws/outputs.tf index 360288d1..311b13d6 100644 --- a/observability/aws/outputs.tf +++ b/observability/aws/outputs.tf @@ -26,3 +26,16 @@ output "cortex_host_url" { value = local.enable_cortex ? (local.enable_ingress_cortex ? 
kubernetes_ingress_v1.service_ingress["cortex-distributor:8080-cortex"].spec[0].rule[0].host : "cortex-distributor.cortex:8080") : "" } +output "openobserve_instances" { + description = "OpenObserve instances with URL, username, and password grouped together" + value = local.enable_openobserve ? { + for instance in var.openobserve : instance.name => { + name = instance.name + url = try(instance.enable_ingress, true) ? try(kubernetes_ingress_v1.service_ingress["${instance.name}-openobserve-standalone:5080-openobserve"].spec[0].rule[0].host, "${instance.name}.openobserve:5080") : "${instance.name}.openobserve:5080" + username = "admin@zop.dev" + password = random_password.openobserve_password[instance.name].result + } if instance.enable + } : {} + sensitive = true +} + diff --git a/observability/aws/templates/openobserve-values.yaml b/observability/aws/templates/openobserve-values.yaml new file mode 100644 index 00000000..312b6d00 --- /dev/null +++ b/observability/aws/templates/openobserve-values.yaml @@ -0,0 +1,55 @@ +replicaCount: ${replica_count} + +resources: + requests: + cpu: ${cpu_request} + memory: ${memory_request} + limits: + cpu: ${cpu_limit} + memory: ${memory_limit} + +auth: + ZO_ROOT_USER_EMAIL: "${root_user_email}" + ZO_ROOT_USER_PASSWORD: "${root_user_password}" + ZO_ROOT_USER_TOKEN: "" + ZO_S3_ACCESS_KEY: "${aws_access_key}" + ZO_S3_SECRET_KEY: "${aws_secret_key}" + +config: + ZO_S3_PROVIDER: "s3" + ZO_S3_REGION_NAME: "${storage_region}" + ZO_S3_BUCKET_NAME: "${storage_bucket_name}" + ZO_S3_BUCKET_PREFIX: "" + ZO_S3_FEATURE_FORCE_HOSTED_STYLE: "false" + ZO_S3_FEATURE_HTTP1_ONLY: "false" + ZO_S3_FEATURE_HTTP2_ONLY: "false" + ZO_S3_ALLOW_INVALID_CERTIFICATES: "false" + ZO_S3_CONNECT_TIMEOUT: "10" + ZO_S3_REQUEST_TIMEOUT: "3600" + ZO_S3_MAX_RETRIES: "10" + ZO_S3_SYNC_TO_CACHE_INTERVAL: "600" + ZO_LOCAL_MODE_STORAGE: "s3" + ZO_MEM_PERSIST_INTERVAL: "20" + ZO_FILE_PUSH_INTERVAL: "10" + ZO_MAX_FILE_SIZE_ON_DISK: "128" + ZO_MAX_FILE_RETENTION_TIME: "30" + 
ZO_COMPACT_MAX_FILE_SIZE: "512" + ZO_MEM_TABLE_MAX_SIZE: "256" + ZO_MEMORY_CACHE_DATAFUSION_MAX_SIZE: "0" + ZO_MEMORY_CACHE_GC_INTERVAL: "60" + ZO_MEMORY_CACHE_MAX_SIZE: "0" + +persistence: + enabled: false + +probes: + enabled: true + path: /healthz + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 3 + +extraEnv: +%{~ if additional_env_vars != "" ~} +${additional_env_vars} +%{~ endif ~} diff --git a/observability/aws/tempo.tf b/observability/aws/tempo.tf index 0f74cb96..f493276d 100644 --- a/observability/aws/tempo.tf +++ b/observability/aws/tempo.tf @@ -7,54 +7,82 @@ locals { value = remote.header.value } ], {}) + + tempo_template = local.enable_tempo ? templatefile( + "${path.module}/templates/tempo-values.yaml", + { + BUCKET_NAME = aws_s3_bucket.tempo_data[0].id + AWS_SECRET = var.access_secret + AWS_KEY = var.access_key + app_region = var.app_region + + ingester_replicas = try(var.tempo.ingester.replicas, "1") + ingester_min_memory = try(var.tempo.ingester.min_memory, "1Gi") + ingester_max_memory = try(var.tempo.ingester.max_memory, "null") + ingester_min_cpu = try(var.tempo.ingester.min_cpu, "null") + ingester_max_cpu = try(var.tempo.ingester.max_cpu, "null") + ingester_autoscaling = try(var.tempo.ingester.autoscaling, "true") + ingester_min_replicas = try(var.tempo.ingester.min_replicas, "2") + ingester_max_replicas = try(var.tempo.ingester.max_replicas, "30") + ingester_memory_utilization = try(var.tempo.ingester.memory_utilization, "") + ingester_cpu_utilization = try(var.tempo.ingester.cpu_utilization, "") + + distributor_replicas = try(var.tempo.distributor.replicas, "1") + distributor_min_memory = try(var.tempo.distributor.min_memory, "750Mi") + distributor_max_memory = try(var.tempo.distributor.max_memory, "null") + distributor_min_cpu = try(var.tempo.distributor.min_cpu, "null") + distributor_max_cpu = try(var.tempo.distributor.max_cpu, "null") + distributor_autoscaling = try(var.tempo.distributor.autoscaling, "true") + 
distributor_min_replicas = try(var.tempo.distributor.min_replicas, "2") + distributor_max_replicas = try(var.tempo.distributor.max_replicas, "30") + distributor_memory_utilization = try(var.tempo.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.tempo.distributor.cpu_utilization, "") + + querier_replicas = try(var.tempo.querier.replicas, "1") + queryFrontend_replicas = try(var.tempo.queryFrontend.replicas, "1") + + metrics_generator_enable = try(var.tempo.metrics_generator.enable, false) + metrics_generator_replicas = try(var.tempo.metrics_generator.replicas, "1") + metrics_generator_service_graphs_max_items = try(var.tempo.metrics_generator.service_graphs_max_items, "30000") + metrics_generator_service_graphs_wait = try(var.tempo.metrics_generator.service_graphs_wait, "30s") + metrics_generator_remote_write_flush_deadline = try(var.tempo.metrics_generator.remote_write_flush_deadline, "2m") + metrics_generator_remote_write = jsonencode(local.remote_write_config) + metrics_generator_metrics_ingestion_time_range_slack = try(var.tempo.metrics_generator.metrics_ingestion_time_range_slack, "40s") + } + ) : "" } resource "aws_s3_bucket" "tempo_data" { count = local.enable_tempo ? 1 : 0 bucket = "${local.cluster_name}-tempo-data-${var.observability_suffix}" - force_destroy = "true" -} + force_destroy = false -data "template_file" "tempo_template"{ - count = local.enable_tempo ? 1 : 0 - template = file("${path.module}/templates/tempo-values.yaml") - vars = { - BUCKET_NAME = aws_s3_bucket.tempo_data[0].id - AWS_SECRET = var.access_secret - AWS_KEY = var.access_key - app_region = var.app_region - ingester_replicas = try(var.tempo.ingester.replicas != null ? var.tempo.ingester.replicas : "1", "1") - ingester_min_memory = try(var.tempo.ingester.min_memory != null ? var.tempo.ingester.min_memory : "1Gi", "1Gi") - ingester_max_memory = try(var.tempo.ingester.max_memory != null ? 
var.tempo.ingester.max_memory : "null", "null") - ingester_min_cpu = try(var.tempo.ingester.min_cpu != null ? var.tempo.ingester.min_cpu : "null", "null") - ingester_max_cpu = try(var.tempo.ingester.max_cpu != null ? var.tempo.ingester.max_cpu : "null", "null") - ingester_autoscaling = try(var.tempo.ingester.autoscaling != null ? var.tempo.ingester.autoscaling : "true", "true") - ingester_min_replicas = try(var.tempo.ingester.min_replicas != null ? var.tempo.ingester.min_replicas : "2", "2") - ingester_max_replicas = try(var.tempo.ingester.max_replicas != null ? var.tempo.ingester.max_replicas : "30", "30") - ingester_memory_utilization = try(var.tempo.ingester.memory_utilization != null ? var.tempo.ingester.memory_utilization : "", "") - ingester_cpu_utilization = try(var.tempo.ingester.cpu_utilization != null ? var.tempo.ingester.cpu_utilization : "", "") - distributor_replicas = try(var.tempo.distributor.replicas != null ? var.tempo.distributor.replicas : "1", "1") - distributor_min_memory = try(var.tempo.distributor.min_memory != null ? var.tempo.distributor.min_memory : "750Mi", "750Mi") - distributor_max_memory = try(var.tempo.distributor.max_memory != null ? var.tempo.distributor.max_memory : "null", "null") - distributor_min_cpu = try(var.tempo.distributor.min_cpu != null ? var.tempo.distributor.min_cpu : "null", "null") - distributor_max_cpu = try(var.tempo.distributor.max_cpu != null ? var.tempo.distributor.max_cpu : "null", "null") - distributor_autoscaling = try(var.tempo.distributor.autoscaling != null ? var.tempo.distributor.autoscaling : "true", "true") - distributor_min_replicas = try(var.tempo.distributor.min_replicas != null ? var.tempo.distributor.min_replicas : "2", "2") - distributor_max_replicas = try(var.tempo.distributor.max_replicas != null ? var.tempo.distributor.max_replicas : "30", "30") - distributor_memory_utilization = try(var.tempo.distributor.memory_utilization != null ? 
var.tempo.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.tempo.distributor.cpu_utilization != null ? var.tempo.distributor.cpu_utilization : "","") - querier_replicas = try(var.tempo.querier.replicas != null ? var.tempo.querier.replicas : "1", "1") - queryFrontend_replicas = try(var.tempo.queryFrontend.replicas != null ? var.tempo.queryFrontend.replicas : "1", "1") - metrics_generator_enable = try(var.tempo.metrics_generator.enable != null ? var.tempo.metrics_generator.enable : false, false) - metrics_generator_replicas = try(var.tempo.metrics_generator.replicas != null ? var.tempo.metrics_generator.replicas : "1", "1") - metrics_generator_service_graphs_max_items = try(var.tempo.metrics_generator.service_graphs_max_items != null ? var.tempo.metrics_generator.service_graphs_max_items : "30000", "30000") - metrics_generator_service_graphs_wait = try(var.tempo.metrics_generator.service_graphs_wait != null ? var.tempo.metrics_generator.service_graphs_wait : "30s", "30s") - metrics_generator_remote_write_flush_deadline = try(var.tempo.metrics_generator.remote_write_flush_deadline != null ? var.tempo.metrics_generator.remote_write_flush_deadline : "2m", "2m") - metrics_generator_remote_write = jsonencode(local.remote_write_config) - metrics_generator_metrics_ingestion_time_range_slack = try(var.tempo.metrics_generator.metrics_ingestion_time_range_slack != null ? var.tempo.metrics_generator.metrics_ingestion_time_range_slack : "40s", "40s") + lifecycle { + prevent_destroy = true } } +resource "aws_s3_bucket_public_access_block" "tempo_public_access_block" { + count = local.enable_tempo ? 1 : 0 + bucket = aws_s3_bucket.tempo_data[0].id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "tempo_data_encryption" { + count = local.enable_tempo ? 
1 : 0 + bucket = aws_s3_bucket.tempo_data[0].id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + } + bucket_key_enabled = true + } +} resource "helm_release" "tempo" { count = local.enable_tempo ? 1 : 0 @@ -65,6 +93,6 @@ resource "helm_release" "tempo" { version = "1.10.0" values = [ - data.template_file.tempo_template[0].rendered + local.tempo_template ] } \ No newline at end of file diff --git a/observability/aws/vars.tf b/observability/aws/vars.tf index a1769d7b..526165fe 100644 --- a/observability/aws/vars.tf +++ b/observability/aws/vars.tf @@ -374,3 +374,22 @@ variable "mimir" { })) }) } + +variable "openobserve" { + description = "List of OpenObserve instances to deploy" + type = list(object({ + enable = bool + name = string + replicaCount = optional(number, 2) + min_cpu = optional(string, "250m") + max_cpu = optional(string, "1") + min_memory = optional(string, "1Gi") + max_memory = optional(string, "2Gi") + enable_ingress = optional(bool, true) + env = optional(list(object({ + name = string + value = string + })), []) + })) + default = [] +} diff --git a/observability/azure/cortex.tf b/observability/azure/cortex.tf index 31f9d534..9cc50e50 100644 --- a/observability/azure/cortex.tf +++ b/observability/azure/cortex.tf @@ -1,3 +1,83 @@ +locals { + cortex_values = local.enable_cortex ? 
templatefile("${path.module}/templates/cortex-values.yaml", { + CONTAINER = azurerm_storage_container.cortex_container[0].name + ACCOUNT_NAME = var.storage_account + ACCOUNT_KEY = var.account_access_key + cluster_name = local.cluster_name + app_region = var.app_region + + # Limits + limits_ingestion_rate = try(var.cortex.limits.ingestion_rate, "250000") + limits_ingestion_burst_size = try(var.cortex.limits.ingestion_burst_size, "500000") + limits_max_series_per_metric = try(var.cortex.limits.max_series_per_metric, "0") + limits_max_series_per_user = try(var.cortex.limits.max_series_per_user, "0") + limits_max_fetched_chunks_per_query = try(var.cortex.limits.max_fetched_chunks_per_query, "3000000") + query_range_memcached_client_timeout = try(var.cortex.query_range.memcached_client_timeout, "30s") + + # Compactor + compactor_enable = try(var.cortex.compactor.enable, "true") + compactor_replicas = try(var.cortex.compactor.replicas, "1") + compactor_persistence_volume_enable = try(var.cortex.compactor.persistence_volume.enable, "true") + compactor_persistence_volume_size = try(var.cortex.compactor.persistence_volume.size, "20Gi") + compactor_min_cpu = try(var.cortex.compactor.min_cpu, null) + compactor_min_memory = try(var.cortex.compactor.min_memory, null) + compactor_max_cpu = try(var.cortex.compactor.max_cpu, null) + compactor_max_memory = try(var.cortex.compactor.max_memory, null) + + # Ingester + ingester_replicas = try(var.cortex.ingester.replicas, "1") + ingester_persistence_volume_size = try(var.cortex.ingester.persistence_volume.size, "20Gi") + ingester_min_memory = try(var.cortex.ingester.min_memory, null) + ingester_min_cpu = try(var.cortex.ingester.min_cpu, null) + ingester_max_memory = try(var.cortex.ingester.max_memory, null) + ingester_max_cpu = try(var.cortex.ingester.max_cpu, null) + ingester_autoscaling = try(var.cortex.ingester.autoscaling, "true") + ingester_min_replicas = try(var.cortex.ingester.min_replicas, "2") + ingester_max_replicas = 
try(var.cortex.ingester.max_replicas, "100") + ingester_memory_utilization = try(var.cortex.ingester.memory_utilization, "") + ingester_cpu_utilization = try(var.cortex.ingester.cpu_utilization, "") + + # Querier + querier_replicas = try(var.cortex.querier.replicas, "1") + querier_min_memory = try(var.cortex.querier.min_memory, null) + querier_min_cpu = try(var.cortex.querier.min_cpu, null) + querier_max_memory = try(var.cortex.querier.max_memory, null) + querier_max_cpu = try(var.cortex.querier.max_cpu, null) + querier_autoscaling = try(var.cortex.querier.autoscaling, "true") + querier_min_replicas = try(var.cortex.querier.min_replicas, "2") + querier_max_replicas = try(var.cortex.querier.max_replicas, "20") + querier_memory_utilization = try(var.cortex.querier.memory_utilization, "") + querier_cpu_utilization = try(var.cortex.querier.cpu_utilization, "") + + # Query Frontend + query_frontend_replicas = try(var.cortex.query_frontend.replicas, "4") + query_frontend_enable = try(var.cortex.query_frontend.enable, "true") + + # Store Gateway + store_gateway_replication_factor = try(var.cortex.store_gateway.replication_factor, "3") + store_gateway_replicas = try(var.cortex.store_gateway.replicas, "1") + store_gateway_persistence_volume_size = try(var.cortex.store_gateway.persistence_volume.size, "500Gi") + store_gateway_min_memory = try(var.cortex.store_gateway.min_memory, null) + store_gateway_min_cpu = try(var.cortex.store_gateway.min_cpu, null) + store_gateway_max_memory = try(var.cortex.store_gateway.max_memory, null) + store_gateway_max_cpu = try(var.cortex.store_gateway.max_cpu, null) + + # Distributor + distributor_replicas = try(var.cortex.distributor.replicas, "1") + distributor_min_memory = try(var.cortex.distributor.min_memory, null) + distributor_min_cpu = try(var.cortex.distributor.min_cpu, null) + distributor_max_memory = try(var.cortex.distributor.max_memory, null) + distributor_max_cpu = try(var.cortex.distributor.max_cpu, null) + 
distributor_autoscaling = try(var.cortex.distributor.autoscaling, "true") + distributor_min_replicas = try(var.cortex.distributor.min_replicas, "2") + distributor_max_replicas = try(var.cortex.distributor.max_replicas, "30") + distributor_memory_utilization = try(var.cortex.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.cortex.distributor.cpu_utilization, "") + + # Memcached components can be added similarly if needed... + }) : null +} + resource "azurerm_storage_container" "cortex_container" { count = local.enable_cortex ? 1 : 0 name = "${local.cluster_name}-cortex-container-${var.observability_suffix}" @@ -24,92 +104,6 @@ resource "kubernetes_secret" "cortex-azure-credentials" { } -data "template_file" "cortex_template"{ - count = local.enable_cortex ? 1 : 0 - template = file("${path.module}/templates/cortex-values.yaml") - vars = { - "CONTAINER" = azurerm_storage_container.cortex_container[0].name - "ACCOUNT_NAME" = var.storage_account - "ACCOUNT_KEY" = var.account_access_key - cluster_name = local.cluster_name - app_region = var.app_region - limits_ingestion_rate = try(var.cortex.limits.ingestion_rate != null ? var.cortex.limits.ingestion_rate : "250000", "250000") - limits_ingestion_burst_size = try(var.cortex.limits.ingestion_burst_size != null ? var.cortex.limits.ingestion_burst_size : "500000", "500000") - limits_max_series_per_metric = try(var.cortex.limits.max_series_per_metric != null ? var.cortex.limits.max_series_per_metric : "0", "0") - limits_max_series_per_user = try(var.cortex.limits.max_series_per_user != null ? var.cortex.limits.max_series_per_user : "0", "0") - limits_max_fetched_chunks_per_query = try(var.cortex.limits.max_fetched_chunks_per_query != null ? var.cortex.limits.max_fetched_chunks_per_query : "3000000", "3000000") - query_range_memcached_client_timeout = try(var.cortex.query_range.memcached_client_timeout != null ? 
var.cortex.query_range.memcached_client_timeout : "30s", "30s") - compactor_enable = try(var.cortex.compactor.enable != null ? var.cortex.compactor.enable : "true", "true") - compactor_replicas = try(var.cortex.compactor.replicas != null ? var.cortex.compactor.replicas : "1", "1") - compactor_persistence_volume_enable = try(var.cortex.compactor.persistence_volume.enable != null ? var.cortex.compactor.persistence_volume.enable : "true", "true") - compactor_persistence_volume_size = try(var.cortex.compactor.persistence_volume.size != null ? var.cortex.compactor.persistence_volume.size : "20Gi", "20Gi") - compactor_min_cpu = try(var.cortex.compactor.min_cpu != null ? var.cortex.compactor.min_cpu : "null", "null") - compactor_min_memory = try(var.cortex.compactor.min_memory != null ? var.cortex.compactor.min_memory : "null", "null") - compactor_max_cpu = try(var.cortex.compactor.max_cpu != null ? var.cortex.compactor.max_cpu : "null", "null") - compactor_max_memory = try(var.cortex.compactor.max_memory != null ? var.cortex.compactor.max_memory : "null", "null") - ingester_replicas = try(var.cortex.ingester.replicas != null ? var.cortex.ingester.replicas : "1", "1") - ingester_persistence_volume_size = try(var.cortex.ingester.persistence_volume.size != null ? var.cortex.ingester.persistence_volume.size : "20Gi", "20Gi") - ingester_min_memory = try(var.cortex.ingester.min_memory != null ? var.cortex.ingester.min_memory : "null", "null") - ingester_min_cpu = try(var.cortex.ingester.min_cpu != null ? var.cortex.ingester.min_cpu : "null", "null") - ingester_max_memory = try(var.cortex.ingester.max_memory != null ? var.cortex.ingester.max_memory : "null", "null") - ingester_max_cpu = try(var.cortex.ingester.max_cpu != null ? var.cortex.ingester.max_cpu : "null", "null") - ingester_autoscaling = try(var.cortex.ingester.autoscaling != null ? var.cortex.ingester.autoscaling : "true", "true") - ingester_max_replicas = try(var.cortex.ingester.max_replicas != null ? 
var.cortex.ingester.max_replicas : "100", "100") - ingester_min_replicas = try(var.cortex.ingester.min_replicas != null ? var.cortex.ingester.min_replicas : "2", "2") - ingester_memory_utilization = try(var.cortex.ingester.memory_utilization != null ? var.cortex.ingester.memory_utilization : "", "") - querier_replicas = try(var.cortex.querier.replicas != null ? var.cortex.querier.replicas : "1", "1") - querier_min_memory = try(var.cortex.querier.min_memory != null ? var.cortex.querier.min_memory : "null", "null") - querier_min_cpu = try(var.cortex.querier.min_cpu != null ? var.cortex.querier.min_cpu : "null", "null") - querier_max_memory = try(var.cortex.querier.max_memory != null ? var.cortex.querier.max_memory : "null", "null") - querier_max_cpu = try(var.cortex.querier.max_cpu != null ? var.cortex.querier.max_cpu : "null", "null") - querier_autoscaling = try(var.cortex.querier.autoscaling != null ? var.cortex.querier.autoscaling : "true", "true") - querier_max_replicas = try(var.cortex.querier.max_replicas != null ? var.cortex.querier.max_replicas : "20", "20") - querier_min_replicas = try(var.cortex.querier.min_replicas != null ? var.cortex.querier.min_replicas : "2", "2") - querier_memory_utilization = try(var.cortex.querier.memory_utilization != null ? var.cortex.querier.memory_utilization : "", "") - querier_cpu_utilization = try(var.cortex.querier.cpu_utilization != null ? var.cortex.querier.cpu_utilization : "", "") - query_frontend_replicas = try(var.cortex.query_frontend.replicas != null ? var.cortex.query_frontend.replicas : "4", "4") - query_frontend_enable = try(var.cortex.query_frontend.enable != null ? var.cortex.query_frontend.enable : "true", "true") - store_gateway_replication_factor = try(var.cortex.store_gateway.replication_factor != null ? var.cortex.store_gateway.replication_factor : "3", "3") - store_gateway_replicas = try(var.cortex.store_gateway.replicas != null ? 
var.cortex.store_gateway.replicas : "1", "1") - store_gateway_persistence_volume_size = try(var.cortex.store_gateway.persistence_volume.size != null ? var.cortex.store_gateway.persistence_volume.size : "500Gi", "500Gi") - store_gateway_min_memory = try(var.cortex.store_gateway.min_memory != null ? var.cortex.store_gateway.min_memory : "null", "null") - store_gateway_min_cpu = try(var.cortex.store_gateway.min_cpu != null ? var.cortex.store_gateway.min_cpu : "null", "null") - store_gateway_max_memory = try(var.cortex.store_gateway.max_memory != null ? var.cortex.store_gateway.max_memory : "null", "null") - store_gateway_max_cpu = try(var.cortex.store_gateway.max_cpu != null ? var.cortex.store_gateway.max_cpu : "null", "null") - memcached_frontend_enable = try(var.cortex.memcached_frontend.enable != null ? var.cortex.memcached_frontend.enable : "true", "true") - memcached_frontend_min_memory = try(var.cortex.memcached_frontend.min_memory != null ? var.cortex.memcached_frontend.min_memory : "null", "null") - memcached_frontend_min_cpu = try(var.cortex.memcached_frontend.min_cpu != null ? var.cortex.memcached_frontend.min_cpu : "null", "null") - memcached_frontend_max_memory = try(var.cortex.memcached_frontend.max_memory != null ? var.cortex.memcached_frontend.max_memory : "null", "null") - memcached_frontend_max_cpu = try(var.cortex.memcached_frontend.max_cpu != null ? var.cortex.memcached_frontend.max_cpu : "null", "null") - memcached_blocks_index_enable = try(var.cortex.memcached_blocks_index.enable != null ? var.cortex.memcached_blocks_index.enable : "true", "true") - memcached_blocks_index_min_cpu = try(var.cortex.memcached_blocks_index.min_cpu != null ? var.cortex.memcached_blocks_index.min_cpu : "null", "null") - memcached_blocks_index_min_memory = try(var.cortex.memcached_blocks_index.min_memory != null ? var.cortex.memcached_blocks_index.min_memory : "null", "null") - memcached_blocks_index_max_cpu = try(var.cortex.memcached_blocks_index.max_cpu != null ? 
var.cortex.memcached_blocks_index.max_cpu : "null", "null") - memcached_blocks_index_max_memory = try(var.cortex.memcached_blocks_index.max_memory != null ? var.cortex.memcached_blocks_index.max_memory : "null", "null") - memcached_blocks_enable = try(var.cortex.memcached_blocks.enable != null ? var.cortex.memcached_blocks.enable : "true", "true") - memcached_blocks_min_memory = try(var.cortex.memcached_blocks.min_memory != null ? var.cortex.memcached_blocks.min_memory : "null", "null") - memcached_blocks_min_cpu = try(var.cortex.memcached_blocks.min_cpu != null ? var.cortex.memcached_blocks.min_cpu : "null", "null") - memcached_blocks_max_memory = try(var.cortex.memcached_blocks.max_memory != null ? var.cortex.memcached_blocks.max_memory : "null", "null") - memcached_blocks_max_cpu = try(var.cortex.memcached_blocks.max_cpu != null ? var.cortex.memcached_blocks.max_cpu : "null", "null") - memcached_blocks_metadata_enable = try(var.cortex.memcached_blocks_metadata.enable != null ? var.cortex.memcached_blocks_metadata.enable : "true", "true") - memcached_blocks_metadata_min_memory = try(var.cortex.memcached_blocks_metadata.min_memory != null ? var.cortex.memcached_blocks_metadata.min_memory : "null", "null") - memcached_blocks_metadata_min_cpu = try(var.cortex.memcached_blocks_metadata.min_cpu != null ? var.cortex.memcached_blocks_metadata.min_cpu : "null", "null") - memcached_blocks_metadata_max_memory = try(var.cortex.memcached_blocks_metadata.max_memory != null ? var.cortex.memcached_blocks_metadata.max_memory : "null", "null") - memcached_blocks_metadata_max_cpu = try(var.cortex.memcached_blocks_metadata.max_cpu != null ? var.cortex.memcached_blocks_metadata.max_cpu : "null", "null") - distributor_replicas = try(var.cortex.distributor.replicas != null ? var.cortex.distributor.replicas : "1", "1") - distributor_min_memory = try(var.cortex.distributor.min_memory != null ? 
var.cortex.distributor.min_memory : "null", "null") - distributor_min_cpu = try(var.cortex.distributor.min_cpu != null ? var.cortex.distributor.min_cpu : "null", "null") - distributor_max_memory = try(var.cortex.distributor.max_memory != null ? var.cortex.distributor.max_memory : "null", "null") - distributor_max_cpu = try(var.cortex.distributor.max_cpu != null ? var.cortex.distributor.max_cpu : "null", "null") - distributor_autoscaling = try(var.cortex.distributor.autoscaling != null ? var.cortex.distributor.autoscaling : "true", "true") - distributor_max_replicas = try(var.cortex.distributor.max_replicas != null ? var.cortex.distributor.max_replicas : "30", "30") - distributor_min_replicas = try(var.cortex.distributor.min_replicas != null ? var.cortex.distributor.min_replicas : "2", "2") - distributor_memory_utilization = try(var.cortex.distributor.memory_utilization != null ? var.cortex.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.cortex.distributor.cpu_utilization != null ? var.cortex.distributor.cpu_utilization : "", "") - } -} - - resource "helm_release" "cortex" { count = local.enable_cortex ? 1 : 0 name = "cortex" @@ -119,6 +113,6 @@ resource "helm_release" "cortex" { version = "2.1.0" values = [ - data.template_file.cortex_template[0].rendered + local.cortex_values ] } diff --git a/observability/azure/loki.tf b/observability/azure/loki.tf index 2100d7e3..eec7c308 100644 --- a/observability/azure/loki.tf +++ b/observability/azure/loki.tf @@ -1,3 +1,60 @@ +locals { + loki_values = local.enable_loki ? 
templatefile("${path.module}/templates/loki-values.yaml", { + CONTAINER = azurerm_storage_container.loki_container[0].name + STORAGE_ACCOUNT = var.storage_account + ACCOUNT_KEY = var.account_access_key + + # Ingester + ingester_replicas = try(var.loki.ingester.replicas, "1") + ingester_min_memory = try(var.loki.ingester.min_memory, "1Gi") + ingester_max_memory = try(var.loki.ingester.max_memory, null) + ingester_min_cpu = try(var.loki.ingester.min_cpu, null) + ingester_max_cpu = try(var.loki.ingester.max_cpu, null) + ingester_autoscaling = try(var.loki.ingester.autoscaling, "true") + ingester_min_replicas = try(var.loki.ingester.min_replicas, "2") + ingester_max_replicas = try(var.loki.ingester.max_replicas, "30") + ingester_memory_utilization = try(var.loki.ingester.memory_utilization, "") + ingester_cpu_utilization = try(var.loki.ingester.cpu_utilization, "") + + # Distributor + distributor_replicas = try(var.loki.distributor.replicas, "1") + distributor_min_memory = try(var.loki.distributor.min_memory, "512Mi") + distributor_max_memory = try(var.loki.distributor.max_memory, "1Gi") + distributor_min_cpu = try(var.loki.distributor.min_cpu, "250m") + distributor_max_cpu = try(var.loki.distributor.max_cpu, "1") + distributor_autoscaling = try(var.loki.distributor.autoscaling, "true") + distributor_min_replicas = try(var.loki.distributor.min_replicas, "2") + distributor_max_replicas = try(var.loki.distributor.max_replicas, "30") + distributor_memory_utilization = try(var.loki.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.loki.distributor.cpu_utilization, "") + + # Querier + querier_replicas = try(var.loki.querier.replicas, "4") + querier_min_memory = try(var.loki.querier.min_memory, "500Mi") + querier_max_memory = try(var.loki.querier.max_memory, null) + querier_min_cpu = try(var.loki.querier.min_cpu, "100m") + querier_max_cpu = try(var.loki.querier.max_cpu, null) + querier_autoscaling = try(var.loki.querier.autoscaling, "true") + 
querier_min_replicas = try(var.loki.querier.min_replicas, "2") + querier_max_replicas = try(var.loki.querier.max_replicas, "6") + querier_memory_utilization = try(var.loki.querier.memory_utilization, "") + querier_cpu_utilization = try(var.loki.querier.cpu_utilization, "") + querier_max_unavailable = try(var.loki.querier.max_unavailable, "1") + + # Query Frontend + queryFrontend_replicas = try(var.loki.queryFrontend.replicas, "1") + queryFrontend_min_memory = try(var.loki.queryFrontend.min_memory, "250Mi") + queryFrontend_max_memory = try(var.loki.queryFrontend.max_memory, null) + queryFrontend_min_cpu = try(var.loki.queryFrontend.min_cpu, null) + queryFrontend_max_cpu = try(var.loki.queryFrontend.max_cpu, null) + queryFrontend_autoscaling = try(var.loki.queryFrontend.autoscaling, "true") + queryFrontend_min_replicas = try(var.loki.queryFrontend.min_replicas, "1") + queryFrontend_max_replicas = try(var.loki.queryFrontend.max_replicas, "6") + queryFrontend_memory_utilization = try(var.loki.queryFrontend.memory_utilization, "") + queryFrontend_cpu_utilization = try(var.loki.queryFrontend.cpu_utilization, "") + }) : null +} + resource "azurerm_storage_container" "loki_container" { count = local.enable_loki ? 1 : 0 name = "${local.cluster_name}-loki-container-${var.observability_suffix}" @@ -5,58 +62,6 @@ resource "azurerm_storage_container" "loki_container" { container_access_type = "private" } - -data "template_file" "loki_template" { - count = local.enable_loki ? 1 : 0 - template = file("${path.module}/templates/loki-values.yaml") - vars = { - "CONTAINER" = azurerm_storage_container.loki_container[0].name - "STORAGE_ACCOUNT" = var.storage_account - "ACCOUNT_KEY" = var.account_access_key - ingester_replicas = try(var.loki.ingester.replicas != null ? var.loki.ingester.replicas : "1", "1") - ingester_max_memory = try(var.loki.ingester.max_memory != null ? var.loki.ingester.max_memory : "null", "null") - ingester_min_memory = try(var.loki.ingester.min_memory != null ? 
var.loki.ingester.min_memory : "1Gi", "1Gi") - ingester_max_cpu = try(var.loki.ingester.max_cpu != null ? var.loki.ingester.max_cpu : "null", "null") - ingester_min_cpu = try(var.loki.ingester.min_cpu != null ? var.loki.ingester.min_cpu : "null", "null") - ingester_autoscaling = try(var.loki.ingester.autoscaling != null ? var.loki.ingester.autoscaling : "true", "true") - ingester_max_replicas = try(var.loki.ingester.max_replicas != null ? var.loki.ingester.max_replicas : "30", "30") - ingester_min_replicas = try(var.loki.ingester.min_replicas != null ? var.loki.ingester.min_replicas : "2", "2") - ingester_cpu_utilization = try(var.loki.ingester.cpu_utilization != null ? var.loki.ingester.cpu_utilization : "", "") - ingester_memory_utilization = try(var.loki.ingester.memory_utilization != null ? var.loki.ingester.memory_utilization : "", "") - distributor_replicas = try(var.loki.distributor.replicas != null ? var.loki.distributor.replicas : "1", "1") - distributor_max_memory = try(var.loki.distributor.max_memory != null ? var.loki.distributor.max_memory : "1Gi", "1Gi") - distributor_min_memory = try(var.loki.distributor.min_memory != null ? var.loki.distributor.min_memory : "512Mi", "512Mi") - distributor_max_cpu = try(var.loki.distributor.max_cpu != null ? var.loki.distributor.max_cpu : "1", "1") - distributor_min_cpu = try(var.loki.distributor.min_cpu != null ? var.loki.distributor.min_cpu : "250m", "250m") - distributor_autoscaling = try(var.loki.distributor.autoscaling != null ? var.loki.distributor.autoscaling : "true", "true") - distributor_max_replicas = try(var.loki.distributor.max_replicas != null ? var.loki.distributor.max_replicas : "30", "30") - distributor_min_replicas = try(var.loki.distributor.min_replicas != null ? var.loki.distributor.min_replicas : "2", "2") - distributor_memory_utilization = try(var.loki.distributor.memory_utilization != null ? 
var.loki.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.loki.distributor.cpu_utilization != null ? var.loki.distributor.cpu_utilization : "", "") - querier_replicas = try(var.loki.querier.replicas != null ? var.loki.querier.replicas : "4", "4") - querier_max_unavailable = try(var.loki.querier.max_unavailable != null ? var.loki.querier.max_unavailable : "1", "1") - querier_min_memory = try(var.loki.querier.min_memory != null ? var.loki.querier.min_memory : "500Mi", "500Mi") - querier_min_cpu = try(var.loki.querier.min_cpu != null ? var.loki.querier.min_cpu : "100m", "100m") - querier_max_memory = try(var.loki.querier.max_memory != null ? var.loki.querier.max_memory : "null", "null") - querier_max_cpu = try(var.loki.querier.max_cpu != null ? var.loki.querier.max_cpu : "null", "null") - querier_autoscaling = try(var.loki.querier.autoscaling != null ? var.loki.querier.autoscaling : "true", "true") - querier_max_replicas = try(var.loki.querier.max_replicas != null ? var.loki.querier.max_replicas : "6", "6") - querier_min_replicas = try(var.loki.querier.min_replicas != null ? var.loki.querier.min_replicas : "2", "2") - querier_memory_utilization = try(var.loki.querier.memory_utilization != null ? var.loki.querier.memory_utilization : "", "") - querier_cpu_utilization = try(var.loki.querier.cpu_utilization != null ? var.loki.querier.cpu_utilization : "", "") - queryFrontend_replicas = try(var.loki.queryFrontend.replicas != null ? var.loki.queryFrontend.replicas : "1", "1") - queryFrontend_min_memory = try(var.loki.queryFrontend.min_memory != null ? var.loki.queryFrontend.min_memory : "250Mi", "250Mi") - queryFrontend_max_memory = try(var.loki.query_frontend.max_memory != null ? var.loki.query_frontend.max_memory : "null", "null") - queryFrontend_min_cpu = try(var.loki.query_frontend.min_cpu != null ? var.loki.query_frontend.min_cpu : "null", "null") - queryFrontend_max_cpu = try(var.loki.query_frontend.max_cpu != null ? 
var.loki.query_frontend.max_cpu : "null", "null") - queryFrontend_autoscaling = try(var.loki.queryFrontend.autoscaling != null ? var.loki.queryFrontend.autoscaling : "true", "true") - queryFrontend_max_replicas = try(var.loki.queryFrontend.max_replicas != null ? var.loki.queryFrontend.max_replicas : "6", "6") - queryFrontend_min_replicas = try(var.loki.queryFrontend.min_replicas != null ? var.loki.queryFrontend.min_replicas : "1", "1") - queryFrontend_memory_utilization= try(var.loki.queryFrontend.memory_utilization != null ? var.loki.queryFrontend.memory_utilization : "", "") - queryFrontend_cpu_utilization= try(var.loki.queryFrontend.cpu_utilization != null ? var.loki.queryFrontend.cpu_utilization : "", "") - } -} - resource "helm_release" "loki" { count = local.enable_loki ? 1 : 0 name = "loki" @@ -66,6 +71,6 @@ resource "helm_release" "loki" { version = "0.68.0" values = [ - data.template_file.loki_template[0].rendered + local.loki_values ] } \ No newline at end of file diff --git a/observability/azure/main.tf b/observability/azure/main.tf index 53888bfd..525a745a 100644 --- a/observability/azure/main.tf +++ b/observability/azure/main.tf @@ -7,11 +7,13 @@ locals { enable_tempo = try(var.tempo != null ? var.tempo.enable : false, false) enable_cortex = try(var.cortex != null ? var.cortex.enable : false, false) enable_mimir = try(var.mimir != null ? var.mimir.enable : false,false) + enable_openobserve = length(var.openobserve) > 0 && anytrue([for instance in var.openobserve : instance.enable]) enable_ingress_loki = local.enable_loki ? (var.loki.enable_ingress != null ? var.loki.enable_ingress : false ) : false enable_ingress_tempo = local.enable_tempo ? (var.tempo.enable_ingress != null ? var.tempo.enable_ingress : false ) : false enable_ingress_mimir = local.enable_mimir ? (var.mimir.enable_ingress != null ? var.mimir.enable_ingress : false ) : false enable_ingress_cortex = local.enable_cortex ? (var.cortex.enable_ingress != null ? 
var.cortex.enable_ingress : false ) : false + enable_ingress_openobserve = local.enable_openobserve ? anytrue([for instance in var.openobserve : instance.enable && try(instance.enable_ingress, true)]) : false app_namespaces = { loki = local.enable_loki ? { @@ -30,6 +32,10 @@ locals { services = ["mimir-distributor:8080"] ingress = local.enable_ingress_mimir } : null + openobserve = local.enable_openobserve ? { + services = [for instance in var.openobserve : "${instance.name}-openobserve-standalone:5080" if instance.enable && instance.enable_ingress != false] + ingress = local.enable_ingress_openobserve + } : null } } diff --git a/observability/azure/mimir.tf b/observability/azure/mimir.tf index e300e161..bf468435 100644 --- a/observability/azure/mimir.tf +++ b/observability/azure/mimir.tf @@ -58,56 +58,72 @@ resource "kubernetes_secret" "mimir-azure-credentials" { } -data "template_file" "mimir_template" { - count = local.enable_mimir ? 1 : 0 - template = file("${path.module}/templates/mimir-values.yaml") - vars = { - "CONTAINER" = azurerm_storage_container.mimir_container[0].name - "ACCOUNT_NAME" = var.storage_account - "ACCOUNT_KEY" = var.account_access_key - cluster_name = local.cluster_name - app_region = var.app_region - limits_ingestion_rate = try(var.mimir.limits.ingestion_rate != null ? var.mimir.limits.ingestion_rate : "250000", "250000") - limits_ingestion_burst_size = try(var.mimir.limits.ingestion_burst_size != null ? var.mimir.limits.ingestion_burst_size : "500000", "500000") - limits_max_fetched_chunks_per_query = try(var.mimir.limits.max_fetched_chunks_per_query != null ? var.mimir.limits.max_fetched_chunks_per_query : "3000000", "3000000") - limits_max_cache_freshness = try(var.mimir.limits.max_cache_freshness != null ? var.mimir.limits.max_cache_freshness : "24h", "24h") - limits_max_outstanding_requests_per_tenant = try(var.mimir.limits.max_outstanding_requests_per_tenant != null ? 
var.mimir.limits.max_outstanding_requests_per_tenant : "1000", "1000") - compactor_replicas = try(var.mimir.compactor.replicas != null ? var.mimir.compactor.replicas : "1", "1") - compactor_persistence_volume_enable = try(var.mimir.compactor.persistence_volume.enable != null ? var.mimir.compactor.persistence_volume.enable : "true", "true") - compactor_persistence_volume_size = try(var.mimir.compactor.persistence_volume.size != null ? var.mimir.compactor.persistence_volume.size : "20Gi", "20Gi") - compactor_min_cpu = try(var.mimir.compactor.min_cpu != null ? var.mimir.compactor.min_cpu : "null", "null") - compactor_min_memory = try(var.mimir.compactor.min_memory != null ? var.mimir.compactor.min_memory : "null", "null") - compactor_max_cpu = try(var.mimir.compactor.max_cpu != null ? var.mimir.compactor.max_cpu : "null", "null") - compactor_max_memory = try(var.mimir.compactor.max_memory != null ? var.mimir.compactor.max_memory : "null", "null") - ingester_replicas = try(var.mimir.ingester.replicas != null ? var.mimir.ingester.replicas : "2", "2") - ingester_persistence_volume_size = try(var.mimir.ingester.persistence_volume.size != null ? var.mimir.ingester.persistence_volume.size : "20Gi", "20Gi") - ingester_min_memory = try(var.mimir.ingester.min_memory != null ? var.mimir.ingester.min_memory : "null", "null") - ingester_min_cpu = try(var.mimir.ingester.min_cpu != null ? var.mimir.ingester.min_cpu : "null", "null") - ingester_max_memory = try(var.mimir.ingester.max_memory != null ? var.mimir.ingester.max_memory : "null", "null") - ingester_max_cpu = try(var.mimir.ingester.max_cpu != null ? var.mimir.ingester.max_cpu : "null", "null") - querier_replicas = try(var.mimir.querier.replicas != null ? var.mimir.querier.replicas : "3", "3") - querier_min_memory = try(var.mimir.querier.min_memory != null ? var.mimir.querier.min_memory : "null", "null") - querier_min_cpu = try(var.mimir.querier.min_cpu != null ? 
var.mimir.querier.min_cpu : "null", "null") - querier_max_memory = try(var.mimir.querier.max_memory != null ? var.mimir.querier.max_memory : "null", "null") - querier_max_cpu = try(var.mimir.querier.max_cpu != null ? var.mimir.querier.max_cpu : "null", "null") - query_frontend_replicas = try(var.mimir.query_frontend.replicas != null ? var.mimir.query_frontend.replicas : "1", "1") - store_gateway_replication_factor = try(var.mimir.store_gateway.replication_factor != null ? var.mimir.store_gateway.replication_factor : "3", "3") - store_gateway_replicas = try(var.mimir.store_gateway.replicas != null ? var.mimir.store_gateway.replicas : "1", "1") - store_gateway_persistence_volume_size = try(var.mimir.store_gateway.persistence_volume.size != null ? var.mimir.store_gateway.persistence_volume.size : "50Gi", "50Gi") - store_gateway_min_memory = try(var.mimir.store_gateway.min_memory != null ? var.mimir.store_gateway.min_memory : "null", "null") - store_gateway_min_cpu = try(var.mimir.store_gateway.min_cpu != null ? var.mimir.store_gateway.min_cpu : "null", "null") - store_gateway_max_memory = try(var.mimir.store_gateway.max_memory != null ? var.mimir.store_gateway.max_memory : "null", "null") - store_gateway_max_cpu = try(var.mimir.store_gateway.max_cpu != null ? var.mimir.store_gateway.max_cpu : "null", "null") - distributor_replicas = try(var.mimir.distributor.replicas != null ? var.mimir.distributor.replicas : "1", "1") - distributor_min_memory = try(var.mimir.distributor.min_memory != null ? var.mimir.distributor.min_memory : "null", "null") - distributor_min_cpu = try(var.mimir.distributor.min_cpu != null ? var.mimir.distributor.min_cpu : "null", "null") - distributor_max_memory = try(var.mimir.distributor.max_memory != null ? var.mimir.distributor.max_memory : "null", "null") - distributor_max_cpu = try(var.mimir.distributor.max_cpu != null ? 
var.mimir.distributor.max_cpu : "null", "null") - mimir_basic_auth_username = random_password.mimir_basic_auth_username[0].result - mimir_basic_auth_password = random_password.mimir_basic_auth_password[0].result - } +locals { + mimir_values = local.enable_mimir ? templatefile("${path.module}/templates/mimir-values.yaml", { + # Storage config + CONTAINER = azurerm_storage_container.mimir_container[0].name + ACCOUNT_NAME = var.storage_account + ACCOUNT_KEY = var.account_access_key + cluster_name = local.cluster_name + app_region = var.app_region + + # Limits + limits_ingestion_rate = try(var.mimir.limits.ingestion_rate, "250000") + limits_ingestion_burst_size = try(var.mimir.limits.ingestion_burst_size, "500000") + limits_max_fetched_chunks_per_query = try(var.mimir.limits.max_fetched_chunks_per_query, "3000000") + limits_max_cache_freshness = try(var.mimir.limits.max_cache_freshness, "24h") + limits_max_outstanding_requests_per_tenant = try(var.mimir.limits.max_outstanding_requests_per_tenant, "1000") + + # Compactor + compactor_replicas = try(var.mimir.compactor.replicas, "1") + compactor_persistence_volume_enable = try(var.mimir.compactor.persistence_volume.enable, "true") + compactor_persistence_volume_size = try(var.mimir.compactor.persistence_volume.size, "20Gi") + compactor_min_cpu = try(var.mimir.compactor.min_cpu, null) + compactor_min_memory = try(var.mimir.compactor.min_memory, null) + compactor_max_cpu = try(var.mimir.compactor.max_cpu, null) + compactor_max_memory = try(var.mimir.compactor.max_memory, null) + + # Ingester + ingester_replicas = try(var.mimir.ingester.replicas, "2") + ingester_persistence_volume_size = try(var.mimir.ingester.persistence_volume.size, "20Gi") + ingester_min_memory = try(var.mimir.ingester.min_memory, null) + ingester_min_cpu = try(var.mimir.ingester.min_cpu, null) + ingester_max_memory = try(var.mimir.ingester.max_memory, null) + ingester_max_cpu = try(var.mimir.ingester.max_cpu, null) + + # Querier + querier_replicas = 
try(var.mimir.querier.replicas, "3") + querier_min_memory = try(var.mimir.querier.min_memory, null) + querier_min_cpu = try(var.mimir.querier.min_cpu, null) + querier_max_memory = try(var.mimir.querier.max_memory, null) + querier_max_cpu = try(var.mimir.querier.max_cpu, null) + + # Query frontend + query_frontend_replicas = try(var.mimir.query_frontend.replicas, "1") + + # Store Gateway + store_gateway_replication_factor = try(var.mimir.store_gateway.replication_factor, "3") + store_gateway_replicas = try(var.mimir.store_gateway.replicas, "1") + store_gateway_persistence_volume_size = try(var.mimir.store_gateway.persistence_volume.size, "50Gi") + store_gateway_min_memory = try(var.mimir.store_gateway.min_memory, null) + store_gateway_min_cpu = try(var.mimir.store_gateway.min_cpu, null) + store_gateway_max_memory = try(var.mimir.store_gateway.max_memory, null) + store_gateway_max_cpu = try(var.mimir.store_gateway.max_cpu, null) + + # Distributor + distributor_replicas = try(var.mimir.distributor.replicas, "1") + distributor_min_memory = try(var.mimir.distributor.min_memory, null) + distributor_min_cpu = try(var.mimir.distributor.min_cpu, null) + distributor_max_memory = try(var.mimir.distributor.max_memory, null) + distributor_max_cpu = try(var.mimir.distributor.max_cpu, null) + + # Auth + mimir_basic_auth_username = random_password.mimir_basic_auth_username[0].result + mimir_basic_auth_password = random_password.mimir_basic_auth_password[0].result + }) : null } + resource "helm_release" "mimir" { count = local.enable_mimir ? 
1 : 0 name = "mimir" @@ -116,7 +132,7 @@ resource "helm_release" "mimir" { namespace = kubernetes_namespace.app_environments["mimir"].metadata[0].name version = "5.1.3" values = [ - data.template_file.mimir_template[0].rendered + local.mimir_values ] depends_on = [ diff --git a/observability/azure/openobserve.tf b/observability/azure/openobserve.tf new file mode 100644 index 00000000..c80881e6 --- /dev/null +++ b/observability/azure/openobserve.tf @@ -0,0 +1,70 @@ +# Create Azure Storage Container for OpenObserve data storage (auto-generated container names) +resource "azurerm_storage_container" "openobserve_data" { + for_each = local.enable_openobserve ? { for instance in var.openobserve : instance.name => instance if instance.enable } : {} + + name = "${local.cluster_name}-openobserve-${each.value.name}-${var.observability_suffix}" + storage_account_name = var.storage_account + container_access_type = "private" +} + +# Generate random password for OpenObserve +resource "random_password" "openobserve_password" { + for_each = local.enable_openobserve ? { for instance in var.openobserve : instance.name => instance if instance.enable } : {} + + length = 16 + special = true + upper = true + lower = true + numeric = true +} + +# Create template for OpenObserve values +locals { + openobserve_templates = local.enable_openobserve ? 
{ + for instance in var.openobserve : + instance.name => templatefile( + "${path.module}/templates/openobserve-values.yaml", + { + replica_count = try(instance.replicaCount, 2) + cpu_request = try(instance.min_cpu, "250m") + memory_request = try(instance.min_memory, "1Gi") + cpu_limit = try(instance.max_cpu, "1") + memory_limit = try(instance.max_memory, "2Gi") + storage_provider = "azure" + storage_region = "auto" + storage_bucket_name = azurerm_storage_container.openobserve_data[instance.name].name + root_user_email = "admin@zop.dev" + root_user_password = random_password.openobserve_password[instance.name].result + storage_account = var.storage_account + account_key = var.account_access_key + + additional_env_vars = length(try(instance.env, [])) > 0 ? join("\n", + [ for env in instance.env : + " - name: ${env.name}\n value: \"${env.value}\"" + ] + ) : "" + } + ) + if instance.enable + } : {} +} + +# Deploy OpenObserve using Helm +resource "helm_release" "openobserve" { + for_each = local.enable_openobserve ? { for instance in var.openobserve : instance.name => instance if instance.enable } : {} + + name = each.value.name + repository = "https://helm.zop.dev" + chart = "openobserve-standalone" + version = "v1.0.0" + namespace = kubernetes_namespace.app_environments["openobserve"].metadata[0].name + + values = [ + local.openobserve_templates[each.key] + ] + + depends_on = [ + azurerm_storage_container.openobserve_data, + kubernetes_namespace.app_environments, + ] +} diff --git a/observability/azure/outputs.tf b/observability/azure/outputs.tf index 360288d1..3c8dbfa3 100644 --- a/observability/azure/outputs.tf +++ b/observability/azure/outputs.tf @@ -26,3 +26,15 @@ output "cortex_host_url" { value = local.enable_cortex ? (local.enable_ingress_cortex ? 
kubernetes_ingress_v1.service_ingress["cortex-distributor:8080-cortex"].spec[0].rule[0].host : "cortex-distributor.cortex:8080") : "" } +output "openobserve_instances" { + description = "OpenObserve instances with URL, username, and password grouped together" + value = local.enable_openobserve ? { + for instance in var.openobserve : instance.name => { + name = instance.name + url = try(instance.enable_ingress, true) ? try(kubernetes_ingress_v1.service_ingress["${instance.name}-openobserve-standalone:5080-openobserve"].spec[0].rule[0].host, "${instance.name}.openobserve:5080") : "${instance.name}.openobserve:5080" + username = "admin@zop.dev" + password = random_password.openobserve_password[instance.name].result + } if instance.enable + } : {} + sensitive = true +} diff --git a/observability/azure/templates/openobserve-values.yaml b/observability/azure/templates/openobserve-values.yaml new file mode 100644 index 00000000..a1834d7f --- /dev/null +++ b/observability/azure/templates/openobserve-values.yaml @@ -0,0 +1,51 @@ +replicaCount: ${replica_count} + +resources: + requests: + cpu: ${cpu_request} + memory: ${memory_request} + limits: + cpu: ${cpu_limit} + memory: ${memory_limit} + +auth: + ZO_ROOT_USER_EMAIL: "${root_user_email}" + ZO_ROOT_USER_PASSWORD: "${root_user_password}" + ZO_ROOT_USER_TOKEN: "" + ZO_S3_ACCESS_KEY: "" + ZO_S3_SECRET_KEY: "" + AZURE_STORAGE_ACCOUNT_KEY: "${account_key}" + AZURE_STORAGE_ACCOUNT_NAME: "${storage_account}" + +config: + ZO_S3_PROVIDER: "azure" + ZO_S3_BUCKET_NAME: "${storage_bucket_name}" + ZO_LOCAL_MODE_STORAGE: "s3" + ZO_S3_BATCH_SIZE: "1000" + ZO_S3_BATCH_WAIT: "5s" + ZO_S3_FLUSH_INTERVAL: "30s" + ZO_S3_COMPRESSION: "gzip" + ZO_MEM_PERSIST_INTERVAL: "20" + ZO_FILE_PUSH_INTERVAL: "10" + ZO_MAX_FILE_SIZE_ON_DISK: "128" + ZO_MAX_FILE_RETENTION_TIME: "30" + ZO_COMPACT_MAX_FILE_SIZE: "512" + ZO_MEM_TABLE_MAX_SIZE: "256" + ZO_MEMORY_CACHE_DATAFUSION_MAX_SIZE: "0" + ZO_MEMORY_CACHE_GC_INTERVAL: "60" + ZO_MEMORY_CACHE_MAX_SIZE: "0" 
+ +persistence: + enabled: false + +probes: + enabled: true + path: / + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 3 + +extraEnv: +%{~ if additional_env_vars != "" ~} +${additional_env_vars} +%{~ endif ~} diff --git a/observability/azure/tempo.tf b/observability/azure/tempo.tf index b9fbe0f8..8c1ba95d 100644 --- a/observability/azure/tempo.tf +++ b/observability/azure/tempo.tf @@ -9,6 +9,52 @@ locals { ], {}) } +locals { + tempo_values = local.enable_tempo ? templatefile("${path.module}/templates/tempo-values.yaml", { + # Storage config + CONTAINER = azurerm_storage_container.tempo_container[0].name + STORAGE_ACCOUNT = var.storage_account + ACCOUNT_KEY = var.account_access_key + + # Ingester + ingester_replicas = try(var.tempo.ingester.replicas, "1") + ingester_min_memory = try(var.tempo.ingester.min_memory, "1Gi") + ingester_max_memory = try(var.tempo.ingester.max_memory, null) + ingester_min_cpu = try(var.tempo.ingester.min_cpu, null) + ingester_max_cpu = try(var.tempo.ingester.max_cpu, null) + ingester_autoscaling = try(var.tempo.ingester.autoscaling, "true") + ingester_min_replicas = try(var.tempo.ingester.min_replicas, "2") + ingester_max_replicas = try(var.tempo.ingester.max_replicas, "30") + ingester_memory_utilization = try(var.tempo.ingester.memory_utilization, "") + ingester_cpu_utilization = try(var.tempo.ingester.cpu_utilization, "") + + # Distributor + distributor_replicas = try(var.tempo.distributor.replicas, "1") + distributor_min_memory = try(var.tempo.distributor.min_memory, "750Mi") + distributor_max_memory = try(var.tempo.distributor.max_memory, null) + distributor_min_cpu = try(var.tempo.distributor.min_cpu, null) + distributor_max_cpu = try(var.tempo.distributor.max_cpu, null) + distributor_autoscaling = try(var.tempo.distributor.autoscaling, "true") + distributor_min_replicas = try(var.tempo.distributor.min_replicas, "2") + distributor_max_replicas = try(var.tempo.distributor.max_replicas, "30") + 
distributor_memory_utilization = try(var.tempo.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.tempo.distributor.cpu_utilization, "") + + # Querier & Query Frontend + querier_replicas = try(var.tempo.querier.replicas, "1") + query_frontend_replicas = try(var.tempo.queryFrontend.replicas, "1") + + # Metrics Generator + metrics_generator_enable = try(var.tempo.metrics_generator.enable, false) + metrics_generator_replicas = try(var.tempo.metrics_generator.replicas, "1") + metrics_generator_service_graphs_max_items = try(var.tempo.metrics_generator.service_graphs_max_items, "30000") + metrics_generator_service_graphs_wait = try(var.tempo.metrics_generator.service_graphs_wait, "30s") + metrics_generator_remote_write_flush_deadline = try(var.tempo.metrics_generator.remote_write_flush_deadline, "2m") + metrics_generator_remote_write = jsonencode(local.remote_write_config) + metrics_generator_metrics_ingestion_time_range_slack = try(var.tempo.metrics_generator.metrics_ingestion_time_range_slack, "40s") + }) : null +} + resource "azurerm_storage_container" "tempo_container" { count = local.enable_tempo ? 1 : 0 name = "${local.cluster_name}-tempo-container-${var.observability_suffix}" @@ -16,46 +62,6 @@ resource "azurerm_storage_container" "tempo_container" { container_access_type = "private" } - -data "template_file" "tempo_template"{ - count = local.enable_tempo ? 1 : 0 - template = file("${path.module}/templates/tempo-values.yaml") - vars = { - "CONTAINER" = azurerm_storage_container.tempo_container[0].name - "STORAGE_ACCOUNT" = var.storage_account - "ACCOUNT_KEY" = var.account_access_key - ingester_replicas = try(var.tempo.ingester.replicas != null ? var.tempo.ingester.replicas : "1", "1") - ingester_min_memory = try(var.tempo.ingester.min_memory != null ? var.tempo.ingester.min_memory : "1Gi", "1Gi") - ingester_max_memory = try(var.tempo.ingester.max_memory != null ? 
var.tempo.ingester.max_memory : "null", "null") - ingester_min_cpu = try(var.tempo.ingester.min_cpu != null ? var.tempo.ingester.min_cpu : "null", "null") - ingester_max_cpu = try(var.tempo.ingester.max_cpu != null ? var.tempo.ingester.max_cpu : "null", "null") - ingester_autoscaling = try(var.tempo.ingester.autoscaling != null ? var.tempo.ingester.autoscaling : "true", "true") - ingester_min_replicas = try(var.tempo.ingester.min_replicas != null ? var.tempo.ingester.min_replicas : "2", "2") - ingester_max_replicas = try(var.tempo.ingester.max_replicas != null ? var.tempo.ingester.max_replicas : "30", "30") - ingester_memory_utilization = try(var.tempo.ingester.memory_utilization != null ? var.tempo.ingester.memory_utilization : "", "") - ingester_cpu_utilization = try(var.tempo.ingester.cpu_utilization != null ? var.tempo.ingester.cpu_utilization : "", "") - distributor_replicas = try(var.tempo.distributor.replicas != null ? var.tempo.distributor.replicas : "1", "1") - distributor_min_memory = try(var.tempo.distributor.min_memory != null ? var.tempo.distributor.min_memory : "750Mi", "750Mi") - distributor_max_memory = try(var.tempo.distributor.max_memory != null ? var.tempo.distributor.max_memory : "null", "null") - distributor_min_cpu = try(var.tempo.distributor.min_cpu != null ? var.tempo.distributor.min_cpu : "null", "null") - distributor_max_cpu = try(var.tempo.distributor.max_cpu != null ? var.tempo.distributor.max_cpu : "null", "null") - distributor_autoscaling = try(var.tempo.distributor.autoscaling != null ? var.tempo.distributor.autoscaling : "true", "true") - distributor_min_replicas = try(var.tempo.distributor.min_replicas != null ? var.tempo.distributor.min_replicas : "2", "2") - distributor_max_replicas = try(var.tempo.distributor.max_replicas != null ? var.tempo.distributor.max_replicas : "30", "30") - distributor_memory_utilization = try(var.tempo.distributor.memory_utilization != null ? 
var.tempo.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.tempo.distributor.cpu_utilization != null ? var.tempo.distributor.cpu_utilization : "","") - querier_replicas = try(var.tempo.querier.replicas != null ? var.tempo.querier.replicas : "1", "1") - queryFrontend_replicas = try(var.tempo.queryFrontend.replicas != null ? var.tempo.queryFrontend.replicas : "1", "1") - metrics_generator_enable = try(var.tempo.metrics_generator.enable != null ? var.tempo.metrics_generator.enable : false, false) - metrics_generator_replicas = try(var.tempo.metrics_generator.replicas != null ? var.tempo.metrics_generator.replicas : "1", "1") - metrics_generator_service_graphs_max_items = try(var.tempo.metrics_generator.service_graphs_max_items != null ? var.tempo.metrics_generator.service_graphs_max_items : "30000", "30000") - metrics_generator_service_graphs_wait = try(var.tempo.metrics_generator.service_graphs_wait != null ? var.tempo.metrics_generator.service_graphs_wait : "30s", "30s") - metrics_generator_remote_write_flush_deadline = try(var.tempo.metrics_generator.remote_write_flush_deadline != null ? var.tempo.metrics_generator.remote_write_flush_deadline : "2m", "2m") - metrics_generator_remote_write = jsonencode(local.remote_write_config) - metrics_generator_metrics_ingestion_time_range_slack = try(var.tempo.metrics_generator.metrics_ingestion_time_range_slack != null ? var.tempo.metrics_generator.metrics_ingestion_time_range_slack : "40s", "40s") - } -} - resource "helm_release" "tempo" { count = local.enable_tempo ? 
1 : 0 name = "tempo" @@ -65,6 +71,6 @@ resource "helm_release" "tempo" { version = "1.38.0" values = [ - data.template_file.tempo_template[0].rendered + local.tempo_values ] } \ No newline at end of file diff --git a/observability/azure/vars.tf b/observability/azure/vars.tf index c3a601dd..6391f232 100644 --- a/observability/azure/vars.tf +++ b/observability/azure/vars.tf @@ -375,4 +375,23 @@ variable "mimir" { max_memory = optional(string) })) }) +} + +variable "openobserve" { + description = "List of OpenObserve instances to deploy" + type = list(object({ + enable = bool + name = string + replicaCount = optional(number, 2) + min_cpu = optional(string, "250m") + max_cpu = optional(string, "1") + min_memory = optional(string, "1Gi") + max_memory = optional(string, "2Gi") + enable_ingress = optional(bool, true) + env = optional(list(object({ + name = string + value = string + })), []) + })) + default = [] } \ No newline at end of file diff --git a/observability/azure/versions.tf b/observability/azure/versions.tf index 6db58749..871ac339 100644 --- a/observability/azure/versions.tf +++ b/observability/azure/versions.tf @@ -4,6 +4,10 @@ terraform { source = "alekc/kubectl" version = "2.0.4" } + random = { + source = "hashicorp/random" + version = ">= 3.1.0" + } } required_version = ">= 1.0.0" # experiments = [module_variable_optional_attrs] diff --git a/observability/gcp/cortex.tf b/observability/gcp/cortex.tf index 2b86a100..e2bd3a67 100644 --- a/observability/gcp/cortex.tf +++ b/observability/gcp/cortex.tf @@ -1,10 +1,113 @@ +locals { + cortex_template = local.enable_cortex ? 
templatefile( + "${path.module}/templates/cortex-values.yaml", + { + data_bucket_name = google_storage_bucket.cortex_data[0].id + cluster_name = local.cluster_name + app_region = var.app_region + + limits_ingestion_rate = try(var.cortex.limits.ingestion_rate, "250000") + limits_ingestion_burst_size = try(var.cortex.limits.ingestion_burst_size, "500000") + limits_max_series_per_metric = try(var.cortex.limits.max_series_per_metric, "0") + limits_max_series_per_user = try(var.cortex.limits.max_series_per_user, "0") + limits_max_fetched_chunks_per_query = try(var.cortex.limits.max_fetched_chunks_per_query, "3000000") + + query_range_memcached_client_timeout = try(var.cortex.query_range.memcached_client_timeout, "30s") + + compactor_enable = try(var.cortex.compactor.enable, "true") + compactor_replicas = try(var.cortex.compactor.replicas, "1") + compactor_persistence_volume_enable = try(var.cortex.compactor.persistence_volume.enable, "true") + compactor_persistence_volume_size = try(var.cortex.compactor.persistence_volume.size, "20Gi") + compactor_min_cpu = try(var.cortex.compactor.min_cpu, "null") + compactor_min_memory = try(var.cortex.compactor.min_memory, "null") + compactor_max_cpu = try(var.cortex.compactor.max_cpu, "null") + compactor_max_memory = try(var.cortex.compactor.max_memory, "null") + + ingester_replicas = try(var.cortex.ingester.replicas, "1") + ingester_persistence_volume_size = try(var.cortex.ingester.persistence_volume.size, "20Gi") + ingester_min_memory = try(var.cortex.ingester.min_memory, "null") + ingester_min_cpu = try(var.cortex.ingester.min_cpu, "null") + ingester_max_memory = try(var.cortex.ingester.max_memory, "null") + ingester_max_cpu = try(var.cortex.ingester.max_cpu, "null") + ingester_autoscaling = try(var.cortex.ingester.autoscaling, "true") + ingester_max_replicas = try(var.cortex.ingester.max_replicas, "100") + ingester_min_replicas = try(var.cortex.ingester.min_replicas, "2") + ingester_memory_utilization = 
try(var.cortex.ingester.memory_utilization, "") + + querier_replicas = try(var.cortex.querier.replicas, "1") + querier_min_memory = try(var.cortex.querier.min_memory, "null") + querier_min_cpu = try(var.cortex.querier.min_cpu, "null") + querier_max_memory = try(var.cortex.querier.max_memory, "null") + querier_max_cpu = try(var.cortex.querier.max_cpu, "null") + querier_autoscaling = try(var.cortex.querier.autoscaling, "true") + querier_max_replicas = try(var.cortex.querier.max_replicas, "20") + querier_min_replicas = try(var.cortex.querier.min_replicas, "2") + querier_memory_utilization = try(var.cortex.querier.memory_utilization, "") + querier_cpu_utilization = try(var.cortex.querier.cpu_utilization, "") + + query_frontend_replicas = try(var.cortex.query_frontend.replicas, "4") + query_frontend_enable = try(var.cortex.query_frontend.enable, "true") + + store_gateway_replication_factor = try(var.cortex.store_gateway.replication_factor, "3") + store_gateway_replicas = try(var.cortex.store_gateway.replicas, "1") + store_gateway_persistence_volume_size = try(var.cortex.store_gateway.persistence_volume.size, "500Gi") + store_gateway_min_memory = try(var.cortex.store_gateway.min_memory, "null") + store_gateway_min_cpu = try(var.cortex.store_gateway.min_cpu, "null") + store_gateway_max_memory = try(var.cortex.store_gateway.max_memory, "null") + store_gateway_max_cpu = try(var.cortex.store_gateway.max_cpu, "null") + + memcached_frontend_enable = try(var.cortex.memcached_frontend.enable, "true") + memcached_frontend_min_memory = try(var.cortex.memcached_frontend.min_memory, "null") + memcached_frontend_min_cpu = try(var.cortex.memcached_frontend.min_cpu, "null") + memcached_frontend_max_memory = try(var.cortex.memcached_frontend.max_memory, "null") + memcached_frontend_max_cpu = try(var.cortex.memcached_frontend.max_cpu, "null") + + memcached_blocks_index_enable = try(var.cortex.memcached_blocks_index.enable, "true") + memcached_blocks_index_min_cpu = 
try(var.cortex.memcached_blocks_index.min_cpu, "null") + memcached_blocks_index_min_memory = try(var.cortex.memcached_blocks_index.min_memory, "null") + memcached_blocks_index_max_cpu = try(var.cortex.memcached_blocks_index.max_cpu, "null") + memcached_blocks_index_max_memory = try(var.cortex.memcached_blocks_index.max_memory, "null") + + memcached_blocks_enable = try(var.cortex.memcached_blocks.enable, "true") + memcached_blocks_min_memory = try(var.cortex.memcached_blocks.min_memory, "null") + memcached_blocks_min_cpu = try(var.cortex.memcached_blocks.min_cpu, "null") + memcached_blocks_max_memory = try(var.cortex.memcached_blocks.max_memory, "null") + memcached_blocks_max_cpu = try(var.cortex.memcached_blocks.max_cpu, "null") + + memcached_blocks_metadata_enable = try(var.cortex.memcached_blocks_metadata.enable, "true") + memcached_blocks_metadata_min_memory = try(var.cortex.memcached_blocks_metadata.min_memory, "null") + memcached_blocks_metadata_min_cpu = try(var.cortex.memcached_blocks_metadata.min_cpu, "null") + memcached_blocks_metadata_max_memory = try(var.cortex.memcached_blocks_metadata.max_memory, "null") + memcached_blocks_metadata_max_cpu = try(var.cortex.memcached_blocks_metadata.max_cpu, "null") + + distributor_replicas = try(var.cortex.distributor.replicas, "1") + distributor_min_memory = try(var.cortex.distributor.min_memory, "null") + distributor_min_cpu = try(var.cortex.distributor.min_cpu, "null") + distributor_max_memory = try(var.cortex.distributor.max_memory, "null") + distributor_max_cpu = try(var.cortex.distributor.max_cpu, "null") + distributor_autoscaling = try(var.cortex.distributor.autoscaling, "true") + distributor_max_replicas = try(var.cortex.distributor.max_replicas, "30") + distributor_min_replicas = try(var.cortex.distributor.min_replicas, "2") + distributor_memory_utilization = try(var.cortex.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.cortex.distributor.cpu_utilization, "") + } + ) : "" +} + 
resource "google_storage_bucket" "cortex_data" { count = local.enable_cortex ? 1 : 0 name = "${local.cluster_name}-cortex-data-${var.observability_suffix}" location = var.app_region project = var.project_id - force_destroy = true + force_destroy = false labels = var.labels + + uniform_bucket_level_access = true + public_access_prevention = "enforced" + + lifecycle { + prevent_destroy = true + } } resource "google_service_account" "cortex_svc_acc" { @@ -43,89 +146,6 @@ resource "kubernetes_secret" "cortex-google-credentials" { type = "Opaque" } -data "template_file" "cortex_template"{ - count = local.enable_cortex ? 1 : 0 - template = file("${path.module}/templates/cortex-values.yaml") - vars = { - data_bucket_name = google_storage_bucket.cortex_data[0].id - cluster_name = local.cluster_name - app_region = var.app_region - limits_ingestion_rate = try(var.cortex.limits.ingestion_rate != null ? var.cortex.limits.ingestion_rate : "250000", "250000") - limits_ingestion_burst_size = try(var.cortex.limits.ingestion_burst_size != null ? var.cortex.limits.ingestion_burst_size : "500000", "500000") - limits_max_series_per_metric = try(var.cortex.limits.max_series_per_metric != null ? var.cortex.limits.max_series_per_metric : "0", "0") - limits_max_series_per_user = try(var.cortex.limits.max_series_per_user != null ? var.cortex.limits.max_series_per_user : "0", "0") - limits_max_fetched_chunks_per_query = try(var.cortex.limits.max_fetched_chunks_per_query != null ? var.cortex.limits.max_fetched_chunks_per_query : "3000000", "3000000") - query_range_memcached_client_timeout = try(var.cortex.query_range.memcached_client_timeout != null ? var.cortex.query_range.memcached_client_timeout : "30s", "30s") - compactor_enable = try(var.cortex.compactor.enable != null ? var.cortex.compactor.enable : "true", "true") - compactor_replicas = try(var.cortex.compactor.replicas != null ? 
var.cortex.compactor.replicas : "1", "1") - compactor_persistence_volume_enable = try(var.cortex.compactor.persistence_volume.enable != null ? var.cortex.compactor.persistence_volume.enable : "true", "true") - compactor_persistence_volume_size = try(var.cortex.compactor.persistence_volume.size != null ? var.cortex.compactor.persistence_volume.size : "20Gi", "20Gi") - compactor_min_cpu = try(var.cortex.compactor.min_cpu != null ? var.cortex.compactor.min_cpu : "null", "null") - compactor_min_memory = try(var.cortex.compactor.min_memory != null ? var.cortex.compactor.min_memory : "null", "null") - compactor_max_cpu = try(var.cortex.compactor.max_cpu != null ? var.cortex.compactor.max_cpu : "null", "null") - compactor_max_memory = try(var.cortex.compactor.max_memory != null ? var.cortex.compactor.max_memory : "null", "null") - ingester_replicas = try(var.cortex.ingester.replicas != null ? var.cortex.ingester.replicas : "1", "1") - ingester_persistence_volume_size = try(var.cortex.ingester.persistence_volume.size != null ? var.cortex.ingester.persistence_volume.size : "20Gi", "20Gi") - ingester_min_memory = try(var.cortex.ingester.min_memory != null ? var.cortex.ingester.min_memory : "null", "null") - ingester_min_cpu = try(var.cortex.ingester.min_cpu != null ? var.cortex.ingester.min_cpu : "null", "null") - ingester_max_memory = try(var.cortex.ingester.max_memory != null ? var.cortex.ingester.max_memory : "null", "null") - ingester_max_cpu = try(var.cortex.ingester.max_cpu != null ? var.cortex.ingester.max_cpu : "null", "null") - ingester_autoscaling = try(var.cortex.ingester.autoscaling != null ? var.cortex.ingester.autoscaling : "true", "true") - ingester_max_replicas = try(var.cortex.ingester.max_replicas != null ? var.cortex.ingester.max_replicas : "100", "100") - ingester_min_replicas = try(var.cortex.ingester.min_replicas != null ? var.cortex.ingester.min_replicas : "2", "2") - ingester_memory_utilization = try(var.cortex.ingester.memory_utilization != null ? 
var.cortex.ingester.memory_utilization : "", "") - querier_replicas = try(var.cortex.querier.replicas != null ? var.cortex.querier.replicas : "1", "1") - querier_min_memory = try(var.cortex.querier.min_memory != null ? var.cortex.querier.min_memory : "null", "null") - querier_min_cpu = try(var.cortex.querier.min_cpu != null ? var.cortex.querier.min_cpu : "null", "null") - querier_max_memory = try(var.cortex.querier.max_memory != null ? var.cortex.querier.max_memory : "null", "null") - querier_max_cpu = try(var.cortex.querier.max_cpu != null ? var.cortex.querier.max_cpu : "null", "null") - querier_autoscaling = try(var.cortex.querier.autoscaling != null ? var.cortex.querier.autoscaling : "true", "true") - querier_max_replicas = try(var.cortex.querier.max_replicas != null ? var.cortex.querier.max_replicas : "20", "20") - querier_min_replicas = try(var.cortex.querier.min_replicas != null ? var.cortex.querier.min_replicas : "2", "2") - querier_memory_utilization = try(var.cortex.querier.memory_utilization != null ? var.cortex.querier.memory_utilization : "", "") - querier_cpu_utilization = try(var.cortex.querier.cpu_utilization != null ? var.cortex.querier.cpu_utilization : "", "") - query_frontend_replicas = try(var.cortex.query_frontend.replicas != null ? var.cortex.query_frontend.replicas : "4", "4") - query_frontend_enable = try(var.cortex.query_frontend.enable != null ? var.cortex.query_frontend.enable : "true", "true") - store_gateway_replication_factor = try(var.cortex.store_gateway.replication_factor != null ? var.cortex.store_gateway.replication_factor : "3", "3") - store_gateway_replicas = try(var.cortex.store_gateway.replicas != null ? var.cortex.store_gateway.replicas : "1", "1") - store_gateway_persistence_volume_size = try(var.cortex.store_gateway.persistence_volume.size != null ? var.cortex.store_gateway.persistence_volume.size : "500Gi", "500Gi") - store_gateway_min_memory = try(var.cortex.store_gateway.min_memory != null ? 
var.cortex.store_gateway.min_memory : "null", "null") - store_gateway_min_cpu = try(var.cortex.store_gateway.min_cpu != null ? var.cortex.store_gateway.min_cpu : "null", "null") - store_gateway_max_memory = try(var.cortex.store_gateway.max_memory != null ? var.cortex.store_gateway.max_memory : "null", "null") - store_gateway_max_cpu = try(var.cortex.store_gateway.max_cpu != null ? var.cortex.store_gateway.max_cpu : "null", "null") - memcached_frontend_enable = try(var.cortex.memcached_frontend.enable != null ? var.cortex.memcached_frontend.enable : "true", "true") - memcached_frontend_min_memory = try(var.cortex.memcached_frontend.min_memory != null ? var.cortex.memcached_frontend.min_memory : "null", "null") - memcached_frontend_min_cpu = try(var.cortex.memcached_frontend.min_cpu != null ? var.cortex.memcached_frontend.min_cpu : "null", "null") - memcached_frontend_max_memory = try(var.cortex.memcached_frontend.max_memory != null ? var.cortex.memcached_frontend.max_memory : "null", "null") - memcached_frontend_max_cpu = try(var.cortex.memcached_frontend.max_cpu != null ? var.cortex.memcached_frontend.max_cpu : "null", "null") - memcached_blocks_index_enable = try(var.cortex.memcached_blocks_index.enable != null ? var.cortex.memcached_blocks_index.enable : "true", "true") - memcached_blocks_index_min_cpu = try(var.cortex.memcached_blocks_index.min_cpu != null ? var.cortex.memcached_blocks_index.min_cpu : "null", "null") - memcached_blocks_index_min_memory = try(var.cortex.memcached_blocks_index.min_memory != null ? var.cortex.memcached_blocks_index.min_memory : "null", "null") - memcached_blocks_index_max_cpu = try(var.cortex.memcached_blocks_index.max_cpu != null ? var.cortex.memcached_blocks_index.max_cpu : "null", "null") - memcached_blocks_index_max_memory = try(var.cortex.memcached_blocks_index.max_memory != null ? var.cortex.memcached_blocks_index.max_memory : "null", "null") - memcached_blocks_enable = try(var.cortex.memcached_blocks.enable != null ? 
var.cortex.memcached_blocks.enable : "true", "true") - memcached_blocks_min_memory = try(var.cortex.memcached_blocks.min_memory != null ? var.cortex.memcached_blocks.min_memory : "null", "null") - memcached_blocks_min_cpu = try(var.cortex.memcached_blocks.min_cpu != null ? var.cortex.memcached_blocks.min_cpu : "null", "null") - memcached_blocks_max_memory = try(var.cortex.memcached_blocks.max_memory != null ? var.cortex.memcached_blocks.max_memory : "null", "null") - memcached_blocks_max_cpu = try(var.cortex.memcached_blocks.max_cpu != null ? var.cortex.memcached_blocks.max_cpu : "null", "null") - memcached_blocks_metadata_enable = try(var.cortex.memcached_blocks_metadata.enable != null ? var.cortex.memcached_blocks_metadata.enable : "true", "true") - memcached_blocks_metadata_min_memory = try(var.cortex.memcached_blocks_metadata.min_memory != null ? var.cortex.memcached_blocks_metadata.min_memory : "null", "null") - memcached_blocks_metadata_min_cpu = try(var.cortex.memcached_blocks_metadata.min_cpu != null ? var.cortex.memcached_blocks_metadata.min_cpu : "null", "null") - memcached_blocks_metadata_max_memory = try(var.cortex.memcached_blocks_metadata.max_memory != null ? var.cortex.memcached_blocks_metadata.max_memory : "null", "null") - memcached_blocks_metadata_max_cpu = try(var.cortex.memcached_blocks_metadata.max_cpu != null ? var.cortex.memcached_blocks_metadata.max_cpu : "null", "null") - distributor_replicas = try(var.cortex.distributor.replicas != null ? var.cortex.distributor.replicas : "1", "1") - distributor_min_memory = try(var.cortex.distributor.min_memory != null ? var.cortex.distributor.min_memory : "null", "null") - distributor_min_cpu = try(var.cortex.distributor.min_cpu != null ? var.cortex.distributor.min_cpu : "null", "null") - distributor_max_memory = try(var.cortex.distributor.max_memory != null ? var.cortex.distributor.max_memory : "null", "null") - distributor_max_cpu = try(var.cortex.distributor.max_cpu != null ? 
var.cortex.distributor.max_cpu : "null", "null") - distributor_autoscaling = try(var.cortex.distributor.autoscaling != null ? var.cortex.distributor.autoscaling : "true", "true") - distributor_max_replicas = try(var.cortex.distributor.max_replicas != null ? var.cortex.distributor.max_replicas : "30", "30") - distributor_min_replicas = try(var.cortex.distributor.min_replicas != null ? var.cortex.distributor.min_replicas : "2", "2") - distributor_memory_utilization = try(var.cortex.distributor.memory_utilization != null ? var.cortex.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.cortex.distributor.cpu_utilization != null ? var.cortex.distributor.cpu_utilization : "", "") - } -} - resource "helm_release" "cortex" { count = local.enable_cortex ? 1 : 0 name = "cortex" @@ -135,7 +155,7 @@ resource "helm_release" "cortex" { version = "2.0.0" values = [ - data.template_file.cortex_template[0].rendered + local.cortex_template ] depends_on = [ diff --git a/observability/gcp/loki.tf b/observability/gcp/loki.tf index 57cee326..98d69570 100644 --- a/observability/gcp/loki.tf +++ b/observability/gcp/loki.tf @@ -1,10 +1,73 @@ +locals { + loki_template = local.enable_loki ? 
templatefile( + "${path.module}/templates/loki-values.yaml", + { + cluster_name = local.cluster_name + data_bucket_name = google_storage_bucket.loki_data[0].id + service_account = google_service_account.loki_svc_acc[0].email + + ingester_replicas = try(var.loki.ingester.replicas, "1") + ingester_max_memory = try(var.loki.ingester.max_memory, "null") + ingester_min_memory = try(var.loki.ingester.min_memory, "1Gi") + ingester_max_cpu = try(var.loki.ingester.max_cpu, "null") + ingester_min_cpu = try(var.loki.ingester.min_cpu, "null") + ingester_autoscaling = try(var.loki.ingester.autoscaling, "true") + ingester_max_replicas = try(var.loki.ingester.max_replicas, "30") + ingester_min_replicas = try(var.loki.ingester.min_replicas, "2") + ingester_cpu_utilization = try(var.loki.ingester.cpu_utilization, "") + ingester_memory_utilization = try(var.loki.ingester.memory_utilization, "") + + distributor_replicas = try(var.loki.distributor.replicas, "1") + distributor_max_memory = try(var.loki.distributor.max_memory, "1Gi") + distributor_min_memory = try(var.loki.distributor.min_memory, "512Mi") + distributor_max_cpu = try(var.loki.distributor.max_cpu, "1") + distributor_min_cpu = try(var.loki.distributor.min_cpu, "250m") + distributor_autoscaling = try(var.loki.distributor.autoscaling, "true") + distributor_max_replicas = try(var.loki.distributor.max_replicas, "30") + distributor_min_replicas = try(var.loki.distributor.min_replicas, "2") + distributor_memory_utilization = try(var.loki.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.loki.distributor.cpu_utilization, "") + + querier_replicas = try(var.loki.querier.replicas, "4") + querier_max_unavailable = try(var.loki.querier.max_unavailable, "1") + querier_min_memory = try(var.loki.querier.min_memory, "500Mi") + querier_min_cpu = try(var.loki.querier.min_cpu, "100m") + querier_max_memory = try(var.loki.querier.max_memory, "null") + querier_max_cpu = try(var.loki.querier.max_cpu, "null") + 
querier_autoscaling = try(var.loki.querier.autoscaling, "true") + querier_max_replicas = try(var.loki.querier.max_replicas, "6") + querier_min_replicas = try(var.loki.querier.min_replicas, "2") + querier_memory_utilization = try(var.loki.querier.memory_utilization, "") + querier_cpu_utilization = try(var.loki.querier.cpu_utilization, "") + + queryFrontend_replicas = try(var.loki.queryFrontend.replicas, "1") + queryFrontend_min_memory = try(var.loki.queryFrontend.min_memory, "250Mi") + queryFrontend_max_memory = try(var.loki.queryFrontend.max_memory, "null") + queryFrontend_min_cpu = try(var.loki.queryFrontend.min_cpu, "null") + queryFrontend_max_cpu = try(var.loki.queryFrontend.max_cpu, "null") + queryFrontend_autoscaling = try(var.loki.queryFrontend.autoscaling, "true") + queryFrontend_max_replicas = try(var.loki.queryFrontend.max_replicas, "6") + queryFrontend_min_replicas = try(var.loki.queryFrontend.min_replicas, "1") + queryFrontend_memory_utilization = try(var.loki.queryFrontend.memory_utilization, "") + queryFrontend_cpu_utilization = try(var.loki.queryFrontend.cpu_utilization, "") + } + ) : "" +} + resource "google_storage_bucket" "loki_data" { count = local.enable_loki ? 1 : 0 name = "${local.cluster_name}-loki-data-${var.observability_suffix}" location = var.app_region project = var.project_id - force_destroy = true + force_destroy = false labels = var.labels + + uniform_bucket_level_access = true + public_access_prevention = "enforced" + + lifecycle { + prevent_destroy = true + } } resource "google_project_iam_member" "loki-k8s-service-account" { @@ -58,57 +121,6 @@ resource "kubernetes_secret" "loki-google-credentials" { type = "Opaque" } -data "template_file" "loki_template" { - count = local.enable_loki ? 
1 : 0 - template = file("${path.module}/templates/loki-values.yaml") - vars = { - cluster_name = local.cluster_name - data_bucket_name = google_storage_bucket.loki_data[0].id - service_account = google_service_account.loki_svc_acc[0].email - ingester_replicas = try(var.loki.ingester.replicas != null ? var.loki.ingester.replicas : "1", "1") - ingester_max_memory = try(var.loki.ingester.max_memory != null ? var.loki.ingester.max_memory : "null", "null") - ingester_min_memory = try(var.loki.ingester.min_memory != null ? var.loki.ingester.min_memory : "1Gi", "1Gi") - ingester_max_cpu = try(var.loki.ingester.max_cpu != null ? var.loki.ingester.max_cpu : "null", "null") - ingester_min_cpu = try(var.loki.ingester.min_cpu != null ? var.loki.ingester.min_cpu : "null", "null") - ingester_autoscaling = try(var.loki.ingester.autoscaling != null ? var.loki.ingester.autoscaling : "true", "true") - ingester_max_replicas = try(var.loki.ingester.max_replicas != null ? var.loki.ingester.max_replicas : "30", "30") - ingester_min_replicas = try(var.loki.ingester.min_replicas != null ? var.loki.ingester.min_replicas : "2", "2") - ingester_cpu_utilization = try(var.loki.ingester.cpu_utilization != null ? var.loki.ingester.cpu_utilization : "", "") - ingester_memory_utilization = try(var.loki.ingester.memory_utilization != null ? var.loki.ingester.memory_utilization : "", "") - distributor_replicas = try(var.loki.distributor.replicas != null ? var.loki.distributor.replicas : "1", "1") - distributor_max_memory = try(var.loki.distributor.max_memory != null ? var.loki.distributor.max_memory : "1Gi", "1Gi") - distributor_min_memory = try(var.loki.distributor.min_memory != null ? var.loki.distributor.min_memory : "512Mi", "512Mi") - distributor_max_cpu = try(var.loki.distributor.max_cpu != null ? var.loki.distributor.max_cpu : "1", "1") - distributor_min_cpu = try(var.loki.distributor.min_cpu != null ? 
var.loki.distributor.min_cpu : "250m", "250m") - distributor_autoscaling = try(var.loki.distributor.autoscaling != null ? var.loki.distributor.autoscaling : "true", "true") - distributor_max_replicas = try(var.loki.distributor.max_replicas != null ? var.loki.distributor.max_replicas : "30", "30") - distributor_min_replicas = try(var.loki.distributor.min_replicas != null ? var.loki.distributor.min_replicas : "2", "2") - distributor_memory_utilization = try(var.loki.distributor.memory_utilization != null ? var.loki.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.loki.distributor.cpu_utilization != null ? var.loki.distributor.cpu_utilization : "", "") - querier_replicas = try(var.loki.querier.replicas != null ? var.loki.querier.replicas : "4", "4") - querier_max_unavailable = try(var.loki.querier.max_unavailable != null ? var.loki.querier.max_unavailable : "1", "1") - querier_min_memory = try(var.loki.querier.min_memory != null ? var.loki.querier.min_memory : "500Mi", "500Mi") - querier_min_cpu = try(var.loki.querier.min_cpu != null ? var.loki.querier.min_cpu : "100m", "100m") - querier_max_memory = try(var.loki.querier.max_memory != null ? var.loki.querier.max_memory : "null", "null") - querier_max_cpu = try(var.loki.querier.max_cpu != null ? var.loki.querier.max_cpu : "null", "null") - querier_autoscaling = try(var.loki.querier.autoscaling != null ? var.loki.querier.autoscaling : "true", "true") - querier_max_replicas = try(var.loki.querier.max_replicas != null ? var.loki.querier.max_replicas : "6", "6") - querier_min_replicas = try(var.loki.querier.min_replicas != null ? var.loki.querier.min_replicas : "2", "2") - querier_memory_utilization = try(var.loki.querier.memory_utilization != null ? var.loki.querier.memory_utilization : "", "") - querier_cpu_utilization = try(var.loki.querier.cpu_utilization != null ? var.loki.querier.cpu_utilization : "", "") - queryFrontend_replicas = try(var.loki.queryFrontend.replicas != null ? 
var.loki.queryFrontend.replicas : "1", "1") - queryFrontend_min_memory = try(var.loki.queryFrontend.min_memory != null ? var.loki.queryFrontend.min_memory : "250Mi", "250Mi") - queryFrontend_max_memory = try(var.loki.query_frontend.max_memory != null ? var.loki.query_frontend.max_memory : "null", "null") - queryFrontend_min_cpu = try(var.loki.query_frontend.min_cpu != null ? var.loki.query_frontend.min_cpu : "null", "null") - queryFrontend_max_cpu = try(var.loki.query_frontend.max_cpu != null ? var.loki.query_frontend.max_cpu : "null", "null") - queryFrontend_autoscaling = try(var.loki.queryFrontend.autoscaling != null ? var.loki.queryFrontend.autoscaling : "true", "true") - queryFrontend_max_replicas = try(var.loki.queryFrontend.max_replicas != null ? var.loki.queryFrontend.max_replicas : "6", "6") - queryFrontend_min_replicas = try(var.loki.queryFrontend.min_replicas != null ? var.loki.queryFrontend.min_replicas : "1", "1") - queryFrontend_memory_utilization= try(var.loki.queryFrontend.memory_utilization != null ? var.loki.queryFrontend.memory_utilization : "", "") - queryFrontend_cpu_utilization= try(var.loki.queryFrontend.cpu_utilization != null ? var.loki.queryFrontend.cpu_utilization : "", "") - } -} - resource "helm_release" "loki" { count = local.enable_loki ? 1 : 0 name = "loki" @@ -118,7 +130,7 @@ resource "helm_release" "loki" { version = "0.69.16" values = [ - data.template_file.loki_template[0].rendered + local.loki_template ] depends_on = [kubernetes_secret.loki-google-credentials] diff --git a/observability/gcp/mimir.tf b/observability/gcp/mimir.tf index 98ad6be3..a69a55e5 100644 --- a/observability/gcp/mimir.tf +++ b/observability/gcp/mimir.tf @@ -1,10 +1,95 @@ +locals { + mimir_template = local.enable_mimir ? templatefile("${path.module}/templates/mimir-values.yaml", { + data_bucket_name = google_storage_bucket.mimir_data[0].id + cluster_name = local.cluster_name + limits_ingestion_rate = try(var.mimir.limits.ingestion_rate != null ? 
var.mimir.limits.ingestion_rate : "500000", "500000") + limits_ingestion_burst_size = try(var.mimir.limits.ingestion_burst_size != null ? var.mimir.limits.ingestion_burst_size : "1000000", "1000000") + limits_max_fetched_chunks_per_query = try(var.mimir.limits.max_fetched_chunks_per_query != null ? var.mimir.limits.max_fetched_chunks_per_query : "5000000", "5000000") + limits_max_cache_freshness = try(var.mimir.limits.max_cache_freshness != null ? var.mimir.limits.max_cache_freshness : "12h", "12h") + limits_max_outstanding_requests_per_tenant = try(var.mimir.limits.max_outstanding_requests_per_tenant != null ? var.mimir.limits.max_outstanding_requests_per_tenant : "2000", "2000") + + # Compactor configuration + compactor_replicas = try(var.mimir.compactor.replicas != null ? var.mimir.compactor.replicas : "2", "2") + compactor_persistence_volume_enable = try(var.mimir.compactor.persistence_volume.enable != null ? var.mimir.compactor.persistence_volume.enable : "true", "true") + compactor_persistence_volume_size = try(var.mimir.compactor.persistence_volume.size != null ? var.mimir.compactor.persistence_volume.size : "50Gi", "50Gi") + compactor_min_cpu = try(var.mimir.compactor.min_cpu != null ? var.mimir.compactor.min_cpu : "null", "null") + compactor_min_memory = try(var.mimir.compactor.min_memory != null ? var.mimir.compactor.min_memory : "null", "null") + compactor_max_cpu = try(var.mimir.compactor.max_cpu != null ? var.mimir.compactor.max_cpu : "null", "null") + compactor_max_memory = try(var.mimir.compactor.max_memory != null ? var.mimir.compactor.max_memory : "null", "null") + + # Ingester configuration + ingester_replicas = try(var.mimir.ingester.replicas != null ? var.mimir.ingester.replicas : "2", "2") + ingester_persistence_volume_size = try(var.mimir.ingester.persistence_volume.size != null ? var.mimir.ingester.persistence_volume.size : "100Gi", "100Gi") + ingester_min_memory = try(var.mimir.ingester.min_memory != null ? 
var.mimir.ingester.min_memory : "null", "null") + ingester_min_cpu = try(var.mimir.ingester.min_cpu != null ? var.mimir.ingester.min_cpu : "null", "null") + ingester_max_memory = try(var.mimir.ingester.max_memory != null ? var.mimir.ingester.max_memory : "null", "null") + ingester_max_cpu = try(var.mimir.ingester.max_cpu != null ? var.mimir.ingester.max_cpu : "null", "null") + + # Querier configuration + querier_replicas = try(var.mimir.querier.replicas != null ? var.mimir.querier.replicas : "2", "2") + querier_min_memory = try(var.mimir.querier.min_memory != null ? var.mimir.querier.min_memory : "null", "null") + querier_min_cpu = try(var.mimir.querier.min_cpu != null ? var.mimir.querier.min_cpu : "null", "null") + querier_max_memory = try(var.mimir.querier.max_memory != null ? var.mimir.querier.max_memory : "null", "null") + querier_max_cpu = try(var.mimir.querier.max_cpu != null ? var.mimir.querier.max_cpu : "null", "null") + + # Query Frontend configuration + query_frontend_replicas = try(var.mimir.query_frontend.replicas != null ? var.mimir.query_frontend.replicas : "2", "2") + + # Store gateway configuration + store_gateway_replication_factor = try(var.mimir.store_gateway.replication_factor != null ? var.mimir.store_gateway.replication_factor : "3", "3") + store_gateway_replicas = try(var.mimir.store_gateway.replicas != null ? var.mimir.store_gateway.replicas : "2", "2") + store_gateway_persistence_volume_size = try(var.mimir.store_gateway.persistence_volume.size != null ? var.mimir.store_gateway.persistence_volume.size : "50Gi", "50Gi") + store_gateway_min_memory = try(var.mimir.store_gateway.min_memory != null ? var.mimir.store_gateway.min_memory : "null", "null") + store_gateway_min_cpu = try(var.mimir.store_gateway.min_cpu != null ? var.mimir.store_gateway.min_cpu : "null", "null") + store_gateway_max_memory = try(var.mimir.store_gateway.max_memory != null ? 
var.mimir.store_gateway.max_memory : "null", "null") + store_gateway_max_cpu = try(var.mimir.store_gateway.max_cpu != null ? var.mimir.store_gateway.max_cpu : "null", "null") + + # Distributor configuration + distributor_replicas = try(var.mimir.distributor.replicas != null ? var.mimir.distributor.replicas : "2", "2") + distributor_min_memory = try(var.mimir.distributor.min_memory != null ? var.mimir.distributor.min_memory : "null", "null") + distributor_min_cpu = try(var.mimir.distributor.min_cpu != null ? var.mimir.distributor.min_cpu : "null", "null") + distributor_max_memory = try(var.mimir.distributor.max_memory != null ? var.mimir.distributor.max_memory : "null", "null") + distributor_max_cpu = try(var.mimir.distributor.max_cpu != null ? var.mimir.distributor.max_cpu : "null", "null") + + # Cache configuration + chunks_cache_enabled = try(var.mimir.caches.chunks.enabled != null ? var.mimir.caches.chunks.enabled : "true", "true") + chunks_cache_replicas = try(var.mimir.caches.chunks.replicas != null ? var.mimir.caches.chunks.replicas : 1, 1) + chunks_cache_max_item_memory = try(var.mimir.caches.chunks.max_item_memory != null ? var.mimir.caches.chunks.max_item_memory : 1, 1) + chunks_cache_max_item_memory_mb = try(var.mimir.caches.chunks.max_item_memory != null ? var.mimir.caches.chunks.max_item_memory * 1024 * 1024 : 1048576, 1048576) + chunks_cache_connection_limit = try(var.mimir.caches.chunks.connection_limit != null ? var.mimir.caches.chunks.connection_limit : 16384, 16384) + + index_cache_enabled = try(var.mimir.caches.index.enabled != null ? var.mimir.caches.index.enabled : "true", "true") + index_cache_replicas = try(var.mimir.caches.index.replicas != null ? var.mimir.caches.index.replicas : 1, 1) + index_cache_max_item_memory = try(var.mimir.caches.index.max_item_memory != null ? var.mimir.caches.index.max_item_memory : 5, 5) + index_cache_max_item_memory_mb = try(var.mimir.caches.index.max_item_memory != null ? 
var.mimir.caches.index.max_item_memory * 1024 * 1024 : 5242880, 5242880) + index_cache_connection_limit = try(var.mimir.caches.index.connection_limit != null ? var.mimir.caches.index.connection_limit : 16384, 16384) + + metadata_cache_enabled = try(var.mimir.caches.metadata.enabled != null ? var.mimir.caches.metadata.enabled : "true", "true") + metadata_cache_replicas = try(var.mimir.caches.metadata.replicas != null ? var.mimir.caches.metadata.replicas : 2, 2) + metadata_cache_max_item_memory = try(var.mimir.caches.metadata.max_item_memory != null ? var.mimir.caches.metadata.max_item_memory : 1, 1) + metadata_cache_max_item_memory_mb = try(var.mimir.caches.metadata.max_item_memory != null ? var.mimir.caches.metadata.max_item_memory * 1024 * 1024 : 1048576, 1048576) + metadata_cache_connection_limit = try(var.mimir.caches.metadata.connection_limit != null ? var.mimir.caches.metadata.connection_limit : 16384, 16384) + + # Authentication + mimir_basic_auth_username = random_password.mimir_basic_auth_username[0].result + mimir_basic_auth_password = random_password.mimir_basic_auth_password[0].result + }) : null +} + resource "google_storage_bucket" "mimir_data" { count = local.enable_mimir ? 1 : 0 name = "${local.cluster_name}-mimir-block-data-${var.observability_suffix}" location = var.app_region project = var.project_id - force_destroy = true + force_destroy = false labels = var.labels + + uniform_bucket_level_access = true + public_access_prevention = "enforced" + + lifecycle { + prevent_destroy = true + } } resource "google_service_account" "mimir_svc_acc" { @@ -77,69 +162,6 @@ resource "kubernetes_secret" "mimir-google-credentials" { type = "Opaque" } -data "template_file" "mimir_template" { - count = local.enable_mimir ? 
1 : 0 - template = file("${path.module}/templates/mimir-values.yaml") - - vars = { - data_bucket_name = google_storage_bucket.mimir_data[0].id - cluster_name = local.cluster_name - limits_ingestion_rate = try(var.mimir.limits.ingestion_rate != null ? var.mimir.limits.ingestion_rate : "500000", "500000") - limits_ingestion_burst_size = try(var.mimir.limits.ingestion_burst_size != null ? var.mimir.limits.ingestion_burst_size : "1000000", "1000000") - limits_max_fetched_chunks_per_query = try(var.mimir.limits.max_fetched_chunks_per_query != null ? var.mimir.limits.max_fetched_chunks_per_query : "5000000", "5000000") - limits_max_cache_freshness = try(var.mimir.limits.max_cache_freshness != null ? var.mimir.limits.max_cache_freshness : "12h", "12h") - limits_max_outstanding_requests_per_tenant = try(var.mimir.limits.max_outstanding_requests_per_tenant != null ? var.mimir.limits.max_outstanding_requests_per_tenant : "2000", "2000") - compactor_replicas = try(var.mimir.compactor.replicas != null ? var.mimir.compactor.replicas : "2", "2") - compactor_persistence_volume_enable = try(var.mimir.compactor.persistence_volume.enable != null ? var.mimir.compactor.persistence_volume.enable : "true", "true") - compactor_persistence_volume_size = try(var.mimir.compactor.persistence_volume.size != null ? var.mimir.compactor.persistence_volume.size : "50Gi", "50Gi") - compactor_min_cpu = try(var.mimir.compactor.min_cpu != null ? var.mimir.compactor.min_cpu : "null", "null") - compactor_min_memory = try(var.mimir.compactor.min_memory != null ? var.mimir.compactor.min_memory : "null", "null") - compactor_max_cpu = try(var.mimir.compactor.max_cpu != null ? var.mimir.compactor.max_cpu : "null", "null") - compactor_max_memory = try(var.mimir.compactor.max_memory != null ? var.mimir.compactor.max_memory : "null", "null") - ingester_replicas = try(var.mimir.ingester.replicas != null ? 
var.mimir.ingester.replicas : "2", "2") - ingester_persistence_volume_size = try(var.mimir.ingester.persistence_volume.size != null ? var.mimir.ingester.persistence_volume.size : "100Gi", "100Gi") - ingester_min_memory = try(var.mimir.ingester.min_memory != null ? var.mimir.ingester.min_memory : "null", "null") - ingester_min_cpu = try(var.mimir.ingester.min_cpu != null ? var.mimir.ingester.min_cpu : "null", "null") - ingester_max_memory = try(var.mimir.ingester.max_memory != null ? var.mimir.ingester.max_memory : "null", "null") - ingester_max_cpu = try(var.mimir.ingester.max_cpu != null ? var.mimir.ingester.max_cpu : "null", "null") - querier_replicas = try(var.mimir.querier.replicas != null ? var.mimir.querier.replicas : "2", "2") - querier_min_memory = try(var.mimir.querier.min_memory != null ? var.mimir.querier.min_memory : "null", "null") - querier_min_cpu = try(var.mimir.querier.min_cpu != null ? var.mimir.querier.min_cpu : "null", "null") - querier_max_memory = try(var.mimir.querier.max_memory != null ? var.mimir.querier.max_memory : "null", "null") - querier_max_cpu = try(var.mimir.querier.max_cpu != null ? var.mimir.querier.max_cpu : "null", "null") - query_frontend_replicas = try(var.mimir.query_frontend.replicas != null ? var.mimir.query_frontend.replicas : "2", "2") - store_gateway_replication_factor = try(var.mimir.store_gateway.replication_factor != null ? var.mimir.store_gateway.replication_factor : "3", "3") - store_gateway_replicas = try(var.mimir.store_gateway.replicas != null ? var.mimir.store_gateway.replicas : "2", "2") - store_gateway_persistence_volume_size = try(var.mimir.store_gateway.persistence_volume.size != null ? var.mimir.store_gateway.persistence_volume.size : "50Gi", "50Gi") - store_gateway_min_memory = try(var.mimir.store_gateway.min_memory != null ? var.mimir.store_gateway.min_memory : "null", "null") - store_gateway_min_cpu = try(var.mimir.store_gateway.min_cpu != null ? 
var.mimir.store_gateway.min_cpu : "null", "null") - store_gateway_max_memory = try(var.mimir.store_gateway.max_memory != null ? var.mimir.store_gateway.max_memory : "null", "null") - store_gateway_max_cpu = try(var.mimir.store_gateway.max_cpu != null ? var.mimir.store_gateway.max_cpu : "null", "null") - distributor_replicas = try(var.mimir.distributor.replicas != null ? var.mimir.distributor.replicas : "2", "2") - distributor_min_memory = try(var.mimir.distributor.min_memory != null ? var.mimir.distributor.min_memory : "null", "null") - distributor_min_cpu = try(var.mimir.distributor.min_cpu != null ? var.mimir.distributor.min_cpu : "null", "null") - distributor_max_memory = try(var.mimir.distributor.max_memory != null ? var.mimir.distributor.max_memory : "null", "null") - distributor_max_cpu = try(var.mimir.distributor.max_cpu != null ? var.mimir.distributor.max_cpu : "null", "null") - chunks_cache_enabled = try(var.mimir.caches.chunks.enabled != null ? var.mimir.caches.chunks.enabled : "true", "true") - chunks_cache_replicas = try(var.mimir.caches.chunks.replicas != null ? var.mimir.caches.chunks.replicas : 1, 1) - chunks_cache_max_item_memory = try(var.mimir.caches.chunks.max_item_memory != null ? var.mimir.caches.chunks.max_item_memory : 1, 1) - chunks_cache_max_item_memory_mb = try(var.mimir.caches.chunks.max_item_memory != null ? var.mimir.caches.chunks.max_item_memory * 1024 * 1024 : 1048576, 1048576) - chunks_cache_connection_limit = try(var.mimir.caches.chunks.connection_limit != null ? var.mimir.caches.chunks.connection_limit : 16384, 16384) - index_cache_enabled = try(var.mimir.caches.index.enabled != null ? var.mimir.caches.index.enabled : "true", "true") - index_cache_replicas = try(var.mimir.caches.index.replicas != null ? var.mimir.caches.index.replicas : 1, 1) - index_cache_max_item_memory = try(var.mimir.caches.index.max_item_memory != null ? 
var.mimir.caches.index.max_item_memory : 5, 5) - index_cache_max_item_memory_mb = try(var.mimir.caches.index.max_item_memory != null ? var.mimir.caches.index.max_item_memory * 1024 * 1024 : 5242880, 5242880) - index_cache_connection_limit = try(var.mimir.caches.index.connection_limit != null ? var.mimir.caches.index.connection_limit : 16384, 16384) - metadata_cache_enabled = try(var.mimir.caches.metadata.enabled != null ? var.mimir.caches.metadata.enabled : "true", "true") - metadata_cache_replicas = try(var.mimir.caches.metadata.replicas != null ? var.mimir.caches.metadata.replicas : 2, 2) - metadata_cache_max_item_memory = try(var.mimir.caches.metadata.max_item_memory != null ? var.mimir.caches.metadata.max_item_memory : 1, 1) - metadata_cache_max_item_memory_mb = try(var.mimir.caches.metadata.max_item_memory != null ? var.mimir.caches.metadata.max_item_memory * 1024 * 1024 : 1048576, 1048576) - metadata_cache_connection_limit = try(var.mimir.caches.metadata.connection_limit != null ? var.mimir.caches.metadata.connection_limit : 16384, 16384) - mimir_basic_auth_username = random_password.mimir_basic_auth_username[0].result - mimir_basic_auth_password = random_password.mimir_basic_auth_password[0].result - } -} - resource "helm_release" "mimir" { count = local.enable_mimir ? 
1 : 0 name = "mimir" @@ -148,7 +170,7 @@ resource "helm_release" "mimir" { namespace = kubernetes_namespace.app_environments["mimir"].metadata[0].name version = "5.1.3" values = [ - data.template_file.mimir_template[0].rendered + local.mimir_template ] depends_on = [ diff --git a/observability/gcp/openobserve.tf b/observability/gcp/openobserve.tf index 1620abdb..989495e3 100644 --- a/observability/gcp/openobserve.tf +++ b/observability/gcp/openobserve.tf @@ -5,8 +5,15 @@ resource "google_storage_bucket" "openobserve_data" { name = "${local.cluster_name}-openobserve-${each.value.name}-${var.observability_suffix}" location = var.app_region project = var.project_id - force_destroy = true + force_destroy = false labels = var.labels + + uniform_bucket_level_access = true + public_access_prevention = "enforced" + + lifecycle { + prevent_destroy = true + } } # Create service account for OpenObserve @@ -74,27 +81,27 @@ resource "random_password" "openobserve_password" { } # Create template for OpenObserve values -data "template_file" "openobserve_template" { - for_each = local.enable_openobserve ? { for instance in var.openobserve : instance.name => instance if instance.enable } : {} - - template = file("${path.module}/templates/openobserve-values.yaml") - vars = { - replica_count = try(each.value.replicaCount, 1) - cpu_request = "250m" - memory_request = "1Gi" - cpu_limit = "1" - memory_limit = "2Gi" - storage_provider = "gcs" - storage_region = "auto" - storage_bucket_name = google_storage_bucket.openobserve_data[each.key].name - storage_access_key_path = "/app/key.json" - secret_name = "openobserve-gcs-creds-${each.value.name}" - root_user_email = "admin@zop.dev" - root_user_password = random_password.openobserve_password[each.key].result - additional_env_vars = length(try(each.value.env, [])) > 0 ? 
join("\n", [for env in each.value.env : " - name: ${env.name}\n value: \"${env.value}\""]) : "" +locals { + openobserve_template = { + for inst in var.openobserve : inst.name => inst.enable ? templatefile("${path.module}/templates/openobserve-values.yaml", { + replica_count = try(inst.replicaCount, 1) + cpu_request = "250m" + memory_request = "1Gi" + cpu_limit = "1" + memory_limit = "2Gi" + storage_provider = "gcs" + storage_region = "auto" + storage_bucket_name = google_storage_bucket.openobserve_data[inst.name].name + storage_access_key_path = "/app/key.json" + secret_name = "openobserve-gcs-creds-${inst.name}" + root_user_email = "admin@zop.dev" + root_user_password = random_password.openobserve_password[inst.name].result + additional_env_vars = length(try(inst.env, [])) > 0 ? join("\n", [for env in inst.env : " - name: ${env.name}\n value: \"${env.value}\""]) : "" + }) : null } } + # Deploy OpenObserve using Helm resource "helm_release" "openobserve" { for_each = local.enable_openobserve ? { for instance in var.openobserve : instance.name => instance if instance.enable } : {} @@ -106,7 +113,7 @@ resource "helm_release" "openobserve" { namespace = kubernetes_namespace.app_environments["openobserve"].metadata[0].name values = [ - data.template_file.openobserve_template[each.key].rendered + local.openobserve_template[each.key] ] depends_on = [ diff --git a/observability/gcp/outputs.tf b/observability/gcp/outputs.tf index f8fe44bc..a673d86f 100644 --- a/observability/gcp/outputs.tf +++ b/observability/gcp/outputs.tf @@ -26,13 +26,16 @@ output "cortex_host_url" { value = local.enable_cortex ? (local.enable_ingress_cortex ? kubernetes_ingress_v1.service_ingress["cortex-distributor:8080-cortex"].spec[0].rule[0].host : "cortex-distributor.cortex:8080") : "" } -output "openobserve_host_urls" { +output "openobserve_instances" { + description = "OpenObserve instances with URL, username, and password grouped together" value = local.enable_openobserve ? 
{ - for instance in var.openobserve : instance.name => ( - (instance.enable_ingress == null || instance.enable_ingress == true) ? - kubernetes_ingress_v1.service_ingress["${instance.name}-openobserve-standalone:5080-openobserve"].spec[0].rule[0].host : - "${instance.name}.openobserve:5080" - ) if instance.enable + for instance in var.openobserve : instance.name => { + name = instance.name + url = (instance.enable_ingress == null || instance.enable_ingress == true) ? kubernetes_ingress_v1.service_ingress["${instance.name}-openobserve-standalone:5080-openobserve"].spec[0].rule[0].host : "${instance.name}.openobserve:5080" + username = "admin@zop.dev" + password = random_password.openobserve_password[instance.name].result + } if instance.enable } : {} + sensitive = true } diff --git a/observability/gcp/tempo.tf b/observability/gcp/tempo.tf index 30208ae4..3104df0c 100644 --- a/observability/gcp/tempo.tf +++ b/observability/gcp/tempo.tf @@ -7,6 +7,50 @@ locals { value = remote.header.value } ], {}) + + tempo_values = local.enable_tempo ? templatefile("${path.module}/templates/tempo-values.yaml", { + cluster_name = local.cluster_name + data_bucket_name = google_storage_bucket.tempo_data[0].id + service_account = google_service_account.tempo_svc_acc[0].email + max_receiver_msg_size = try(var.tempo.max_receiver_msg_size != null ? var.tempo.max_receiver_msg_size : "4700000", "4700000") + + # Ingester + ingester_replicas = try(var.tempo.ingester.replicas != null ? var.tempo.ingester.replicas : "1", "1") + ingester_min_memory = try(var.tempo.ingester.min_memory != null ? var.tempo.ingester.min_memory : "1Gi", "1Gi") + ingester_max_memory = try(var.tempo.ingester.max_memory != null ? var.tempo.ingester.max_memory : "null", "null") + ingester_min_cpu = try(var.tempo.ingester.min_cpu != null ? var.tempo.ingester.min_cpu : "null", "null") + ingester_max_cpu = try(var.tempo.ingester.max_cpu != null ? 
var.tempo.ingester.max_cpu : "null", "null") + ingester_autoscaling = try(var.tempo.ingester.autoscaling != null ? var.tempo.ingester.autoscaling : "true", "true") + ingester_min_replicas = try(var.tempo.ingester.min_replicas != null ? var.tempo.ingester.min_replicas : "2", "2") + ingester_max_replicas = try(var.tempo.ingester.max_replicas != null ? var.tempo.ingester.max_replicas : "30", "30") + ingester_memory_utilization = try(var.tempo.ingester.memory_utilization != null ? var.tempo.ingester.memory_utilization : "", "") + ingester_cpu_utilization = try(var.tempo.ingester.cpu_utilization != null ? var.tempo.ingester.cpu_utilization : "", "") + + # Distributor + distributor_replicas = try(var.tempo.distributor.replicas != null ? var.tempo.distributor.replicas : "1", "1") + distributor_min_memory = try(var.tempo.distributor.min_memory != null ? var.tempo.distributor.min_memory : "750Mi", "750Mi") + distributor_max_memory = try(var.tempo.distributor.max_memory != null ? var.tempo.distributor.max_memory : "null", "null") + distributor_min_cpu = try(var.tempo.distributor.min_cpu != null ? var.tempo.distributor.min_cpu : "null", "null") + distributor_max_cpu = try(var.tempo.distributor.max_cpu != null ? var.tempo.distributor.max_cpu : "null", "null") + distributor_autoscaling = try(var.tempo.distributor.autoscaling != null ? var.tempo.distributor.autoscaling : "true", "true") + distributor_min_replicas = try(var.tempo.distributor.min_replicas != null ? var.tempo.distributor.min_replicas : "2", "2") + distributor_max_replicas = try(var.tempo.distributor.max_replicas != null ? var.tempo.distributor.max_replicas : "30", "30") + distributor_memory_utilization = try(var.tempo.distributor.memory_utilization != null ? var.tempo.distributor.memory_utilization : "", "") + distributor_cpu_utilization = try(var.tempo.distributor.cpu_utilization != null ? 
var.tempo.distributor.cpu_utilization : "", "") + + # Querier & Query Frontend + querier_replicas = try(var.tempo.querier.replicas != null ? var.tempo.querier.replicas : "1", "1") + queryFrontend_replicas = try(var.tempo.queryFrontend.replicas != null ? var.tempo.queryFrontend.replicas : "1", "1") + + # Metrics Generator + metrics_generator_enable = try(var.tempo.metrics_generator.enable != null ? var.tempo.metrics_generator.enable : false, false) + metrics_generator_replicas = try(var.tempo.metrics_generator.replicas != null ? var.tempo.metrics_generator.replicas : "1", "1") + metrics_generator_service_graphs_max_items = try(var.tempo.metrics_generator.service_graphs_max_items != null ? var.tempo.metrics_generator.service_graphs_max_items : "30000", "30000") + metrics_generator_service_graphs_wait = try(var.tempo.metrics_generator.service_graphs_wait != null ? var.tempo.metrics_generator.service_graphs_wait : "30s", "30s") + metrics_generator_remote_write_flush_deadline = try(var.tempo.metrics_generator.remote_write_flush_deadline != null ? var.tempo.metrics_generator.remote_write_flush_deadline : "2m", "2m") + metrics_generator_remote_write = jsonencode(local.remote_write_config) + metrics_generator_metrics_ingestion_time_range_slack = try(var.tempo.metrics_generator.metrics_ingestion_time_range_slack != null ? 
var.tempo.metrics_generator.metrics_ingestion_time_range_slack : "40s", "40s") + }) : null } resource "google_storage_bucket" "tempo_data" { @@ -14,8 +58,15 @@ resource "google_storage_bucket" "tempo_data" { name = "${local.cluster_name}-tempo-data-${var.observability_suffix}" location = var.app_region project = var.project_id - force_destroy = true + force_destroy = false labels = var.labels + + uniform_bucket_level_access = true + public_access_prevention = "enforced" + + lifecycle { + prevent_destroy = true + } } resource "google_project_iam_member" "tempo-k8s-service-account" { @@ -68,46 +119,6 @@ resource "kubernetes_secret" "tempo-google-credentials" { type = "Opaque" } -data "template_file" "tempo_template"{ - count = local.enable_tempo ? 1 : 0 - template = file("${path.module}/templates/tempo-values.yaml") - vars = { - cluster_name = local.cluster_name - data_bucket_name = google_storage_bucket.tempo_data[0].id - service_account = google_service_account.tempo_svc_acc[0].email - max_receiver_msg_size = try(var.tempo.max_receiver_msg_size != null ? var.tempo.max_receiver_msg_size : "4700000","4700000") - ingester_replicas = try(var.tempo.ingester.replicas != null ? var.tempo.ingester.replicas : "1", "1") - ingester_min_memory = try(var.tempo.ingester.min_memory != null ? var.tempo.ingester.min_memory : "1Gi", "1Gi") - ingester_max_memory = try(var.tempo.ingester.max_memory != null ? var.tempo.ingester.max_memory : "null", "null") - ingester_min_cpu = try(var.tempo.ingester.min_cpu != null ? var.tempo.ingester.min_cpu : "null", "null") - ingester_max_cpu = try(var.tempo.ingester.max_cpu != null ? var.tempo.ingester.max_cpu : "null", "null") - ingester_autoscaling = try(var.tempo.ingester.autoscaling != null ? var.tempo.ingester.autoscaling : "true", "true") - ingester_min_replicas = try(var.tempo.ingester.min_replicas != null ? var.tempo.ingester.min_replicas : "2", "2") - ingester_max_replicas = try(var.tempo.ingester.max_replicas != null ? 
var.tempo.ingester.max_replicas : "30", "30") - ingester_memory_utilization = try(var.tempo.ingester.memory_utilization != null ? var.tempo.ingester.memory_utilization : "", "") - ingester_cpu_utilization = try(var.tempo.ingester.cpu_utilization != null ? var.tempo.ingester.cpu_utilization : "", "") - distributor_replicas = try(var.tempo.distributor.replicas != null ? var.tempo.distributor.replicas : "1", "1") - distributor_min_memory = try(var.tempo.distributor.min_memory != null ? var.tempo.distributor.min_memory : "750Mi", "750Mi") - distributor_max_memory = try(var.tempo.distributor.max_memory != null ? var.tempo.distributor.max_memory : "null", "null") - distributor_min_cpu = try(var.tempo.distributor.min_cpu != null ? var.tempo.distributor.min_cpu : "null", "null") - distributor_max_cpu = try(var.tempo.distributor.max_cpu != null ? var.tempo.distributor.max_cpu : "null", "null") - distributor_autoscaling = try(var.tempo.distributor.autoscaling != null ? var.tempo.distributor.autoscaling : "true", "true") - distributor_min_replicas = try(var.tempo.distributor.min_replicas != null ? var.tempo.distributor.min_replicas : "2", "2") - distributor_max_replicas = try(var.tempo.distributor.max_replicas != null ? var.tempo.distributor.max_replicas : "30", "30") - distributor_memory_utilization = try(var.tempo.distributor.memory_utilization != null ? var.tempo.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.tempo.distributor.cpu_utilization != null ? var.tempo.distributor.cpu_utilization : "","") - querier_replicas = try(var.tempo.querier.replicas != null ? var.tempo.querier.replicas : "1", "1") - queryFrontend_replicas = try(var.tempo.queryFrontend.replicas != null ? var.tempo.queryFrontend.replicas : "1", "1") - metrics_generator_enable = try(var.tempo.metrics_generator.enable != null ? var.tempo.metrics_generator.enable : false, false) - metrics_generator_replicas = try(var.tempo.metrics_generator.replicas != null ? 
var.tempo.metrics_generator.replicas : "1", "1") - metrics_generator_service_graphs_max_items = try(var.tempo.metrics_generator.service_graphs_max_items != null ? var.tempo.metrics_generator.service_graphs_max_items : "30000", "30000") - metrics_generator_service_graphs_wait = try(var.tempo.metrics_generator.service_graphs_wait != null ? var.tempo.metrics_generator.service_graphs_wait : "30s", "30s") - metrics_generator_remote_write_flush_deadline = try(var.tempo.metrics_generator.remote_write_flush_deadline != null ? var.tempo.metrics_generator.remote_write_flush_deadline : "2m", "2m") - metrics_generator_remote_write = jsonencode(local.remote_write_config) - metrics_generator_metrics_ingestion_time_range_slack = try(var.tempo.metrics_generator.metrics_ingestion_time_range_slack != null ? var.tempo.metrics_generator.metrics_ingestion_time_range_slack : "40s", "40s") - } -} - resource "helm_release" "tempo" { count = local.enable_tempo ? 1 : 0 name = "tempo" @@ -117,6 +128,6 @@ resource "helm_release" "tempo" { version = "1.38.0" values = [ - data.template_file.tempo_template[0].rendered + local.tempo_values ] } diff --git a/observability/oci/cortex.tf b/observability/oci/cortex.tf index 3062ce6f..705a08dc 100644 --- a/observability/oci/cortex.tf +++ b/observability/oci/cortex.tf @@ -1,3 +1,45 @@ +locals { + cortex_values = local.enable_cortex ? 
templatefile("${path.module}/templates/cortex-values.yaml", { + BUCKET_NAME = oci_objectstorage_bucket.cortex_data[0].name + OCI_SECRET = var.access_secret + OCI_KEY = var.access_key + CLUSTER_NAME = local.cluster_name + APP_REGION = var.app_region + TENANCY_NAMESPACE = var.tenancy_namespace + + limits_ingestion_rate = try(var.cortex.limits.ingestion_rate, "250000") + limits_ingestion_burst_size = try(var.cortex.limits.ingestion_burst_size, "500000") + limits_max_series_per_metric = try(var.cortex.limits.max_series_per_metric, "0") + limits_max_series_per_user = try(var.cortex.limits.max_series_per_user, "0") + limits_max_fetched_chunks_per_query = try(var.cortex.limits.max_fetched_chunks_per_query, "3000000") + + compactor_enable = try(var.cortex.compactor.enable, "true") + compactor_replicas = try(var.cortex.compactor.replicas, "1") + compactor_persistence_volume_enable = try(var.cortex.compactor.persistence_volume.enable, "true") + compactor_persistence_volume_size = try(var.cortex.compactor.persistence_volume.size, "20Gi") + compactor_min_cpu = try(var.cortex.compactor.min_cpu, null) + compactor_min_memory = try(var.cortex.compactor.min_memory, null) + compactor_max_cpu = try(var.cortex.compactor.max_cpu, null) + compactor_max_memory = try(var.cortex.compactor.max_memory, null) + + ingester_replicas = try(var.cortex.ingester.replicas, "1") + ingester_autoscaling = try(var.cortex.ingester.autoscaling, "true") + ingester_min_replicas = try(var.cortex.ingester.min_replicas, "2") + ingester_max_replicas = try(var.cortex.ingester.max_replicas, "100") + ingester_memory_utilization = try(var.cortex.ingester.memory_utilization, "") + ingester_cpu_utilization = try(var.cortex.ingester.cpu_utilization, "") + + querier_replicas = try(var.cortex.querier.replicas, "1") + querier_autoscaling = try(var.cortex.querier.autoscaling, "true") + querier_min_replicas = try(var.cortex.querier.min_replicas, "2") + querier_max_replicas = try(var.cortex.querier.max_replicas, "20") + 
querier_memory_utilization = try(var.cortex.querier.memory_utilization, "") + querier_cpu_utilization = try(var.cortex.querier.cpu_utilization, "") + querier_min_memory = try(var.cortex.querier.min_memory, "null") + querier_min_cpu = try(var.cortex.querier.min_cpu, "null") + querier_max_memory = try(var.cortex.querier.max_memory, "null") + querier_max_cpu = try(var.cortex.querier.max_cpu, "null") + + query_range_memcached_client_timeout = try(var.cortex.query_range.memcached_client_timeout, "30s") + ingester_persistence_volume_size = try(var.cortex.ingester.persistence_volume.size, "20Gi") + ingester_min_memory = try(var.cortex.ingester.min_memory, "null") + ingester_min_cpu = try(var.cortex.ingester.min_cpu, "null") + ingester_max_memory = try(var.cortex.ingester.max_memory, "null") + ingester_max_cpu = try(var.cortex.ingester.max_cpu, "null") + + query_frontend_replicas = try(var.cortex.query_frontend.replicas, "4") + query_frontend_enable = try(var.cortex.query_frontend.enable, "true") + + store_gateway_replication_factor = try(var.cortex.store_gateway.replication_factor, "3") + store_gateway_replicas = try(var.cortex.store_gateway.replicas, "1") + store_gateway_persistence_volume_size = try(var.cortex.store_gateway.persistence_volume.size, "500Gi") + store_gateway_min_memory = try(var.cortex.store_gateway.min_memory, "null") + store_gateway_min_cpu = try(var.cortex.store_gateway.min_cpu, "null") + store_gateway_max_memory = try(var.cortex.store_gateway.max_memory, "null") + store_gateway_max_cpu = try(var.cortex.store_gateway.max_cpu, "null") + + memcached_frontend_enable = try(var.cortex.memcached_frontend.enable, "true") + memcached_frontend_min_memory = try(var.cortex.memcached_frontend.min_memory, "null") + memcached_frontend_min_cpu = try(var.cortex.memcached_frontend.min_cpu, "null") + memcached_frontend_max_memory = try(var.cortex.memcached_frontend.max_memory, "null") + memcached_frontend_max_cpu = try(var.cortex.memcached_frontend.max_cpu, "null") + memcached_blocks_index_enable = try(var.cortex.memcached_blocks_index.enable, "true") + memcached_blocks_index_min_cpu = try(var.cortex.memcached_blocks_index.min_cpu, "null") + memcached_blocks_index_min_memory = try(var.cortex.memcached_blocks_index.min_memory, "null") + memcached_blocks_index_max_cpu = try(var.cortex.memcached_blocks_index.max_cpu, "null") + memcached_blocks_index_max_memory = try(var.cortex.memcached_blocks_index.max_memory, "null") + memcached_blocks_enable = try(var.cortex.memcached_blocks.enable, "true") + memcached_blocks_min_memory = try(var.cortex.memcached_blocks.min_memory, "null") + memcached_blocks_min_cpu = try(var.cortex.memcached_blocks.min_cpu, "null") + memcached_blocks_max_memory = try(var.cortex.memcached_blocks.max_memory, "null") + memcached_blocks_max_cpu = try(var.cortex.memcached_blocks.max_cpu, "null") + memcached_blocks_metadata_enable = try(var.cortex.memcached_blocks_metadata.enable, "true") + memcached_blocks_metadata_min_memory = try(var.cortex.memcached_blocks_metadata.min_memory, "null") + memcached_blocks_metadata_min_cpu = try(var.cortex.memcached_blocks_metadata.min_cpu, "null") + memcached_blocks_metadata_max_memory = try(var.cortex.memcached_blocks_metadata.max_memory, "null") + memcached_blocks_metadata_max_cpu = try(var.cortex.memcached_blocks_metadata.max_cpu, "null") + + distributor_replicas = try(var.cortex.distributor.replicas, "1") + distributor_min_memory = try(var.cortex.distributor.min_memory, "null") + distributor_min_cpu = try(var.cortex.distributor.min_cpu, "null") + distributor_max_memory = try(var.cortex.distributor.max_memory, "null") + distributor_max_cpu = try(var.cortex.distributor.max_cpu, "null") + distributor_autoscaling = try(var.cortex.distributor.autoscaling, "true") + distributor_max_replicas = try(var.cortex.distributor.max_replicas, "30") + distributor_min_replicas = try(var.cortex.distributor.min_replicas, "2") + distributor_memory_utilization = try(var.cortex.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.cortex.distributor.cpu_utilization, "") + }) : null +} + resource "oci_objectstorage_bucket" "cortex_data" { count = local.enable_cortex ? 1 : 0 compartment_id = var.provider_id @@ -32,8 +74,6 @@ resource "null_resource" "cleanup_cortex_bucket" { depends_on = [oci_objectstorage_bucket.cortex_data] } - - resource "kubernetes_secret" "cortex-oci-credentials" { count = local.enable_cortex ? 1 : 0 metadata { @@ -52,92 +92,6 @@ resource "kubernetes_secret" "cortex-oci-credentials" { } -data "template_file" "cortex_template"{ - count = local.enable_cortex ? 1 : 0 - template = file("${path.module}/templates/cortex-values.yaml") - vars = { - "BUCKET_NAME" = oci_objectstorage_bucket.cortex_data[0].name - "OCI_SECRET" = var.access_secret - "OCI_KEY" = var.access_key - cluster_name = local.cluster_name - app_region = var.app_region - tenancy_namespace = var.tenancy_namespace - limits_ingestion_rate = try(var.cortex.limits.ingestion_rate != null ? var.cortex.limits.ingestion_rate : "250000", "250000") - limits_ingestion_burst_size = try(var.cortex.limits.ingestion_burst_size != null ? var.cortex.limits.ingestion_burst_size : "500000", "500000") - limits_max_series_per_metric = try(var.cortex.limits.max_series_per_metric != null ? var.cortex.limits.max_series_per_metric : "0", "0") - limits_max_series_per_user = try(var.cortex.limits.max_series_per_user != null ? var.cortex.limits.max_series_per_user : "0", "0") - limits_max_fetched_chunks_per_query = try(var.cortex.limits.max_fetched_chunks_per_query != null ? var.cortex.limits.max_fetched_chunks_per_query : "3000000", "3000000") - query_range_memcached_client_timeout = try(var.cortex.query_range.memcached_client_timeout != null ? var.cortex.query_range.memcached_client_timeout : "30s", "30s") - compactor_enable = try(var.cortex.compactor.enable != null ?
var.cortex.compactor.enable : "true", "true") - compactor_replicas = try(var.cortex.compactor.replicas != null ? var.cortex.compactor.replicas : "1", "1") - compactor_persistence_volume_enable = try(var.cortex.compactor.persistence_volume.enable != null ? var.cortex.compactor.persistence_volume.enable : "true", "true") - compactor_persistence_volume_size = try(var.cortex.compactor.persistence_volume.size != null ? var.cortex.compactor.persistence_volume.size : "20Gi", "20Gi") - compactor_min_cpu = try(var.cortex.compactor.min_cpu != null ? var.cortex.compactor.min_cpu : "null", "null") - compactor_min_memory = try(var.cortex.compactor.min_memory != null ? var.cortex.compactor.min_memory : "null", "null") - compactor_max_cpu = try(var.cortex.compactor.max_cpu != null ? var.cortex.compactor.max_cpu : "null", "null") - compactor_max_memory = try(var.cortex.compactor.max_memory != null ? var.cortex.compactor.max_memory : "null", "null") - ingester_replicas = try(var.cortex.ingester.replicas != null ? var.cortex.ingester.replicas : "1", "1") - ingester_persistence_volume_size = try(var.cortex.ingester.persistence_volume.size != null ? var.cortex.ingester.persistence_volume.size : "20Gi", "20Gi") - ingester_min_memory = try(var.cortex.ingester.min_memory != null ? var.cortex.ingester.min_memory : "null", "null") - ingester_min_cpu = try(var.cortex.ingester.min_cpu != null ? var.cortex.ingester.min_cpu : "null", "null") - ingester_max_memory = try(var.cortex.ingester.max_memory != null ? var.cortex.ingester.max_memory : "null", "null") - ingester_max_cpu = try(var.cortex.ingester.max_cpu != null ? var.cortex.ingester.max_cpu : "null", "null") - ingester_autoscaling = try(var.cortex.ingester.autoscaling != null ? var.cortex.ingester.autoscaling : "true", "true") - ingester_max_replicas = try(var.cortex.ingester.max_replicas != null ? var.cortex.ingester.max_replicas : "100", "100") - ingester_min_replicas = try(var.cortex.ingester.min_replicas != null ? 
var.cortex.ingester.min_replicas : "2", "2") - ingester_memory_utilization = try(var.cortex.ingester.memory_utilization != null ? var.cortex.ingester.memory_utilization : "", "") - querier_replicas = try(var.cortex.querier.replicas != null ? var.cortex.querier.replicas : "1", "1") - querier_min_memory = try(var.cortex.querier.min_memory != null ? var.cortex.querier.min_memory : "null", "null") - querier_min_cpu = try(var.cortex.querier.min_cpu != null ? var.cortex.querier.min_cpu : "null", "null") - querier_max_memory = try(var.cortex.querier.max_memory != null ? var.cortex.querier.max_memory : "null", "null") - querier_max_cpu = try(var.cortex.querier.max_cpu != null ? var.cortex.querier.max_cpu : "null", "null") - querier_autoscaling = try(var.cortex.querier.autoscaling != null ? var.cortex.querier.autoscaling : "true", "true") - querier_max_replicas = try(var.cortex.querier.max_replicas != null ? var.cortex.querier.max_replicas : "20", "20") - querier_min_replicas = try(var.cortex.querier.min_replicas != null ? var.cortex.querier.min_replicas : "2", "2") - querier_memory_utilization = try(var.cortex.querier.memory_utilization != null ? var.cortex.querier.memory_utilization : "", "") - querier_cpu_utilization = try(var.cortex.querier.cpu_utilization != null ? var.cortex.querier.cpu_utilization : "", "") - query_frontend_replicas = try(var.cortex.query_frontend.replicas != null ? var.cortex.query_frontend.replicas : "4", "4") - query_frontend_enable = try(var.cortex.query_frontend.enable != null ? var.cortex.query_frontend.enable : "true", "true") - store_gateway_replication_factor = try(var.cortex.store_gateway.replication_factor != null ? var.cortex.store_gateway.replication_factor : "3", "3") - store_gateway_replicas = try(var.cortex.store_gateway.replicas != null ? var.cortex.store_gateway.replicas : "1", "1") - store_gateway_persistence_volume_size = try(var.cortex.store_gateway.persistence_volume.size != null ? 
var.cortex.store_gateway.persistence_volume.size : "500Gi", "500Gi") - store_gateway_min_memory = try(var.cortex.store_gateway.min_memory != null ? var.cortex.store_gateway.min_memory : "null", "null") - store_gateway_min_cpu = try(var.cortex.store_gateway.min_cpu != null ? var.cortex.store_gateway.min_cpu : "null", "null") - store_gateway_max_memory = try(var.cortex.store_gateway.max_memory != null ? var.cortex.store_gateway.max_memory : "null", "null") - store_gateway_max_cpu = try(var.cortex.store_gateway.max_cpu != null ? var.cortex.store_gateway.max_cpu : "null", "null") - memcached_frontend_enable = try(var.cortex.memcached_frontend.enable != null ? var.cortex.memcached_frontend.enable : "true", "true") - memcached_frontend_min_memory = try(var.cortex.memcached_frontend.min_memory != null ? var.cortex.memcached_frontend.min_memory : "null", "null") - memcached_frontend_min_cpu = try(var.cortex.memcached_frontend.min_cpu != null ? var.cortex.memcached_frontend.min_cpu : "null", "null") - memcached_frontend_max_memory = try(var.cortex.memcached_frontend.max_memory != null ? var.cortex.memcached_frontend.max_memory : "null", "null") - memcached_frontend_max_cpu = try(var.cortex.memcached_frontend.max_cpu != null ? var.cortex.memcached_frontend.max_cpu : "null", "null") - memcached_blocks_index_enable = try(var.cortex.memcached_blocks_index.enable != null ? var.cortex.memcached_blocks_index.enable : "true", "true") - memcached_blocks_index_min_cpu = try(var.cortex.memcached_blocks_index.min_cpu != null ? var.cortex.memcached_blocks_index.min_cpu : "null", "null") - memcached_blocks_index_min_memory = try(var.cortex.memcached_blocks_index.min_memory != null ? var.cortex.memcached_blocks_index.min_memory : "null", "null") - memcached_blocks_index_max_cpu = try(var.cortex.memcached_blocks_index.max_cpu != null ? var.cortex.memcached_blocks_index.max_cpu : "null", "null") - memcached_blocks_index_max_memory = try(var.cortex.memcached_blocks_index.max_memory != null ? 
var.cortex.memcached_blocks_index.max_memory : "null", "null") - memcached_blocks_enable = try(var.cortex.memcached_blocks.enable != null ? var.cortex.memcached_blocks.enable : "true", "true") - memcached_blocks_min_memory = try(var.cortex.memcached_blocks.min_memory != null ? var.cortex.memcached_blocks.min_memory : "null", "null") - memcached_blocks_min_cpu = try(var.cortex.memcached_blocks.min_cpu != null ? var.cortex.memcached_blocks.min_cpu : "null", "null") - memcached_blocks_max_memory = try(var.cortex.memcached_blocks.max_memory != null ? var.cortex.memcached_blocks.max_memory : "null", "null") - memcached_blocks_max_cpu = try(var.cortex.memcached_blocks.max_cpu != null ? var.cortex.memcached_blocks.max_cpu : "null", "null") - memcached_blocks_metadata_enable = try(var.cortex.memcached_blocks_metadata.enable != null ? var.cortex.memcached_blocks_metadata.enable : "true", "true") - memcached_blocks_metadata_min_memory = try(var.cortex.memcached_blocks_metadata.min_memory != null ? var.cortex.memcached_blocks_metadata.min_memory : "null", "null") - memcached_blocks_metadata_min_cpu = try(var.cortex.memcached_blocks_metadata.min_cpu != null ? var.cortex.memcached_blocks_metadata.min_cpu : "null", "null") - memcached_blocks_metadata_max_memory = try(var.cortex.memcached_blocks_metadata.max_memory != null ? var.cortex.memcached_blocks_metadata.max_memory : "null", "null") - memcached_blocks_metadata_max_cpu = try(var.cortex.memcached_blocks_metadata.max_cpu != null ? var.cortex.memcached_blocks_metadata.max_cpu : "null", "null") - distributor_replicas = try(var.cortex.distributor.replicas != null ? var.cortex.distributor.replicas : "1", "1") - distributor_min_memory = try(var.cortex.distributor.min_memory != null ? var.cortex.distributor.min_memory : "null", "null") - distributor_min_cpu = try(var.cortex.distributor.min_cpu != null ? var.cortex.distributor.min_cpu : "null", "null") - distributor_max_memory = try(var.cortex.distributor.max_memory != null ? 
var.cortex.distributor.max_memory : "null", "null") - distributor_max_cpu = try(var.cortex.distributor.max_cpu != null ? var.cortex.distributor.max_cpu : "null", "null") - distributor_autoscaling = try(var.cortex.distributor.autoscaling != null ? var.cortex.distributor.autoscaling : "true", "true") - distributor_max_replicas = try(var.cortex.distributor.max_replicas != null ? var.cortex.distributor.max_replicas : "30", "30") - distributor_min_replicas = try(var.cortex.distributor.min_replicas != null ? var.cortex.distributor.min_replicas : "2", "2") - distributor_memory_utilization = try(var.cortex.distributor.memory_utilization != null ? var.cortex.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.cortex.distributor.cpu_utilization != null ? var.cortex.distributor.cpu_utilization : "", "") - } -} - resource "helm_release" "cortex" { count = local.enable_cortex ? 1 : 0 name = "cortex" @@ -147,6 +101,6 @@ resource "helm_release" "cortex" { version = "1.7.0" values = [ - data.template_file.cortex_template[0].rendered + local.cortex_values ] } diff --git a/observability/oci/loki.tf b/observability/oci/loki.tf index 67fdc2fa..7d51fae0 100644 --- a/observability/oci/loki.tf +++ b/observability/oci/loki.tf @@ -1,3 +1,62 @@ +locals { + loki_values = local.enable_loki ? 
templatefile("${path.module}/templates/loki-values.yaml", { + BUCKET_NAME = oci_objectstorage_bucket.loki_data[0].name + OCI_SECRET = var.access_secret + OCI_KEY = var.access_key + APP_REGION = var.app_region + TENANCY_NAMESPACE = var.tenancy_namespace + CLUSTER_BUCKET_NAME = "${local.cluster_name}-loki-data-${var.observability_suffix}" + + # Ingester + ingester_replicas = try(var.loki.ingester.replicas, "1") + ingester_min_memory = try(var.loki.ingester.min_memory, "1Gi") + ingester_max_memory = try(var.loki.ingester.max_memory, null) + ingester_min_cpu = try(var.loki.ingester.min_cpu, null) + ingester_max_cpu = try(var.loki.ingester.max_cpu, null) + ingester_autoscaling = try(var.loki.ingester.autoscaling, "true") + ingester_min_replicas = try(var.loki.ingester.min_replicas, "2") + ingester_max_replicas = try(var.loki.ingester.max_replicas, "30") + ingester_memory_utilization = try(var.loki.ingester.memory_utilization, "") + ingester_cpu_utilization = try(var.loki.ingester.cpu_utilization, "") + + # Distributor + distributor_replicas = try(var.loki.distributor.replicas, "1") + distributor_min_memory = try(var.loki.distributor.min_memory, "512Mi") + distributor_max_memory = try(var.loki.distributor.max_memory, "1Gi") + distributor_min_cpu = try(var.loki.distributor.min_cpu, "250m") + distributor_max_cpu = try(var.loki.distributor.max_cpu, "1") + distributor_autoscaling = try(var.loki.distributor.autoscaling, "true") + distributor_min_replicas = try(var.loki.distributor.min_replicas, "2") + distributor_max_replicas = try(var.loki.distributor.max_replicas, "30") + distributor_memory_utilization = try(var.loki.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.loki.distributor.cpu_utilization, "") + + # Querier + querier_replicas = try(var.loki.querier.replicas, "4") + querier_min_memory = try(var.loki.querier.min_memory, "500Mi") + querier_max_memory = try(var.loki.querier.max_memory, null) + querier_min_cpu = try(var.loki.querier.min_cpu, 
"100m") + querier_max_cpu = try(var.loki.querier.max_cpu, null) + querier_autoscaling = try(var.loki.querier.autoscaling, "true") + querier_min_replicas = try(var.loki.querier.min_replicas, "2") + querier_max_replicas = try(var.loki.querier.max_replicas, "6") + querier_memory_utilization = try(var.loki.querier.memory_utilization, "") + querier_cpu_utilization = try(var.loki.querier.cpu_utilization, "") + + # Query Frontend + query_frontend_replicas = try(var.loki.queryFrontend.replicas, "1") + query_frontend_min_memory = try(var.loki.queryFrontend.min_memory, "250Mi") + query_frontend_max_memory = try(var.loki.queryFrontend.max_memory, null) + query_frontend_min_cpu = try(var.loki.queryFrontend.min_cpu, null) + query_frontend_max_cpu = try(var.loki.queryFrontend.max_cpu, null) + query_frontend_autoscaling = try(var.loki.queryFrontend.autoscaling, "true") + query_frontend_min_replicas = try(var.loki.queryFrontend.min_replicas, "1") + query_frontend_max_replicas = try(var.loki.queryFrontend.max_replicas, "6") + query_frontend_memory_utilization = try(var.loki.queryFrontend.memory_utilization, "") + query_frontend_cpu_utilization = try(var.loki.queryFrontend.cpu_utilization, "") + }) : null +} + resource "oci_objectstorage_bucket" "loki_data" { count = local.enable_loki ? 1 : 0 compartment_id = var.provider_id @@ -32,61 +91,6 @@ resource "null_resource" "cleanup_loki_bucket" { depends_on = [oci_objectstorage_bucket.loki_data] } - -data "template_file" "loki_template" { - count = local.enable_loki ? 1 : 0 - template = file("${path.module}/templates/loki-values.yaml") - vars = { - BUCKET_NAME = oci_objectstorage_bucket.loki_data[0].name - OCI_SECRET = var.access_secret - OCI_KEY = var.access_key - app_region = var.app_region - bucket_name = "${local.cluster_name}-loki-data-${var.observability_suffix}" - tenancy_namespace = var.tenancy_namespace - ingester_replicas = try(var.loki.ingester.replicas != null ? 
var.loki.ingester.replicas : "1", "1") - ingester_max_memory = try(var.loki.ingester.max_memory != null ? var.loki.ingester.max_memory : "null", "null") - ingester_min_memory = try(var.loki.ingester.min_memory != null ? var.loki.ingester.min_memory : "1Gi", "1Gi") - ingester_max_cpu = try(var.loki.ingester.max_cpu != null ? var.loki.ingester.max_cpu : "null", "null") - ingester_min_cpu = try(var.loki.ingester.min_cpu != null ? var.loki.ingester.min_cpu : "null", "null") - ingester_autoscaling = try(var.loki.ingester.autoscaling != null ? var.loki.ingester.autoscaling : "true", "true") - ingester_max_replicas = try(var.loki.ingester.max_replicas != null ? var.loki.ingester.max_replicas : "30", "30") - ingester_min_replicas = try(var.loki.ingester.min_replicas != null ? var.loki.ingester.min_replicas : "2", "2") - ingester_cpu_utilization = try(var.loki.ingester.cpu_utilization != null ? var.loki.ingester.cpu_utilization : "", "") - ingester_memory_utilization = try(var.loki.ingester.memory_utilization != null ? var.loki.ingester.memory_utilization : "", "") - distributor_replicas = try(var.loki.distributor.replicas != null ? var.loki.distributor.replicas : "1", "1") - distributor_max_memory = try(var.loki.distributor.max_memory != null ? var.loki.distributor.max_memory : "1Gi", "1Gi") - distributor_min_memory = try(var.loki.distributor.min_memory != null ? var.loki.distributor.min_memory : "512Mi", "512Mi") - distributor_max_cpu = try(var.loki.distributor.max_cpu != null ? var.loki.distributor.max_cpu : "1", "1") - distributor_min_cpu = try(var.loki.distributor.min_cpu != null ? var.loki.distributor.min_cpu : "250m", "250m") - distributor_autoscaling = try(var.loki.distributor.autoscaling != null ? var.loki.distributor.autoscaling : "true", "true") - distributor_max_replicas = try(var.loki.distributor.max_replicas != null ? var.loki.distributor.max_replicas : "30", "30") - distributor_min_replicas = try(var.loki.distributor.min_replicas != null ? 
var.loki.distributor.min_replicas : "2", "2") - distributor_memory_utilization = try(var.loki.distributor.memory_utilization != null ? var.loki.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.loki.distributor.cpu_utilization != null ? var.loki.distributor.cpu_utilization : "", "") - querier_replicas = try(var.loki.querier.replicas != null ? var.loki.querier.replicas : "4", "4") - querier_max_unavailable = try(var.loki.querier.max_unavailable != null ? var.loki.querier.max_unavailable : "1", "1") - querier_min_memory = try(var.loki.querier.min_memory != null ? var.loki.querier.min_memory : "500Mi", "500Mi") - querier_min_cpu = try(var.loki.querier.min_cpu != null ? var.loki.querier.min_cpu : "100m", "100m") - querier_max_memory = try(var.loki.querier.max_memory != null ? var.loki.querier.max_memory : "null", "null") - querier_max_cpu = try(var.loki.querier.max_cpu != null ? var.loki.querier.max_cpu : "null", "null") - querier_autoscaling = try(var.loki.querier.autoscaling != null ? var.loki.querier.autoscaling : "true", "true") - querier_max_replicas = try(var.loki.querier.max_replicas != null ? var.loki.querier.max_replicas : "6", "6") - querier_min_replicas = try(var.loki.querier.min_replicas != null ? var.loki.querier.min_replicas : "2", "2") - querier_memory_utilization = try(var.loki.querier.memory_utilization != null ? var.loki.querier.memory_utilization : "", "") - querier_cpu_utilization = try(var.loki.querier.cpu_utilization != null ? var.loki.querier.cpu_utilization : "", "") - queryFrontend_replicas = try(var.loki.queryFrontend.replicas != null ? var.loki.queryFrontend.replicas : "1", "1") - queryFrontend_min_memory = try(var.loki.queryFrontend.min_memory != null ? var.loki.queryFrontend.min_memory : "250Mi", "250Mi") - queryFrontend_max_memory = try(var.loki.query_frontend.max_memory != null ? var.loki.query_frontend.max_memory : "null", "null") - queryFrontend_min_cpu = try(var.loki.query_frontend.min_cpu != null ? 
var.loki.query_frontend.min_cpu : "null", "null") - queryFrontend_max_cpu = try(var.loki.query_frontend.max_cpu != null ? var.loki.query_frontend.max_cpu : "null", "null") - queryFrontend_autoscaling = try(var.loki.queryFrontend.autoscaling != null ? var.loki.queryFrontend.autoscaling : "true", "true") - queryFrontend_max_replicas = try(var.loki.queryFrontend.max_replicas != null ? var.loki.queryFrontend.max_replicas : "6", "6") - queryFrontend_min_replicas = try(var.loki.queryFrontend.min_replicas != null ? var.loki.queryFrontend.min_replicas : "1", "1") - queryFrontend_memory_utilization= try(var.loki.queryFrontend.memory_utilization != null ? var.loki.queryFrontend.memory_utilization : "", "") - queryFrontend_cpu_utilization = try(var.loki.queryFrontend.cpu_utilization != null ? var.loki.queryFrontend.cpu_utilization : "", "") - } -} - resource "helm_release" "loki" { count = local.enable_loki ? 1 : 0 name = "loki" @@ -96,6 +100,6 @@ resource "helm_release" "loki" { version = "0.68.0" values = [ - data.template_file.loki_template[0].rendered + local.loki_values ] } \ No newline at end of file diff --git a/observability/oci/mimir.tf b/observability/oci/mimir.tf index 02ca7965..cb724a3d 100644 --- a/observability/oci/mimir.tf +++ b/observability/oci/mimir.tf @@ -1,3 +1,63 @@ +locals { + mimir_template = local.enable_mimir ? 
templatefile( + "${path.module}/templates/mimir-values.yaml", + { + BUCKET_NAME = oci_objectstorage_bucket.mimir_data[0].name + cluster_name = local.cluster_name + app_region = var.app_region + OCI_SECRET = var.access_secret + OCI_KEY = var.access_key + tenancy_namespace = var.tenancy_namespace + + limits_ingestion_rate = try(var.mimir.limits.ingestion_rate, "250000") + limits_ingestion_burst_size = try(var.mimir.limits.ingestion_burst_size, "500000") + limits_max_fetched_chunks_per_query = try(var.mimir.limits.max_fetched_chunks_per_query, "3000000") + limits_max_cache_freshness = try(var.mimir.limits.max_cache_freshness, "24h") + limits_max_outstanding_requests_per_tenant = try(var.mimir.limits.max_outstanding_requests_per_tenant, "1000") + + compactor_replicas = try(var.mimir.compactor.replicas, "1") + compactor_persistence_volume_enable = try(var.mimir.compactor.persistence_volume.enable, "true") + compactor_persistence_volume_size = try(var.mimir.compactor.persistence_volume.size, "20Gi") + compactor_min_cpu = try(var.mimir.compactor.min_cpu, "null") + compactor_min_memory = try(var.mimir.compactor.min_memory, "null") + compactor_max_cpu = try(var.mimir.compactor.max_cpu, "null") + compactor_max_memory = try(var.mimir.compactor.max_memory, "null") + + ingester_replicas = try(var.mimir.ingester.replicas, "2") + ingester_persistence_volume_size = try(var.mimir.ingester.persistence_volume.size, "20Gi") + ingester_min_memory = try(var.mimir.ingester.min_memory, "null") + ingester_min_cpu = try(var.mimir.ingester.min_cpu, "null") + ingester_max_memory = try(var.mimir.ingester.max_memory, "null") + ingester_max_cpu = try(var.mimir.ingester.max_cpu, "null") + + querier_replicas = try(var.mimir.querier.replicas, "3") + querier_min_memory = try(var.mimir.querier.min_memory, "null") + querier_min_cpu = try(var.mimir.querier.min_cpu, "null") + querier_max_memory = try(var.mimir.querier.max_memory, "null") + querier_max_cpu = try(var.mimir.querier.max_cpu, "null") + + 
query_frontend_replicas = try(var.mimir.query_frontend.replicas, "1") + + store_gateway_replication_factor = try(var.mimir.store_gateway.replication_factor, "3") + store_gateway_replicas = try(var.mimir.store_gateway.replicas, "1") + store_gateway_persistence_volume_size = try(var.mimir.store_gateway.persistence_volume.size, "500Gi") + store_gateway_min_memory = try(var.mimir.store_gateway.min_memory, "null") + store_gateway_min_cpu = try(var.mimir.store_gateway.min_cpu, "null") + store_gateway_max_memory = try(var.mimir.store_gateway.max_memory, "null") + store_gateway_max_cpu = try(var.mimir.store_gateway.max_cpu, "null") + + distributor_replicas = try(var.mimir.distributor.replicas, "1") + distributor_min_memory = try(var.mimir.distributor.min_memory, "null") + distributor_min_cpu = try(var.mimir.distributor.min_cpu, "null") + distributor_max_memory = try(var.mimir.distributor.max_memory, "null") + distributor_max_cpu = try(var.mimir.distributor.max_cpu, "null") + + mimir_basic_auth_username = random_password.mimir_basic_auth_username[0].result + mimir_basic_auth_password = random_password.mimir_basic_auth_password[0].result + } + ) : "" +} + resource "oci_objectstorage_bucket" "mimir_data" { count = local.enable_mimir ? 1 : 0 compartment_id = var.provider_id @@ -85,57 +145,6 @@ resource "kubernetes_secret" "mimir-oci-credentials" { } -data "template_file" "mimir_template" { - count = local.enable_mimir ? 1 : 0 - template = file("${path.module}/templates/mimir-values.yaml") - vars = { - BUCKET_NAME = oci_objectstorage_bucket.mimir_data[0].name - cluster_name = local.cluster_name - app_region = var.app_region - "OCI_SECRET" = var.access_secret - "OCI_KEY" = var.access_key - tenancy_namespace = var.tenancy_namespace - limits_ingestion_rate = try(var.mimir.limits.ingestion_rate != null ? var.mimir.limits.ingestion_rate : "250000", "250000") - limits_ingestion_burst_size = try(var.mimir.limits.ingestion_burst_size != null ? 
var.mimir.limits.ingestion_burst_size : "500000", "500000") - limits_max_fetched_chunks_per_query = try(var.mimir.limits.max_fetched_chunks_per_query != null ? var.mimir.limits.max_fetched_chunks_per_query : "3000000", "3000000") - limits_max_cache_freshness = try(var.mimir.limits.max_cache_freshness != null ? var.mimir.limits.max_cache_freshness : "24h", "24h") - limits_max_outstanding_requests_per_tenant = try(var.mimir.limits.max_outstanding_requests_per_tenant != null ? var.mimir.limits.max_outstanding_requests_per_tenant : "1000", "1000") - compactor_replicas = try(var.mimir.compactor.replicas != null ? var.mimir.compactor.replicas : "1", "1") - compactor_persistence_volume_enable = try(var.mimir.compactor.persistence_volume.enable != null ? var.mimir.compactor.persistence_volume.enable : "true", "true") - compactor_persistence_volume_size = try(var.mimir.compactor.persistence_volume.size != null ? var.mimir.compactor.persistence_volume.size : "20Gi", "20Gi") - compactor_min_cpu = try(var.mimir.compactor.min_cpu != null ? var.mimir.compactor.min_cpu : "null", "null") - compactor_min_memory = try(var.mimir.compactor.min_memory != null ? var.mimir.compactor.min_memory : "null", "null") - compactor_max_cpu = try(var.mimir.compactor.max_cpu != null ? var.mimir.compactor.max_cpu : "null", "null") - compactor_max_memory = try(var.mimir.compactor.max_memory != null ? var.mimir.compactor.max_memory : "null", "null") - ingester_replicas = try(var.mimir.ingester.replicas != null ? var.mimir.ingester.replicas : "2", "2") - ingester_persistence_volume_size = try(var.mimir.ingester.persistence_volume.size != null ? var.mimir.ingester.persistence_volume.size : "20Gi", "20Gi") - ingester_min_memory = try(var.mimir.ingester.min_memory != null ? var.mimir.ingester.min_memory : "null", "null") - ingester_min_cpu = try(var.mimir.ingester.min_cpu != null ? var.mimir.ingester.min_cpu : "null", "null") - ingester_max_memory = try(var.mimir.ingester.max_memory != null ? 
var.mimir.ingester.max_memory : "null", "null") - ingester_max_cpu = try(var.mimir.ingester.max_cpu != null ? var.mimir.ingester.max_cpu : "null", "null") - querier_replicas = try(var.mimir.querier.replicas != null ? var.mimir.querier.replicas : "3", "3") - querier_min_memory = try(var.mimir.querier.min_memory != null ? var.mimir.querier.min_memory : "null", "null") - querier_min_cpu = try(var.mimir.querier.min_cpu != null ? var.mimir.querier.min_cpu : "null", "null") - querier_max_memory = try(var.mimir.querier.max_memory != null ? var.mimir.querier.max_memory : "null", "null") - querier_max_cpu = try(var.mimir.querier.max_cpu != null ? var.mimir.querier.max_cpu : "null", "null") - query_frontend_replicas = try(var.mimir.query_frontend.replicas != null ? var.mimir.query_frontend.replicas : "1", "1") - store_gateway_replication_factor = try(var.mimir.store_gateway.replication_factor != null ? var.mimir.store_gateway.replication_factor : "3", "3") - store_gateway_replicas = try(var.mimir.store_gateway.replicas != null ? var.mimir.store_gateway.replicas : "1", "1") - store_gateway_persistence_volume_size = try(var.mimir.store_gateway.persistence_volume.size != null ? var.mimir.store_gateway.persistence_volume.size : "500Gi", "500Gi") - store_gateway_min_memory = try(var.mimir.store_gateway.min_memory != null ? var.mimir.store_gateway.min_memory : "null", "null") - store_gateway_min_cpu = try(var.mimir.store_gateway.min_cpu != null ? var.mimir.store_gateway.min_cpu : "null", "null") - store_gateway_max_memory = try(var.mimir.store_gateway.max_memory != null ? var.mimir.store_gateway.max_memory : "null", "null") - store_gateway_max_cpu = try(var.mimir.store_gateway.max_cpu != null ? var.mimir.store_gateway.max_cpu : "null", "null") - distributor_replicas = try(var.mimir.distributor.replicas != null ? var.mimir.distributor.replicas : "1", "1") - distributor_min_memory = try(var.mimir.distributor.min_memory != null ? 
var.mimir.distributor.min_memory : "null", "null") - distributor_min_cpu = try(var.mimir.distributor.min_cpu != null ? var.mimir.distributor.min_cpu : "null", "null") - distributor_max_memory = try(var.mimir.distributor.max_memory != null ? var.mimir.distributor.max_memory : "null", "null") - distributor_max_cpu = try(var.mimir.distributor.max_cpu != null ? var.mimir.distributor.max_cpu : "null", "null") - mimir_basic_auth_username = random_password.mimir_basic_auth_username[0].result - mimir_basic_auth_password = random_password.mimir_basic_auth_password[0].result - } -} - resource "helm_release" "mimir" { count = local.enable_mimir ? 1 : 0 name = "mimir" @@ -144,7 +153,7 @@ resource "helm_release" "mimir" { namespace = kubernetes_namespace.app_environments["mimir"].metadata[0].name version = "5.1.3" values = [ - data.template_file.mimir_template[0].rendered + local.mimir_template ] depends_on = [ diff --git a/observability/oci/tempo.tf b/observability/oci/tempo.tf index e897c768..4ec736d5 100644 --- a/observability/oci/tempo.tf +++ b/observability/oci/tempo.tf @@ -6,6 +6,53 @@ locals { value = remote.header.value } ], {}) + + tempo_template = local.enable_tempo ? 
templatefile( + "${path.module}/templates/tempo-values.yaml", + { + BUCKET_NAME = oci_objectstorage_bucket.tempo_data[0].name + OCI_SECRET = var.access_secret + OCI_KEY = var.access_key + app_region = var.app_region + tenancy_namespace = var.tenancy_namespace + + ingester_replicas = try(var.tempo.ingester.replicas, "1") + ingester_min_memory = try(var.tempo.ingester.min_memory, "1Gi") + ingester_max_memory = try(var.tempo.ingester.max_memory, "null") + ingester_min_cpu = try(var.tempo.ingester.min_cpu, "null") + ingester_max_cpu = try(var.tempo.ingester.max_cpu, "null") + ingester_autoscaling = try(var.tempo.ingester.autoscaling, "true") + ingester_min_replicas = try(var.tempo.ingester.min_replicas, "2") + ingester_max_replicas = try(var.tempo.ingester.max_replicas, "30") + ingester_memory_utilization = try(var.tempo.ingester.memory_utilization, "") + ingester_cpu_utilization = try(var.tempo.ingester.cpu_utilization, "") + + + + distributor_replicas = try(var.tempo.distributor.replicas, "1") + distributor_min_memory = try(var.tempo.distributor.min_memory, "750Mi") + distributor_max_memory = try(var.tempo.distributor.max_memory, "null") + distributor_min_cpu = try(var.tempo.distributor.min_cpu, "null") + distributor_max_cpu = try(var.tempo.distributor.max_cpu, "null") + distributor_autoscaling = try(var.tempo.distributor.autoscaling, "true") + distributor_min_replicas = try(var.tempo.distributor.min_replicas, "2") + distributor_max_replicas = try(var.tempo.distributor.max_replicas, "30") + distributor_memory_utilization = try(var.tempo.distributor.memory_utilization, "") + distributor_cpu_utilization = try(var.tempo.distributor.cpu_utilization, "") + querier_replicas = try(var.tempo.querier.replicas, "1") + queryFrontend_replicas = try(var.tempo.queryFrontend.replicas, "1") + metrics_generator_enable = try(var.tempo.metrics_generator.enable, false) + metrics_generator_replicas = try(var.tempo.metrics_generator.replicas, "1") + 
metrics_generator_service_graphs_max_items = try(var.tempo.metrics_generator.service_graphs_max_items, "30000") + metrics_generator_service_graphs_wait = try(var.tempo.metrics_generator.service_graphs_wait, "30s") + metrics_generator_remote_write_flush_deadline = try(var.tempo.metrics_generator.remote_write_flush_deadline, "2m") + metrics_generator_remote_write = jsonencode(local.remote_write_config) + metrics_generator_metrics_ingestion_time_range_slack = try( + var.tempo.metrics_generator.metrics_ingestion_time_range_slack, + "40s" + ) + } + ) : null } resource "oci_objectstorage_bucket" "tempo_data" { @@ -42,50 +89,6 @@ resource "null_resource" "cleanup_tempo_bucket" { depends_on = [oci_objectstorage_bucket.tempo_data] } - - -data "template_file" "tempo_template"{ - count = local.enable_tempo ? 1 : 0 - template = file("${path.module}/templates/tempo-values.yaml") - vars = { - BUCKET_NAME = oci_objectstorage_bucket.tempo_data[0].name - OCI_SECRET = var.access_secret - OCI_KEY = var.access_key - app_region = var.app_region - tenancy_namespace = var.tenancy_namespace - ingester_replicas = try(var.tempo.ingester.replicas != null ? var.tempo.ingester.replicas : "1", "1") - ingester_min_memory = try(var.tempo.ingester.min_memory != null ? var.tempo.ingester.min_memory : "1Gi", "1Gi") - ingester_max_memory = try(var.tempo.ingester.max_memory != null ? var.tempo.ingester.max_memory : "null", "null") - ingester_min_cpu = try(var.tempo.ingester.min_cpu != null ? var.tempo.ingester.min_cpu : "null", "null") - ingester_max_cpu = try(var.tempo.ingester.max_cpu != null ? var.tempo.ingester.max_cpu : "null", "null") - ingester_autoscaling = try(var.tempo.ingester.autoscaling != null ? var.tempo.ingester.autoscaling : "true", "true") - ingester_min_replicas = try(var.tempo.ingester.min_replicas != null ? var.tempo.ingester.min_replicas : "2", "2") - ingester_max_replicas = try(var.tempo.ingester.max_replicas != null ? 
var.tempo.ingester.max_replicas : "30", "30") - ingester_memory_utilization = try(var.tempo.ingester.memory_utilization != null ? var.tempo.ingester.memory_utilization : "", "") - ingester_cpu_utilization = try(var.tempo.ingester.cpu_utilization != null ? var.tempo.ingester.cpu_utilization : "", "") - distributor_replicas = try(var.tempo.distributor.replicas != null ? var.tempo.distributor.replicas : "1", "1") - distributor_min_memory = try(var.tempo.distributor.min_memory != null ? var.tempo.distributor.min_memory : "750Mi", "750Mi") - distributor_max_memory = try(var.tempo.distributor.max_memory != null ? var.tempo.distributor.max_memory : "null", "null") - distributor_min_cpu = try(var.tempo.distributor.min_cpu != null ? var.tempo.distributor.min_cpu : "null", "null") - distributor_max_cpu = try(var.tempo.distributor.max_cpu != null ? var.tempo.distributor.max_cpu : "null", "null") - distributor_autoscaling = try(var.tempo.distributor.autoscaling != null ? var.tempo.distributor.autoscaling : "true", "true") - distributor_min_replicas = try(var.tempo.distributor.min_replicas != null ? var.tempo.distributor.min_replicas : "2", "2") - distributor_max_replicas = try(var.tempo.distributor.max_replicas != null ? var.tempo.distributor.max_replicas : "30", "30") - distributor_memory_utilization = try(var.tempo.distributor.memory_utilization != null ? var.tempo.distributor.memory_utilization : "", "") - distributor_cpu_utilization = try(var.tempo.distributor.cpu_utilization != null ? var.tempo.distributor.cpu_utilization : "","") - querier_replicas = try(var.tempo.querier.replicas != null ? var.tempo.querier.replicas : "1", "1") - queryFrontend_replicas = try(var.tempo.queryFrontend.replicas != null ? var.tempo.queryFrontend.replicas : "1", "1") - metrics_generator_enable = try(var.tempo.metrics_generator.enable != null ? var.tempo.metrics_generator.enable : false, false) - metrics_generator_replicas = try(var.tempo.metrics_generator.replicas != null ? 
var.tempo.metrics_generator.replicas : "1", "1") - metrics_generator_service_graphs_max_items = try(var.tempo.metrics_generator.service_graphs_max_items != null ? var.tempo.metrics_generator.service_graphs_max_items : "30000", "30000") - metrics_generator_service_graphs_wait = try(var.tempo.metrics_generator.service_graphs_wait != null ? var.tempo.metrics_generator.service_graphs_wait : "30s", "30s") - metrics_generator_remote_write_flush_deadline = try(var.tempo.metrics_generator.remote_write_flush_deadline != null ? var.tempo.metrics_generator.remote_write_flush_deadline : "2m", "2m") - metrics_generator_remote_write = jsonencode(local.remote_write_config) - metrics_generator_metrics_ingestion_time_range_slack = try(var.tempo.metrics_generator.metrics_ingestion_time_range_slack != null ? var.tempo.metrics_generator.metrics_ingestion_time_range_slack : "40s", "40s") - } -} - - resource "helm_release" "tempo" { count = local.enable_tempo ? 1 : 0 name = "tempo" @@ -95,6 +98,6 @@ resource "helm_release" "tempo" { version = "1.38.0" values = [ - data.template_file.tempo_template[0].rendered + local.tempo_template ] } \ No newline at end of file diff --git a/redis/azure-redis/main.tf b/redis/azure-redis/main.tf index be643564..d10bfc1b 100644 --- a/redis/azure-redis/main.tf +++ b/redis/azure-redis/main.tf @@ -1,6 +1,8 @@ locals { cluster_prefix = var.shared_services.cluster_prefix != null ? var.shared_services.cluster_prefix : var.app_name cluster_name = var.app_env != "" ? "${var.app_name}-${var.app_env}" : "${var.app_name}" + vnet_enabled = var.vpc != "" + subnet_name = local.vnet_enabled ? "${var.vpc}-redis-subnet" : "" } module "remote_state_gcp_cluster" { @@ -45,20 +47,58 @@ data "azurerm_key_vault" "secrets" { resource_group_name = var.resource_group_name } -data "azurerm_virtual_network" "avn" { - name = var.vpc +data "azurerm_virtual_network" "vnet" { + count = local.vnet_enabled ? 
1 : 0 + name = var.vpc resource_group_name = var.resource_group_name } -resource "azurerm_redis_cache" "redis_cluster" { - name = var.redis.name != "" && var.redis.name != null ? var.redis.name : "${local.cluster_name}-${var.namespace}-redis" - location = var.app_region +data "azurerm_subnet" "redis_subnet" { + count = local.vnet_enabled ? 1 : 0 + name = local.subnet_name resource_group_name = var.resource_group_name - sku_name = var.redis.sku_name - capacity = var.redis.redis_cache_capacity - family = var.redis.redis_cache_family - non_ssl_port_enabled = var.redis.redis_enable_non_ssl_port - tags = var.tags + virtual_network_name = data.azurerm_virtual_network.vnet[0].name +} + +# Reference existing Private DNS Zone created by account-setup module +data "azurerm_private_dns_zone" "redis" { + count = local.vnet_enabled ? 1 : 0 + name = "privatelink.redis.cache.windows.net" + resource_group_name = var.resource_group_name +} + +resource "azurerm_redis_cache" "redis_cluster" { + name = var.redis.name != "" && var.redis.name != null ? var.redis.name : "${local.cluster_name}-${var.namespace}-redis" + location = var.app_region + resource_group_name = var.resource_group_name + sku_name = var.redis.sku_name + capacity = var.redis.redis_cache_capacity + family = var.redis.redis_cache_family + non_ssl_port_enabled = var.redis.redis_enable_non_ssl_port + public_network_access_enabled = local.vnet_enabled ? false : true + tags = var.tags +} + +# Private Endpoint for Redis Cache (VNet integration) +resource "azurerm_private_endpoint" "redis" { + count = local.vnet_enabled ? 
1 : 0 + name = "${azurerm_redis_cache.redis_cluster.name}-pe" + location = var.app_region + resource_group_name = var.resource_group_name + subnet_id = data.azurerm_subnet.redis_subnet[0].id + tags = var.tags + + private_service_connection { + name = "${azurerm_redis_cache.redis_cluster.name}-psc" + private_connection_resource_id = azurerm_redis_cache.redis_cluster.id + is_manual_connection = false + subresource_names = ["redisCache"] + } + + private_dns_zone_group { + name = "redis-dns-zone-group" + private_dns_zone_ids = [data.azurerm_private_dns_zone.redis[0].id] + } } resource "kubernetes_service" "redis_service" { diff --git a/sql/aws-rds/main.tf b/sql/aws-rds/main.tf index dff49828..5ce63cfe 100644 --- a/sql/aws-rds/main.tf +++ b/sql/aws-rds/main.tf @@ -130,7 +130,8 @@ resource "aws_db_instance" "db_instance" { db_subnet_group_name = aws_db_subnet_group.db_subnet.name vpc_security_group_ids = [aws_security_group.rds.id] parameter_group_name = aws_db_parameter_group.db_param_group.name - skip_final_snapshot = true + skip_final_snapshot = false + final_snapshot_identifier = "${var.rds_name}-final-snapshot" multi_az = var.multi_az backup_retention_period = 7 performance_insights_enabled = false @@ -158,7 +159,8 @@ resource "aws_db_instance" "rds_read_replica" { allocated_storage = var.allocated_storage vpc_security_group_ids = [aws_security_group.rds.id] parameter_group_name = aws_db_parameter_group.db_param_group.name - skip_final_snapshot = true + skip_final_snapshot = true + # final_snapshot_identifier is intentionally omitted: AWS rejects FinalDBSnapshotIdentifier when deleting a read replica multi_az = var.read_replica_multi_az performance_insights_enabled = false deletion_protection = var.deletion_protection diff --git a/sql/aws-rds/vars.tf b/sql/aws-rds/vars.tf index 8b6c8637..ab0dc37b 100644 --- a/sql/aws-rds/vars.tf +++ b/sql/aws-rds/vars.tf @@ -141,7 +141,7 @@ variable "auto_minor_version_upgrade" { variable "postgresql_engine_version" { description = "The engine version to use for postgresql" 
type = string - default = "16.1" + default = "16.12" } variable "read_replica_multi_az" { @@ -159,7 +159,7 @@ variable "mysql_engine_version" { variable "enable_ssl" { description = "Whether SSL should be enabled or not based on user requirement" type = bool - default = false + default = true } variable "multi_ds" { diff --git a/sql/azure-mysql/main.tf b/sql/azure-mysql/main.tf index ca065018..2c8d782d 100644 --- a/sql/azure-mysql/main.tf +++ b/sql/azure-mysql/main.tf @@ -1,11 +1,40 @@ +locals { + vnet_enabled = var.vpc != "" + subnet_name = local.vnet_enabled ? "${var.vpc}-mysql-subnet" : "" +} + +data "azurerm_virtual_network" "vnet" { + count = local.vnet_enabled ? 1 : 0 + name = var.vpc + resource_group_name = var.resource_group_name +} + +data "azurerm_subnet" "db_subnet" { + count = local.vnet_enabled ? 1 : 0 + name = local.subnet_name + resource_group_name = var.resource_group_name + virtual_network_name = data.azurerm_virtual_network.vnet[0].name +} + +# Reference existing Private DNS Zone created by account-setup module +data "azurerm_private_dns_zone" "mysql" { + count = local.vnet_enabled ? 1 : 0 + name = "privatelink.mysql.database.azure.com" + resource_group_name = var.resource_group_name +} + resource "azurerm_mysql_flexible_server" "mysql_server" { - name = var.mysql_server_name - resource_group_name = var.resource_group_name - location = var.location - administrator_login = var.administrator_login - administrator_password = azurerm_key_vault_secret.mysql_db_secret.value - backup_retention_days = var.backup_retention_days - sku_name = var.sku_name + name = var.mysql_server_name + resource_group_name = var.resource_group_name + location = var.location + administrator_login = var.administrator_login + administrator_password = azurerm_key_vault_secret.mysql_db_secret.value + backup_retention_days = var.backup_retention_days + sku_name = var.sku_name + + # VNet integration + delegated_subnet_id = local.vnet_enabled ? 
data.azurerm_subnet.db_subnet[0].id : null + private_dns_zone_id = local.vnet_enabled ? data.azurerm_private_dns_zone.mysql[0].id : null storage { size_gb = var.storage @@ -36,6 +65,7 @@ resource "azurerm_mysql_flexible_database" "mysql_db" { } resource "azurerm_mysql_flexible_server_configuration" "mysql_parameter_group" { + count = var.enable_ssl == false ? 1 : 0 name = "require_secure_transport" resource_group_name = var.resource_group_name server_name = azurerm_mysql_flexible_server.mysql_server.name @@ -43,11 +73,12 @@ resource "azurerm_mysql_flexible_server_configuration" "mysql_parameter_group" { } resource "azurerm_mysql_flexible_server_firewall_rule" "mysql_firewall" { + count = local.vnet_enabled ? 0 : 1 name = "${var.cluster_name}-${var.namespace}-mysql-firewall" resource_group_name = var.resource_group_name server_name = azurerm_mysql_flexible_server.mysql_server.name start_ip_address = "0.0.0.0" - end_ip_address = "255.255.255.255" + end_ip_address = "0.0.0.0" } resource "azurerm_mysql_flexible_server" "mysql_read_replica" { diff --git a/sql/azure-mysql/vars.tf b/sql/azure-mysql/vars.tf index 88586f69..08d820f7 100644 --- a/sql/azure-mysql/vars.tf +++ b/sql/azure-mysql/vars.tf @@ -133,8 +133,20 @@ variable "io_scaling_enabled" { default = false } +variable "enable_ssl" { + description = "Whether SSL should be enabled or not based on user requirement" + type = bool + default = true +} + variable "multi_ds" { description = "Whether to create multiple databases in the same instance" type = bool default = false +} + +variable "vpc" { + description = "VNet name the apps are going to use." + type = string + default = "" } \ No newline at end of file diff --git a/sql/azure-postgres/main.tf b/sql/azure-postgres/main.tf index b429b3e0..fe27003c 100644 --- a/sql/azure-postgres/main.tf +++ b/sql/azure-postgres/main.tf @@ -1,3 +1,28 @@ +locals { + vnet_enabled = var.vpc != "" + subnet_name = local.vnet_enabled ? 
"${var.vpc}-postgresql-subnet" : "" +} + +data "azurerm_virtual_network" "vnet" { + count = local.vnet_enabled ? 1 : 0 + name = var.vpc + resource_group_name = var.resource_group_name +} + +data "azurerm_subnet" "db_subnet" { + count = local.vnet_enabled ? 1 : 0 + name = local.subnet_name + resource_group_name = var.resource_group_name + virtual_network_name = data.azurerm_virtual_network.vnet[0].name +} + +# Reference existing Private DNS Zone created by account-setup module +data "azurerm_private_dns_zone" "postgres" { + count = local.vnet_enabled ? 1 : 0 + name = "privatelink.postgres.database.azure.com" + resource_group_name = var.resource_group_name +} + resource "azurerm_postgresql_flexible_server" "postgres_server" { name = var.postgres_server_name location = var.location @@ -11,6 +36,12 @@ resource "azurerm_postgresql_flexible_server" "postgres_server" { storage_tier = var.storage_tier backup_retention_days = var.backup_retention_days geo_redundant_backup_enabled = true + public_network_access_enabled = local.vnet_enabled ? false : true + + # VNet integration + # When delegated_subnet_id is provided, Azure automatically disables public network access + delegated_subnet_id = local.vnet_enabled ? data.azurerm_subnet.db_subnet[0].id : null + private_dns_zone_id = local.vnet_enabled ? data.azurerm_private_dns_zone.postgres[0].id : null tags = merge(var.tags, tomap({ @@ -40,10 +71,11 @@ resource "azurerm_postgresql_flexible_server_configuration" "ssl_parameter_group } resource "azurerm_postgresql_flexible_server_firewall_rule" "postgres_firewall" { + count = local.vnet_enabled ? 
0 : 1 name = "${var.cluster_name}-${var.namespace}-postgres-firewall" server_id = azurerm_postgresql_flexible_server.postgres_server.id start_ip_address = "0.0.0.0" - end_ip_address = "255.255.255.255" + end_ip_address = "0.0.0.0" } resource "azurerm_postgresql_flexible_server" "postgresql_replica_server" { @@ -58,4 +90,4 @@ resource "azurerm_postgresql_flexible_server" "postgresql_replica_server" { sku_name = var.sku_name create_mode = "Replica" source_server_id = azurerm_postgresql_flexible_server.postgres_server.id -} \ No newline at end of file +} diff --git a/sql/azure-postgres/vars.tf b/sql/azure-postgres/vars.tf index 41cba224..2b27a52e 100644 --- a/sql/azure-postgres/vars.tf +++ b/sql/azure-postgres/vars.tf @@ -101,7 +101,7 @@ variable "zone" { variable "enable_ssl" { description = "Whether SSL should be enabled or not based on user requirement" type = bool - default = false + default = true } variable "storage_mb" { @@ -134,4 +134,10 @@ variable "multi_ds" { description = "Whether to create multiple databases in the same instance" type = bool default = false +} + +variable "vpc" { + description = "VNet name the apps are going to use." + type = string + default = "" } \ No newline at end of file diff --git a/sql/gcp-sql/vars.tf b/sql/gcp-sql/vars.tf index 849f57ff..ad91bd2f 100644 --- a/sql/gcp-sql/vars.tf +++ b/sql/gcp-sql/vars.tf @@ -113,7 +113,7 @@ variable "disk_type" { variable "require_ssl" { description = "True if the instance should require SSL/TLS for users connecting over IP. Note: SSL/TLS is needed to provide security when you connect to Cloud SQL using IP addresses. If you are connecting to your instance only by using the Cloud SQL Proxy or the Java Socket Library, you do not need to configure your instance to use SSL/TLS." 
type = bool - default = false + default = true } variable "private_network" { @@ -171,7 +171,7 @@ variable "labels" { variable "enable_ssl" { description = "Whether SSL should be enabled or not based on user requirement" type = bool - default = false + default = true } variable "multi_ds" { diff --git a/zop-helm/cronjob/main.tf b/zop-helm/cronjob/main.tf index 9adf0921..d667cac7 100644 --- a/zop-helm/cronjob/main.tf +++ b/zop-helm/cronjob/main.tf @@ -19,6 +19,7 @@ resource "helm_release" "cron_helm"{ version = "v0.0.17" chart = "cron-job" reuse_values = true + max_history = var.max_history values = [templatefile("${path.module}/templates/values.yaml", { name = var.name diff --git a/zop-helm/cronjob/vars.tf b/zop-helm/cronjob/vars.tf index b447a907..0075d8e1 100644 --- a/zop-helm/cronjob/vars.tf +++ b/zop-helm/cronjob/vars.tf @@ -146,4 +146,10 @@ variable "pub_sub" { variable "service_random_string" { type = string default = "" +} + +variable "max_history" { + description = "Maximum number of Helm release revisions to keep. Use 0 for no limit." + type = number + default = 10 } \ No newline at end of file diff --git a/zop-helm/service/main.tf b/zop-helm/service/main.tf index 88d5e37f..556033a9 100644 --- a/zop-helm/service/main.tf +++ b/zop-helm/service/main.tf @@ -19,6 +19,7 @@ resource "helm_release" "service_helm"{ version = "v0.0.27" chart = "service" reuse_values = true + max_history = var.max_history values = [templatefile("${path.module}/templates/values.yaml", { name = var.name diff --git a/zop-helm/service/vars.tf b/zop-helm/service/vars.tf index b262d74e..d77b8ecd 100644 --- a/zop-helm/service/vars.tf +++ b/zop-helm/service/vars.tf @@ -263,4 +263,10 @@ variable "custom_alerts" { labels = map(string) })) default = [] +} + +variable "max_history" { + description = "Maximum number of Helm release revisions to keep. Use 0 for no limit." 
+ type = number + default = 10 } \ No newline at end of file diff --git a/zop-system/aws/permissions.tf b/zop-system/aws/permissions.tf index db3f6beb..93ebdc2b 100644 --- a/zop-system/aws/permissions.tf +++ b/zop-system/aws/permissions.tf @@ -170,7 +170,7 @@ resource "kubernetes_cluster_role_binding" "zop_cluster_role_binding_cluster_adm role_ref { api_group = "rbac.authorization.k8s.io" kind = "ClusterRole" - name = "cluster-admin" + name = "edit" } subject { kind = "ServiceAccount" diff --git a/zop-system/azure/permissions.tf b/zop-system/azure/permissions.tf index d6606f3f..0e4d54eb 100644 --- a/zop-system/azure/permissions.tf +++ b/zop-system/azure/permissions.tf @@ -150,7 +150,7 @@ resource "kubernetes_cluster_role_binding" "zop_cluster_role_binding_cluster_adm role_ref { api_group = "rbac.authorization.k8s.io" kind = "ClusterRole" - name = "cluster-admin" + name = "edit" } subject { kind = "ServiceAccount" diff --git a/zop-system/gcp/main.tf b/zop-system/gcp/main.tf index d8649ad1..2a76b7d1 100644 --- a/zop-system/gcp/main.tf +++ b/zop-system/gcp/main.tf @@ -16,7 +16,7 @@ resource "helm_release" "service_helm"{ name = "kube-management-api" namespace = "zop-system" repository = "https://helm.zop.dev" - version = "v0.0.22" + version = "v0.0.28" chart = "service" set { @@ -89,6 +89,11 @@ resource "helm_release" "service_helm"{ value = ["zop-system-secret"] } + set { + name = "Containers.privileged" + value = true + } + values = [templatefile("./templates/values.yaml",{ cluster_name = var.cluster_name app_region = var.app_region diff --git a/zop-system/gcp/permissions.tf b/zop-system/gcp/permissions.tf index 2ffd68fe..9b087541 100644 --- a/zop-system/gcp/permissions.tf +++ b/zop-system/gcp/permissions.tf @@ -163,7 +163,7 @@ resource "kubernetes_cluster_role_binding" "zop_system_admin_role_binding_gcp_sa role_ref { api_group = "rbac.authorization.k8s.io" kind = "ClusterRole" - name = "cluster-admin" + name = "edit" } subject {