diff --git a/cmd/kubectl-datadog/autoscaling/cluster/install/assets/dd-karpenter.yaml b/cmd/kubectl-datadog/autoscaling/cluster/install/assets/dd-karpenter.yaml index 2735987abf..f6b64436bd 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/install/assets/dd-karpenter.yaml +++ b/cmd/kubectl-datadog/autoscaling/cluster/install/assets/dd-karpenter.yaml @@ -3,6 +3,7 @@ Description: IAM role for pod identity association Conditions: ShouldDeployPodIdentityAddon: !Equals [!Ref DeployPodIdentityAddon, "true"] ShouldDeployNodeAccessEntry: !Equals [!Ref DeployNodeAccessEntry, "true"] + ShouldDeployFargateProfile: !Equals [!Ref DeployFargateProfile, "true"] Parameters: ClusterName: Type: String @@ -24,6 +25,17 @@ Parameters: AllowedValues: - "true" - "false" + DeployFargateProfile: + Type: String + Description: "Whether to deploy a Fargate profile for the Karpenter namespace" + Default: "false" + AllowedValues: + - "true" + - "false" + FargateSubnets: + Type: CommaDelimitedList + Description: "Comma-separated list of private subnet IDs for the Fargate profile" + Default: "" Resources: EKSPodIdentityAddon: Type: AWS::EKS::Addon @@ -65,3 +77,33 @@ Resources: ClusterName: !Ref ClusterName PrincipalArn: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/KarpenterNodeRole-${ClusterName}" Type: EC2_LINUX + FargatePodExecutionRole: + Type: AWS::IAM::Role + Condition: ShouldDeployFargateProfile + Properties: + RoleName: !Sub "${ClusterName}-karpenter-fargate" + AssumeRolePolicyDocument: + Statement: + - Action: + - sts:AssumeRole + Effect: Allow + Principal: + Service: eks-fargate-pods.amazonaws.com + Version: "2012-10-17" + ManagedPolicyArns: + - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonEKSFargatePodExecutionRolePolicy" + FargateProfile: + Type: AWS::EKS::FargateProfile + Condition: ShouldDeployFargateProfile + DependsOn: + - FargatePodExecutionRole + Properties: + ClusterName: !Ref ClusterName + FargateProfileName: !Sub "${ClusterName}-karpenter" + 
PodExecutionRoleArn: !GetAtt FargatePodExecutionRole.Arn + Selectors: + - Namespace: !Ref KarpenterNamespace + Subnets: !Ref FargateSubnets + Tags: + - Key: managed-by + Value: dd-karpenter diff --git a/cmd/kubectl-datadog/autoscaling/cluster/install/guess/privatesubnets.go b/cmd/kubectl-datadog/autoscaling/cluster/install/guess/privatesubnets.go new file mode 100644 index 0000000000..b9f9c7058d --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/install/guess/privatesubnets.go @@ -0,0 +1,133 @@ +package guess + +import ( + "context" + "fmt" + "strings" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/aws/aws-sdk-go-v2/service/eks" +) + +// GetPrivateSubnetIDs returns the private subnet IDs configured on the EKS +// cluster. A subnet is considered private if its associated route table has no +// route targeting an Internet Gateway (igw-*). +func GetPrivateSubnetIDs(ctx context.Context, eksClient *eks.Client, ec2Client *ec2.Client, clusterName string) ([]string, error) { + cluster, err := eksClient.DescribeCluster(ctx, &eks.DescribeClusterInput{ + Name: &clusterName, + }) + if err != nil { + return nil, fmt.Errorf("failed to describe cluster: %w", err) + } + + if cluster.Cluster == nil || cluster.Cluster.ResourcesVpcConfig == nil { + return nil, fmt.Errorf("cluster %s has no VPC configuration", clusterName) + } + + vpcConfig := cluster.Cluster.ResourcesVpcConfig + if len(vpcConfig.SubnetIds) == 0 { + return nil, fmt.Errorf("cluster %s has no subnets configured", clusterName) + } + + if vpcConfig.VpcId == nil { + return nil, fmt.Errorf("cluster %s has no VPC ID", clusterName) + } + vpcID := *vpcConfig.VpcId + + routeTables, err := describeAllRouteTables(ctx, ec2Client, vpcID) + if err != nil { + return nil, fmt.Errorf("failed to describe route tables for VPC %s: %w", vpcID, err) + } + + privateSubnets := filterPrivateSubnets(vpcConfig.SubnetIds, 
routeTables) + if len(privateSubnets) == 0 { + return nil, fmt.Errorf("no private subnets found among the %d cluster subnets in VPC %s; Fargate requires private subnets", len(vpcConfig.SubnetIds), vpcID) + } + + return privateSubnets, nil +} + +func describeAllRouteTables(ctx context.Context, ec2Client *ec2.Client, vpcID string) ([]ec2types.RouteTable, error) { + var routeTables []ec2types.RouteTable + var nextToken *string + + for { + out, err := ec2Client.DescribeRouteTables(ctx, &ec2.DescribeRouteTablesInput{ + Filters: []ec2types.Filter{ + { + Name: aws.String("vpc-id"), + Values: []string{vpcID}, + }, + }, + NextToken: nextToken, + }) + if err != nil { + return nil, err + } + + routeTables = append(routeTables, out.RouteTables...) + + nextToken = out.NextToken + if nextToken == nil { + return routeTables, nil + } + } +} + +// filterPrivateSubnets returns the subset of clusterSubnetIDs whose effective +// route table has no route to an Internet Gateway. Subnets not explicitly +// associated with a route table inherit the VPC's main route table. +func filterPrivateSubnets(clusterSubnetIDs []string, routeTables []ec2types.RouteTable) []string { + // Determine which route tables are "public" (have an IGW route). + isPublicRT := make(map[string]bool, len(routeTables)) + for _, rt := range routeTables { + rtID := aws.ToString(rt.RouteTableId) + isPublicRT[rtID] = hasInternetGatewayRoute(rt) + } + + // Map each subnet to its explicitly-associated route table and find the + // main route table (inherited by subnets without an explicit association). 
+ subnetRT := make(map[string]string) // subnetID -> routeTableID + var mainRT string + + for _, rt := range routeTables { + rtID := aws.ToString(rt.RouteTableId) + for _, assoc := range rt.Associations { + if assoc.Main != nil && *assoc.Main { + mainRT = rtID + } + if subnetID := aws.ToString(assoc.SubnetId); subnetID != "" { + subnetRT[subnetID] = rtID + } + } + } + + // Filter: keep cluster subnets whose effective route table is private. + // Skip subnets whose route table cannot be determined (fail closed). + var privateSubnets []string + for _, subnetID := range clusterSubnetIDs { + effectiveRT := subnetRT[subnetID] + if effectiveRT == "" { + effectiveRT = mainRT // inherit main route table + } + if effectiveRT == "" { + continue // no route table found, skip rather than assume private + } + if !isPublicRT[effectiveRT] { + privateSubnets = append(privateSubnets, subnetID) + } + } + + return privateSubnets +} + +func hasInternetGatewayRoute(rt ec2types.RouteTable) bool { + for _, route := range rt.Routes { + if gwID := aws.ToString(route.GatewayId); strings.HasPrefix(gwID, "igw-") { + return true + } + } + return false +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/install/guess/privatesubnets_test.go b/cmd/kubectl-datadog/autoscaling/cluster/install/guess/privatesubnets_test.go new file mode 100644 index 0000000000..37751dfdb4 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/install/guess/privatesubnets_test.go @@ -0,0 +1,177 @@ +package guess + +import ( + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/stretchr/testify/assert" +) + +func routeTable(id string, main bool, igw bool, subnetIDs ...string) ec2types.RouteTable { + var routes []ec2types.Route + // Every route table has a local route. 
+ routes = append(routes, ec2types.Route{ + DestinationCidrBlock: aws.String("10.0.0.0/16"), + GatewayId: aws.String("local"), + }) + if igw { + routes = append(routes, ec2types.Route{ + DestinationCidrBlock: aws.String("0.0.0.0/0"), + GatewayId: aws.String("igw-0123456789abcdef0"), + }) + } else { + // Private route tables typically have a NAT gateway route. + routes = append(routes, ec2types.Route{ + DestinationCidrBlock: aws.String("0.0.0.0/0"), + NatGatewayId: aws.String("nat-0123456789abcdef0"), + }) + } + + var associations []ec2types.RouteTableAssociation + if main { + associations = append(associations, ec2types.RouteTableAssociation{ + Main: aws.Bool(true), + RouteTableId: aws.String(id), + }) + } + for _, subnetID := range subnetIDs { + associations = append(associations, ec2types.RouteTableAssociation{ + Main: aws.Bool(false), + RouteTableId: aws.String(id), + SubnetId: aws.String(subnetID), + }) + } + + return ec2types.RouteTable{ + RouteTableId: aws.String(id), + Routes: routes, + Associations: associations, + } +} + +func TestFilterPrivateSubnets(t *testing.T) { + for _, tc := range []struct { + name string + clusterSubnetIDs []string + routeTables []ec2types.RouteTable + expected []string + }{ + { + name: "Mixed public and private with explicit associations", + clusterSubnetIDs: []string{"subnet-private-1", "subnet-public-1", "subnet-private-2"}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, false), + routeTable("rtb-private", false, false, "subnet-private-1", "subnet-private-2"), + routeTable("rtb-public", false, true, "subnet-public-1"), + }, + expected: []string{"subnet-private-1", "subnet-private-2"}, + }, + { + name: "All subnets are private", + clusterSubnetIDs: []string{"subnet-1", "subnet-2", "subnet-3"}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, false), + routeTable("rtb-private", false, false, "subnet-1", "subnet-2", "subnet-3"), + }, + expected: []string{"subnet-1", "subnet-2", "subnet-3"}, 
+ }, + { + name: "All subnets are public", + clusterSubnetIDs: []string{"subnet-1", "subnet-2"}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, false), + routeTable("rtb-public", false, true, "subnet-1", "subnet-2"), + }, + expected: nil, + }, + { + name: "Subnets inherit private main route table", + clusterSubnetIDs: []string{"subnet-1", "subnet-2"}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, false), + }, + expected: []string{"subnet-1", "subnet-2"}, + }, + { + name: "Subnets inherit public main route table", + clusterSubnetIDs: []string{"subnet-1", "subnet-2"}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, true), + }, + expected: nil, + }, + { + name: "Mixed: some subnets explicit, some inherit main", + clusterSubnetIDs: []string{"subnet-explicit-private", "subnet-inherits-main", "subnet-explicit-public"}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, false), // main is private + routeTable("rtb-private", false, false, "subnet-explicit-private"), + routeTable("rtb-public", false, true, "subnet-explicit-public"), + }, + expected: []string{"subnet-explicit-private", "subnet-inherits-main"}, + }, + { + name: "Mixed: some subnets explicit, some inherit public main", + clusterSubnetIDs: []string{"subnet-explicit-private", "subnet-inherits-main"}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, true), // main is public + routeTable("rtb-private", false, false, "subnet-explicit-private"), + }, + expected: []string{"subnet-explicit-private"}, + }, + { + name: "Gateway-only route table with no subnet associations", + clusterSubnetIDs: []string{"subnet-1"}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, false), + // Gateway route table (attached to VPN/IGW, no subnet associations). 
+ { + RouteTableId: aws.String("rtb-gateway"), + Routes: []ec2types.Route{ + { + DestinationCidrBlock: aws.String("10.0.0.0/16"), + GatewayId: aws.String("local"), + }, + }, + Associations: []ec2types.RouteTableAssociation{ + { + Main: aws.Bool(false), + RouteTableId: aws.String("rtb-gateway"), + GatewayId: aws.String("igw-gateway"), + }, + }, + }, + }, + expected: []string{"subnet-1"}, + }, + { + name: "Empty cluster subnet list", + clusterSubnetIDs: []string{}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, false), + }, + expected: nil, + }, + { + name: "No route tables at all skips all subnets (fail closed)", + clusterSubnetIDs: []string{"subnet-1", "subnet-2"}, + routeTables: nil, + expected: nil, + }, + { + name: "Preserves input order", + clusterSubnetIDs: []string{"subnet-c", "subnet-a", "subnet-b"}, + routeTables: []ec2types.RouteTable{ + routeTable("rtb-main", true, false), + routeTable("rtb-private", false, false, "subnet-a", "subnet-b", "subnet-c"), + }, + expected: []string{"subnet-c", "subnet-a", "subnet-b"}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + result := filterPrivateSubnets(tc.clusterSubnetIDs, tc.routeTables) + assert.Equal(t, tc.expected, result) + }) + } +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go index df410e5d28..f686c06e54 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go @@ -12,8 +12,10 @@ import ( "os/signal" "slices" "strconv" + "strings" "syscall" + "github.com/aws/aws-sdk-go-v2/service/eks" "github.com/aws/aws-sdk-go-v2/service/sts" "github.com/davecgh/go-spew/spew" "github.com/fatih/color" @@ -126,6 +128,7 @@ var ( karpenterVersion string createKarpenterResources = CreateKarpenterResourcesAll inferenceMethod = InferenceMethodNodeGroups + noFargate bool debug bool installExample = ` # install autoscaling @@ -171,6 +174,7 @@ func 
New(streams genericclioptions.IOStreams) *cobra.Command { cmd.Flags().StringVar(&karpenterVersion, "karpenter-version", "", "Version of Karpenter to install (default to latest)") cmd.Flags().Var(&createKarpenterResources, "create-karpenter-resources", "Which Karpenter resources to create: none, ec2nodeclass, all (default: all)") cmd.Flags().Var(&inferenceMethod, "inference-method", "Method to infer EC2NodeClass and NodePool properties: nodes, nodegroups") + cmd.Flags().BoolVar(&noFargate, "no-fargate", false, "Skip creating a Fargate profile for the Karpenter namespace") cmd.Flags().BoolVar(&debug, "debug", false, "Enable debug logs") o.ConfigFlags.AddFlags(cmd.Flags()) @@ -231,7 +235,8 @@ func (o *options) run(cmd *cobra.Command) error { return fmt.Errorf("failed to build clients: %w", err) } - if err = createCloudFormationStacks(ctx, cli, clusterName, karpenterNamespace); err != nil { + fargateEnabled, err := createCloudFormationStacks(ctx, cli, clusterName, karpenterNamespace, noFargate) + if err != nil { return err } @@ -239,7 +244,7 @@ func (o *options) run(cmd *cobra.Command) error { return err } - if err = o.installHelmChart(ctx, clusterName, karpenterNamespace, karpenterVersion, debug); err != nil { + if err = o.installHelmChart(ctx, clusterName, karpenterNamespace, karpenterVersion, fargateEnabled, debug); err != nil { return err } @@ -250,21 +255,48 @@ func (o *options) run(cmd *cobra.Command) error { return displaySuccessMessage(cmd, clusterName, createKarpenterResources) } -func createCloudFormationStacks(ctx context.Context, cli *clients.Clients, clusterName string, karpenterNamespace string) error { - if err := aws.CreateOrUpdateStack(ctx, cli.CloudFormation, "dd-karpenter-"+clusterName+"-karpenter", KarpenterCfn, map[string]string{ +func createCloudFormationStacks(ctx context.Context, cli *clients.Clients, clusterName string, karpenterNamespace string, noFargate bool) (fargateEnabled bool, err error) { + if err = aws.CreateOrUpdateStack(ctx, 
cli.CloudFormation, "dd-karpenter-"+clusterName+"-karpenter", KarpenterCfn, map[string]string{ "ClusterName": clusterName, }); err != nil { - return fmt.Errorf("failed to create or update Cloud Formation stack: %w", err) + return false, fmt.Errorf("failed to create or update Cloud Formation stack: %w", err) } isUnmanagedEKSPIAInstalled, err := guess.IsThereUnmanagedEKSPodIdentityAgentInstalled(ctx, cli.EKS, clusterName) if err != nil { - return fmt.Errorf("failed to check if EKS pod identity agent is installed: %w", err) + return false, fmt.Errorf("failed to check if EKS pod identity agent is installed: %w", err) } supportsAPIAuth, err := guess.SupportsAPIAuthenticationMode(ctx, cli.EKS, clusterName) if err != nil { - return fmt.Errorf("failed to check cluster authentication mode: %w", err) + return false, fmt.Errorf("failed to check cluster authentication mode: %w", err) + } + + // Discover private subnets for the Fargate profile. + deployFargate := !noFargate + var fargateSubnets string + if deployFargate { + privateSubnets, err := guess.GetPrivateSubnetIDs(ctx, cli.EKS, cli.EC2, clusterName) + if err != nil { + // If a Fargate profile already exists (e.g. from a previous install), + // preserve it rather than deleting it due to a transient discovery error. + fargateProfileName := clusterName + "-karpenter" // must match FargateProfileName in dd-karpenter.yaml + existing, descErr := cli.EKS.DescribeFargateProfile(ctx, &eks.DescribeFargateProfileInput{ + ClusterName: &clusterName, + FargateProfileName: &fargateProfileName, + }) + if descErr == nil && existing.FargateProfile != nil { + log.Printf("Warning: could not discover private subnets (%v), preserving existing Fargate profile %s.", err, fargateProfileName) + fargateSubnets = strings.Join(existing.FargateProfile.Subnets, ",") + } else { + log.Printf("Warning: could not discover private subnets for Fargate: %v", err) + log.Println("Karpenter will run on regular nodes. 
Use --no-fargate to silence this warning.") + deployFargate = false + } + } else { + fargateSubnets = strings.Join(privateSubnets, ",") + log.Printf("Discovered %d private subnet(s) for Fargate profile.", len(privateSubnets)) + } } if err := aws.CreateOrUpdateStack(ctx, cli.CloudFormation, "dd-karpenter-"+clusterName+"-dd-karpenter", DdKarpenterCfn, map[string]string{ @@ -272,11 +304,13 @@ func createCloudFormationStacks(ctx context.Context, cli *clients.Clients, clust "KarpenterNamespace": karpenterNamespace, "DeployPodIdentityAddon": strconv.FormatBool(!isUnmanagedEKSPIAInstalled), "DeployNodeAccessEntry": strconv.FormatBool(supportsAPIAuth), + "DeployFargateProfile": strconv.FormatBool(deployFargate), + "FargateSubnets": fargateSubnets, }); err != nil { - return fmt.Errorf("failed to create or update Cloud Formation stack: %w", err) + return false, fmt.Errorf("failed to create or update Cloud Formation stack: %w", err) } - return nil + return deployFargate, nil } func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterName string) error { @@ -312,7 +346,7 @@ func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterNa return nil } -func (o *options) installHelmChart(ctx context.Context, clusterName string, karpenterNamespace string, karpenterVersion string, debug bool) error { +func (o *options) installHelmChart(ctx context.Context, clusterName string, karpenterNamespace string, karpenterVersion string, fargate bool, debug bool) error { actionConfig, err := helm.NewActionConfig(o.ConfigFlags, karpenterNamespace) if err != nil { return err @@ -349,6 +383,22 @@ func (o *options) installHelmChart(ctx context.Context, clusterName string, karp }, } + if fargate { + // Fargate allocates compute based on pod resource requests. 
+ values["controller"] = map[string]any{ + "resources": map[string]any{ + "requests": map[string]any{ + "cpu": "1", + "memory": "1Gi", + }, + "limits": map[string]any{ + "cpu": "1", + "memory": "1Gi", + }, + }, + } + } + if err = helm.CreateOrUpgrade(ctx, actionConfig, "karpenter", karpenterNamespace, karpenterOCIRegistry, karpenterVersion, values); err != nil { return fmt.Errorf("failed to create or update Helm release: %w", err) } diff --git a/cmd/kubectl-datadog/autoscaling/cluster/uninstall/uninstall.go b/cmd/kubectl-datadog/autoscaling/cluster/uninstall/uninstall.go index eb10572d0b..0244c2593e 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/uninstall/uninstall.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/uninstall/uninstall.go @@ -16,6 +16,8 @@ import ( awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/ec2" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/aws/aws-sdk-go-v2/service/eks" + ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types" "github.com/aws/aws-sdk-go-v2/service/iam" iamtypes "github.com/aws/aws-sdk-go-v2/service/iam/types" "github.com/aws/aws-sdk-go-v2/service/sts" @@ -238,6 +240,19 @@ func displayResourceSummary(ctx context.Context, cmd *cobra.Command, cli *client cmd.Println(" - The Karpenter Helm release") + fargateProfileName := clusterName + "-karpenter" // must match FargateProfileName in dd-karpenter.yaml + if _, descErr := cli.EKS.DescribeFargateProfile(ctx, &eks.DescribeFargateProfileInput{ + ClusterName: &clusterName, + FargateProfileName: &fargateProfileName, + }); descErr == nil { + cmd.Printf(" - Fargate profile: %s (via CloudFormation stack)\n", fargateProfileName) + } else { + var notFound *ekstypes.ResourceNotFoundException + if !errors.As(descErr, &notFound) { + cmd.Printf(" - Fargate profile: (unable to check: %v)\n", descErr) + } + } + if stacks, err := listCloudFormationStacks(ctx, cli, clusterName); err != nil { cmd.Printf(" - CloudFormation stacks: (unable to 
list: %v)\n", err) } else if len(stacks) == 0 {