From 762c5d629b1ed4dcb5acf7c873cb8565f65b8c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9na=C3=AFc=20Huard?= Date: Fri, 10 Apr 2026 18:09:47 +0200 Subject: [PATCH 1/4] Add external Karpenter guard and upgrade command to kubectl-datadog Add a guard in `kubectl datadog autoscaling cluster install` to detect and block installation when an external Karpenter is already active on the cluster, using webhook-based detection with Helm release ownership validation. This prevents accidentally deploying a second Karpenter controller while still allowing idempotent re-installs of our own. Add `kubectl datadog autoscaling cluster upgrade` to update a Karpenter that was previously installed by the plugin. The command auto-detects the installation namespace and previous parameters (cluster name) from the existing Helm release, avoiding the need to re-specify them. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../autoscaling/cluster/cluster.go | 2 + .../autoscaling/cluster/common/helm/helm.go | 38 ++ .../cluster/common/k8s/karpenter.go | 94 +++++ .../cluster/common/k8s/karpenter_test.go | 353 ++++++++++++++++++ .../autoscaling/cluster/install/install.go | 67 +++- .../autoscaling/cluster/upgrade/upgrade.go | 241 ++++++++++++ .../cluster/upgrade/upgrade_test.go | 104 ++++++ 7 files changed, 888 insertions(+), 11 deletions(-) create mode 100644 cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter.go create mode 100644 cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter_test.go create mode 100644 cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go create mode 100644 cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade_test.go diff --git a/cmd/kubectl-datadog/autoscaling/cluster/cluster.go b/cmd/kubectl-datadog/autoscaling/cluster/cluster.go index ce16cfcf8d..cca92aed9c 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/cluster.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/cluster.go @@ -8,6 +8,7 @@ import ( "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/uninstall" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/upgrade" ) // options provides information required by cluster command @@ -33,6 +34,7 @@ func New(streams genericclioptions.IOStreams) *cobra.Command { cmd.AddCommand(install.New(streams)) cmd.AddCommand(uninstall.New(streams)) + cmd.AddCommand(upgrade.New(streams)) o := newOptions(streams) o.configFlags.AddFlags(cmd.Flags()) diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/helm/helm.go b/cmd/kubectl-datadog/autoscaling/cluster/common/helm/helm.go index ef7821ec8f..71f05a0585 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/common/helm/helm.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/helm/helm.go @@ -128,6 +128,44 @@ func upgrade(ctx context.Context, ac *action.Configuration, releaseName, namespa return nil } +// GetRelease retrieves the full Helm release including config values. +func GetRelease(_ context.Context, ac *action.Configuration, releaseName string) (*release.Release, error) { + getAction := action.NewGet(ac) + rel, err := getAction.Run(releaseName) + if err != nil { + return nil, fmt.Errorf("failed to get Helm release %s: %w", releaseName, err) + } + return rel, nil +} + +// IsOurRelease checks whether a Karpenter Helm release in the given namespace +// was installed by the kubectl-datadog plugin. Returns the release for reuse +// by the caller. +func IsOurRelease(ctx context.Context, configFlags *genericclioptions.ConfigFlags, namespace, releaseName string) (bool, *release.Release, error) { + actionConfig, err := NewActionConfig(configFlags, namespace) + if err != nil { + return false, nil, fmt.Errorf("failed to create Helm action config for namespace %s: %w", namespace, err) + } + + rel, err := GetRelease(ctx, actionConfig, releaseName) + if err != nil { + return false, nil, err + } + + if rel == nil || rel.Config == nil { + return false, nil, nil + } + + labels, ok := rel.Config["additionalLabels"].(map[string]any) + if !ok { + return false, rel, nil + } + + managedBy, ok := labels["app.kubernetes.io/managed-by"].(string) + isOurs := ok && managedBy == "kubectl-datadog" + return isOurs, rel, nil +} + func Uninstall(ctx context.Context, ac *action.Configuration, releaseName string) error { exist, err := Exists(ctx, ac, releaseName) if err != nil { diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter.go b/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter.go new file mode 100644 index 0000000000..ac9c1489ff --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter.go @@ -0,0 +1,94 @@ +package k8s + +import ( + "context" + "fmt" + + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +var karpenterAPIGroups = map[string]bool{ + "karpenter.sh": true, + "karpenter.k8s.aws": true, +} + +const helmReleaseNameKarpenter = "karpenter" + +// DetectActiveKarpenter checks for an active Karpenter installation by looking for +// webhook configurations that reference Karpenter API groups. Returns the namespace +// where Karpenter's webhook service is running. +func DetectActiveKarpenter(ctx context.Context, clientset kubernetes.Interface) (bool, string, error) { + vwcList, err := clientset.AdmissionregistrationV1().ValidatingWebhookConfigurations().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, "", fmt.Errorf("failed to list ValidatingWebhookConfigurations: %w", err) + } + + for _, vwc := range vwcList.Items { + for _, webhook := range vwc.Webhooks { + if ns, ok := extractKarpenterNamespace(webhook.Rules, webhook.ClientConfig); ok { + return true, ns, nil + } + } + } + + mwcList, err := clientset.AdmissionregistrationV1().MutatingWebhookConfigurations().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, "", fmt.Errorf("failed to list MutatingWebhookConfigurations: %w", err) + } + + for _, mwc := range mwcList.Items { + for _, webhook := range mwc.Webhooks { + if ns, ok := extractKarpenterNamespace(webhook.Rules, webhook.ClientConfig); ok { + return true, ns, nil + } + } + } + + return false, "", nil +} + +func extractKarpenterNamespace(rules []admissionregistrationv1.RuleWithOperations, clientConfig admissionregistrationv1.WebhookClientConfig) (string, bool) { + for _, rule := range rules { + for _, group := range rule.APIGroups { + if karpenterAPIGroups[group] { + if clientConfig.Service != nil { + return clientConfig.Service.Namespace, true + } + // URL-based webhook — Karpenter is present but we can't + // determine the namespace (e.g. external/out-of-cluster). + return "", true + } + } + } + return "", false +} + +// FindKarpenterHelmRelease searches for a deployed Helm release named "karpenter" +// across all namespaces by looking at Helm storage secrets. This is a fallback +// for when webhooks are absent (e.g. pods crashed) but the Helm release still exists. +func FindKarpenterHelmRelease(ctx context.Context, clientset kubernetes.Interface) (bool, []string, error) { + secrets, err := clientset.CoreV1().Secrets("").List(ctx, metav1.ListOptions{ + LabelSelector: "owner=helm,name=" + helmReleaseNameKarpenter + ",status=deployed", + }) + if err != nil { + return false, nil, fmt.Errorf("failed to list Helm release secrets: %w", err) + } + + if len(secrets.Items) == 0 { + return false, nil, nil + } + + // Deduplicate namespaces (multiple revisions may exist in the same namespace) + seen := map[string]bool{} + var namespaces []string + for _, s := range secrets.Items { + if !seen[s.Namespace] { + seen[s.Namespace] = true + namespaces = append(namespaces, s.Namespace) + } + } + + return true, namespaces, nil +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter_test.go b/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter_test.go new file mode 100644 index 0000000000..feb6205bb1 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter_test.go @@ -0,0 +1,353 @@ +package k8s + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" +) + +func TestDetectActiveKarpenter(t *testing.T) { + for _, tc := range []struct { + name string + objects []runtime.Object + expectedFound bool + expectedNamespace string + }{ + { + name: "No webhook configurations", + objects: nil, + expectedFound: false, + }, + { + name: "Karpenter ValidatingWebhookConfiguration in dd-karpenter", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-validation"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validation.karpenter.sh", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.sh"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "dd-karpenter", + Name: "karpenter", + }, + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "dd-karpenter", + }, + { + name: "Karpenter ValidatingWebhookConfiguration in custom namespace", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-validation"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validation.karpenter.k8s.aws", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.k8s.aws"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "karpenter", + Name: "karpenter", + }, + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "karpenter", + }, + { + name: "Non-Karpenter ValidatingWebhookConfiguration only", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "other-webhook"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validate.something.io", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"apps"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "system", + Name: "webhook-svc", + }, + }, + }, + }, + }, + }, + expectedFound: false, + }, + { + name: "Karpenter MutatingWebhookConfiguration detected", + objects: []runtime.Object{ + &admissionregistrationv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-defaulting"}, + Webhooks: []admissionregistrationv1.MutatingWebhook{ + { + Name: "defaulting.karpenter.sh", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.sh"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "dd-karpenter", + Name: "karpenter", + }, + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "dd-karpenter", + }, + { + name: "Karpenter webhook with URL-based config (no service)", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-validation"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validation.karpenter.sh", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.sh"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + URL: ptrString("https://external-karpenter.example.com/validate"), + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "", + }, + { + name: "Multiple webhooks, only one is Karpenter", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "other-webhook"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validate.other.io", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"apps"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "other", + Name: "other-svc", + }, + }, + }, + }, + }, + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-validation"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validation.karpenter.sh", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.sh"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "dd-karpenter", + Name: "karpenter", + }, + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "dd-karpenter", + }, + } { + t.Run(tc.name, func(t *testing.T) { + clientset := fake.NewSimpleClientset(tc.objects...) + + found, ns, err := DetectActiveKarpenter(context.Background(), clientset) + assert.NoError(t, err) + assert.Equal(t, tc.expectedFound, found) + assert.Equal(t, tc.expectedNamespace, ns) + }) + } +} + +func TestFindKarpenterHelmRelease(t *testing.T) { + for _, tc := range []struct { + name string + objects []runtime.Object + expectedFound bool + expectedNamespaces []string + }{ + { + name: "No Helm secrets", + objects: nil, + expectedFound: false, + }, + { + name: "Karpenter Helm secret in dd-karpenter", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v1", + Namespace: "dd-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "deployed", + }, + }, + }, + }, + expectedFound: true, + expectedNamespaces: []string{"dd-karpenter"}, + }, + { + name: "Karpenter Helm secret in custom namespace", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v3", + Namespace: "my-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "deployed", + }, + }, + }, + }, + expectedFound: true, + expectedNamespaces: []string{"my-karpenter"}, + }, + { + name: "Karpenter Helm secret with superseded status (not deployed)", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v1", + Namespace: "dd-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "superseded", + }, + }, + }, + }, + expectedFound: false, + }, + { + name: "Different Helm release (not karpenter)", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.datadog.v1", + Namespace: "datadog", + Labels: map[string]string{ + "owner": "helm", + "name": "datadog", + "status": "deployed", + }, + }, + }, + }, + expectedFound: false, + }, + { + name: "Multiple revisions in same namespace are deduplicated", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v1", + Namespace: "dd-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "deployed", + }, + }, + }, + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v2", + Namespace: "dd-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "deployed", + }, + }, + }, + }, + expectedFound: true, + expectedNamespaces: []string{"dd-karpenter"}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + clientset := fake.NewSimpleClientset(tc.objects...) + + found, namespaces, err := FindKarpenterHelmRelease(context.Background(), clientset) + assert.NoError(t, err) + assert.Equal(t, tc.expectedFound, found) + if tc.expectedNamespaces == nil { + assert.Nil(t, namespaces) + } else { + assert.ElementsMatch(t, tc.expectedNamespaces, namespaces) + } + }) + } +} + +func ptrString(s string) *string { + return &s +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go index df410e5d28..828c89ebf2 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go @@ -28,6 +28,7 @@ import ( "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clients" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/display" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/helm" + commonk8s "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/k8s" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install/guess" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install/k8s" "github.com/DataDog/datadog-operator/pkg/plugin/common" @@ -37,7 +38,8 @@ import ( ) const ( - karpenterOCIRegistry = "oci://public.ecr.aws/karpenter/karpenter" + // KarpenterOCIRegistry is the OCI registry URL for the Karpenter Helm chart. + KarpenterOCIRegistry = "oci://public.ecr.aws/karpenter/karpenter" ) var ( @@ -224,6 +226,25 @@ func (o *options) run(cmd *cobra.Command) error { return displayEKSAutoModeMessage(cmd, clusterName) } + // Check for existing Karpenter installation + if found, webhookNamespace, err := commonk8s.DetectActiveKarpenter(ctx, o.Clientset); err != nil { + return fmt.Errorf("failed to check for existing Karpenter installation: %w", err) + } else if found { + if webhookNamespace == "" { + // Karpenter webhook found but uses a URL (external/out-of-cluster) — treat as external + return displayExternalKarpenterMessage(cmd, clusterName, "unknown (URL-based webhook)") + } + isOurs, _, checkErr := helm.IsOurRelease(ctx, o.ConfigFlags, webhookNamespace, "karpenter") + if checkErr != nil { + return fmt.Errorf("failed to check Karpenter Helm release ownership: %w", checkErr) + } + if !isOurs { + return displayExternalKarpenterMessage(cmd, clusterName, webhookNamespace) + } + // It's ours — use the detected namespace for the rest of the install + karpenterNamespace = webhookNamespace + } + display.PrintBox(cmd.OutOrStdout(), "Installing Karpenter on cluster "+clusterName+".") cli, err := clients.Build(ctx, o.ConfigFlags, o.Clientset) @@ -231,11 +252,11 @@ func (o *options) run(cmd *cobra.Command) error { return fmt.Errorf("failed to build clients: %w", err) } - if err = createCloudFormationStacks(ctx, cli, clusterName, karpenterNamespace); err != nil { + if err = CreateCloudFormationStacks(ctx, cli, clusterName, karpenterNamespace); err != nil { return err } - if err = updateAwsAuthConfigMap(ctx, cli, clusterName); err != nil { + if err = UpdateAwsAuthConfigMap(ctx, cli, clusterName); err != nil { return err } @@ -243,14 +264,15 @@ func (o *options) run(cmd *cobra.Command) error { return err } - if err = createNodePoolResources(ctx, cmd, cli, clusterName, createKarpenterResources, inferenceMethod, debug); err != nil { + if err = CreateNodePoolResources(ctx, cmd, cli, clusterName, createKarpenterResources, inferenceMethod, debug); err != nil { return err } return displaySuccessMessage(cmd, clusterName, createKarpenterResources) } -func createCloudFormationStacks(ctx context.Context, cli *clients.Clients, clusterName string, karpenterNamespace string) error { +// CreateCloudFormationStacks creates or updates the Karpenter and DD-Karpenter CloudFormation stacks. +func CreateCloudFormationStacks(ctx context.Context, cli *clients.Clients, clusterName string, karpenterNamespace string) error { if err := aws.CreateOrUpdateStack(ctx, cli.CloudFormation, "dd-karpenter-"+clusterName+"-karpenter", KarpenterCfn, map[string]string{ "ClusterName": clusterName, }); err != nil { @@ -279,7 +301,8 @@ func createCloudFormationStacks(ctx context.Context, cli *clients.Clients, clust return nil } -func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterName string) error { +// UpdateAwsAuthConfigMap ensures the Karpenter node role is in the aws-auth ConfigMap. +func UpdateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterName string) error { awsAuthConfigMapPresent, err := guess.IsAwsAuthConfigMapPresent(ctx, cli.K8sClientset) if err != nil { return fmt.Errorf("failed to check if aws-auth ConfigMap is present: %w", err) @@ -290,7 +313,6 @@ func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterNa return nil } - // Get AWS account ID callerIdentity, err := cli.STS.GetCallerIdentity(ctx, &sts.GetCallerIdentityInput{}) if err != nil { return fmt.Errorf("failed to get identity caller: %w", err) @@ -300,7 +322,6 @@ func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterNa } accountID := *callerIdentity.Account - // Add role mapping in the `aws-auth` ConfigMap if err = aws.EnsureAwsAuthRole(ctx, cli.K8sClientset, aws.RoleMapping{ RoleArn: "arn:aws:iam::" + accountID + ":role/KarpenterNodeRole-" + clusterName, Username: "system:node:{{EC2PrivateDNSName}}", @@ -313,7 +334,12 @@ func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterNa } func (o *options) installHelmChart(ctx context.Context, clusterName string, karpenterNamespace string, karpenterVersion string, debug bool) error { - actionConfig, err := helm.NewActionConfig(o.ConfigFlags, karpenterNamespace) + return InstallOrUpgradeHelmChart(ctx, o.ConfigFlags, clusterName, karpenterNamespace, karpenterVersion, debug) +} + +// InstallOrUpgradeHelmChart installs or upgrades the Karpenter Helm chart with the plugin's standard values. +func InstallOrUpgradeHelmChart(ctx context.Context, configFlags *genericclioptions.ConfigFlags, clusterName, karpenterNamespace, karpenterVersion string, debug bool) error { + actionConfig, err := helm.NewActionConfig(configFlags, karpenterNamespace) if err != nil { return err } @@ -349,14 +375,15 @@ func (o *options) installHelmChart(ctx context.Context, clusterName string, karp }, } - if err = helm.CreateOrUpgrade(ctx, actionConfig, "karpenter", karpenterNamespace, karpenterOCIRegistry, karpenterVersion, values); err != nil { + if err = helm.CreateOrUpgrade(ctx, actionConfig, "karpenter", karpenterNamespace, KarpenterOCIRegistry, karpenterVersion, values); err != nil { return fmt.Errorf("failed to create or update Helm release: %w", err) } return nil } -func createNodePoolResources(ctx context.Context, cmd *cobra.Command, cli *clients.Clients, clusterName string, createResources CreateKarpenterResources, inferenceMethod InferenceMethod, debug bool) error { +// CreateNodePoolResources infers and creates or updates Karpenter NodePool and EC2NodeClass resources. +func CreateNodePoolResources(ctx context.Context, cmd *cobra.Command, cli *clients.Clients, clusterName string, createResources CreateKarpenterResources, inferenceMethod InferenceMethod, debug bool) error { if createResources == CreateKarpenterResourcesNone { return nil } @@ -418,6 +445,24 @@ func openAutoscalingSettingsURL(cmd *cobra.Command, clusterName string) string { return color.New(color.Bold, color.Underline, color.FgBlue).Sprint(autoscalingSettingsURL) } +func displayExternalKarpenterMessage(cmd *cobra.Command, clusterName string, existingNamespace string) error { + coloredURL := openAutoscalingSettingsURL(cmd, clusterName) + + display.PrintBox(cmd.OutOrStdout(), + "Karpenter is already installed on cluster "+clusterName+".", + "", + "An existing Karpenter installation was detected", + "in namespace \""+existingNamespace+"\".", + "This installation was not created by kubectl-datadog.", + "", + "To use Datadog autoscaling with an existing Karpenter,", + "navigate to the Autoscaling settings page:", + coloredURL, + ) + + return fmt.Errorf("existing Karpenter installation detected in namespace %q", existingNamespace) +} + func displayEKSAutoModeMessage(cmd *cobra.Command, clusterName string) error { coloredURL := openAutoscalingSettingsURL(cmd, clusterName) diff --git a/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go new file mode 100644 index 0000000000..3950ed93c1 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go @@ -0,0 +1,241 @@ +// Package upgrade provides functionality to upgrade a Karpenter installation +// that was previously deployed by the kubectl-datadog plugin. +package upgrade + +import ( + "context" + "errors" + "fmt" + "log" + "os/signal" + "syscall" + + "github.com/spf13/cobra" + "helm.sh/helm/v3/pkg/release" + "k8s.io/cli-runtime/pkg/genericclioptions" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clients" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/display" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/helm" + commonk8s "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/k8s" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install/guess" + "github.com/DataDog/datadog-operator/pkg/plugin/common" +) + +var ( + clusterName string + karpenterVersion string + updateKarpenterResources bool + inferenceMethod = install.InferenceMethodNodeGroups + debug bool + upgradeExample = ` + # upgrade autoscaling + %[1]s upgrade + + # upgrade to a specific Karpenter version + %[1]s upgrade --karpenter-version 1.1.0 + + # upgrade and re-infer NodePool/EC2NodeClass resources + %[1]s upgrade --update-karpenter-resources +` +) + +type options struct { + genericclioptions.IOStreams + common.Options + args []string +} + +func newOptions(streams genericclioptions.IOStreams) *options { + o := &options{ + IOStreams: streams, + } + o.SetConfigFlags() + return o +} + +// New provides a cobra command for upgrading an existing Karpenter installation. +func New(streams genericclioptions.IOStreams) *cobra.Command { + o := newOptions(streams) + cmd := &cobra.Command{ + Use: "upgrade", + Short: "Upgrade Karpenter on an EKS cluster", + Long: "Upgrade a Karpenter installation that was previously deployed by kubectl-datadog. The installation namespace is auto-detected. Helm values are reset to the plugin's defaults on each upgrade.", + Example: fmt.Sprintf(upgradeExample, "kubectl datadog autoscaling cluster"), + SilenceUsage: true, + RunE: func(c *cobra.Command, args []string) error { + if err := o.complete(c, args); err != nil { + return err + } + if err := o.validate(); err != nil { + return err + } + + return o.run(c) + }, + } + + cmd.Flags().StringVar(&clusterName, "cluster-name", "", "Name of the EKS cluster (auto-detected from existing installation if not specified)") + cmd.Flags().StringVar(&karpenterVersion, "karpenter-version", "", "Version of Karpenter to upgrade to (default to latest)") + cmd.Flags().BoolVar(&updateKarpenterResources, "update-karpenter-resources", false, "Re-infer and update NodePool/EC2NodeClass resources") + cmd.Flags().Var(&inferenceMethod, "inference-method", "Method to infer EC2NodeClass and NodePool properties: nodes, nodegroups (only used with --update-karpenter-resources)") + cmd.Flags().BoolVar(&debug, "debug", false, "Enable debug logs") + + o.ConfigFlags.AddFlags(cmd.Flags()) + + return cmd +} + +func (o *options) complete(cmd *cobra.Command, args []string) error { + o.args = args + return o.Init(cmd) +} + +func (o *options) validate() error { + if len(o.args) > 0 { + return errors.New("no arguments are allowed") + } + + return nil +} + +func (o *options) run(cmd *cobra.Command) error { + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + + log.SetOutput(cmd.OutOrStderr()) + ctrl.SetLogger(zap.New(zap.UseDevMode(false), zap.WriteTo(cmd.ErrOrStderr()))) + + // Detect the existing installation and get the Helm release in one pass + karpenterNamespace, rel, err := o.detectInstallation(ctx) + if err != nil { + return err + } + + // Resolve cluster name: flag > Helm release values > kubeconfig + if clusterName == "" { + clusterName = extractClusterName(rel.Config) + } + if clusterName == "" { + if name, err := clients.GetClusterNameFromKubeconfig(ctx, o.ConfigFlags); err != nil { + return err + } else if name != "" { + clusterName = name + } else { + return errors.New("cluster name must be specified either via --cluster-name, from the existing installation, or in the current kubeconfig context") + } + } + + if autoModeEnabled, err := guess.IsEKSAutoModeEnabled(o.DiscoveryClient); err != nil { + return fmt.Errorf("failed to check for EKS auto-mode: %w", err) + } else if autoModeEnabled { + return fmt.Errorf("EKS auto-mode is active on cluster %s; Karpenter is built into EKS auto-mode and cannot be upgraded separately", clusterName) + } + + display.PrintBox(cmd.OutOrStdout(), "Upgrading Karpenter on cluster "+clusterName+" (namespace: "+karpenterNamespace+").") + + cli, err := clients.Build(ctx, o.ConfigFlags, o.Clientset) + if err != nil { + return fmt.Errorf("failed to build clients: %w", err) + } + + if err = install.CreateCloudFormationStacks(ctx, cli, clusterName, karpenterNamespace); err != nil { + return err + } + + if err = install.UpdateAwsAuthConfigMap(ctx, cli, clusterName); err != nil { + return err + } + + if err = install.InstallOrUpgradeHelmChart(ctx, o.ConfigFlags, clusterName, karpenterNamespace, karpenterVersion, debug); err != nil { + return err + } + + if updateKarpenterResources { + if err = install.CreateNodePoolResources(ctx, cmd, cli, clusterName, install.CreateKarpenterResourcesAll, inferenceMethod, debug); err != nil { + return err + } + } + + display.PrintBox(cmd.OutOrStdout(), "Karpenter upgraded on cluster "+clusterName+".") + + return nil +} + +// detectInstallation finds the existing Karpenter installation namespace and +// returns the Helm release. Checks webhooks first, then falls back to Helm secret scanning. +func (o *options) detectInstallation(ctx context.Context) (string, *release.Release, error) { + found, namespace, err := commonk8s.DetectActiveKarpenter(ctx, o.Clientset) + if err != nil { + return "", nil, fmt.Errorf("failed to detect active Karpenter: %w", err) + } + + if found && namespace == "" { + return "", nil, fmt.Errorf("an external Karpenter installation was detected (URL-based webhook); upgrade is only supported for installations created by kubectl-datadog") + } + + if found { + isOurs, rel, checkErr := helm.IsOurRelease(ctx, o.ConfigFlags, namespace, "karpenter") + if checkErr != nil { + return "", nil, fmt.Errorf("failed to check Karpenter Helm release ownership: %w", checkErr) + } + if !isOurs { + return "", nil, fmt.Errorf("the Karpenter installation in namespace %q was not created by kubectl-datadog", namespace) + } + log.Printf("Found kubectl-datadog Karpenter installation in namespace %q.", namespace) + return namespace, rel, nil + } + + // Fallback: scan for Helm release secrets across namespaces + helmFound, namespaces, err := commonk8s.FindKarpenterHelmRelease(ctx, o.Clientset) + if err != nil { + return "", nil, fmt.Errorf("failed to find Karpenter Helm release: %w", err) + } + + if !helmFound { + return "", nil, errors.New("no Karpenter installation found; use 'kubectl datadog autoscaling cluster install' first") + } + + var ours []string + var oursRelease *release.Release + var checkErrors []error + for _, ns := range namespaces { + isOurs, rel, checkErr := helm.IsOurRelease(ctx, o.ConfigFlags, ns, "karpenter") + if checkErr != nil { + checkErrors = append(checkErrors, fmt.Errorf("namespace %q: %w", ns, checkErr)) + continue + } + if isOurs { + ours = append(ours, ns) + oursRelease = rel + } + } + + switch len(ours) { + case 0: + if len(checkErrors) > 0 { + return "", nil, fmt.Errorf("failed to check Karpenter Helm release ownership: %w", errors.Join(checkErrors...)) + } + return "", nil, fmt.Errorf("found Karpenter Helm release(s) but none were created by kubectl-datadog") + case 1: + log.Printf("Found kubectl-datadog Karpenter installation in namespace %q.", ours[0]) + return ours[0], oursRelease, nil + default: + return "", nil, fmt.Errorf("multiple kubectl-datadog Karpenter installations found in namespaces %v; this is unexpected — please uninstall the extra release(s) first", ours) + } +} + +func extractClusterName(config map[string]any) string { + if config == nil { + return "" + } + settings, ok := config["settings"].(map[string]any) + if !ok { + return "" + } + name, _ := settings["clusterName"].(string) + return name +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade_test.go b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade_test.go new file mode 100644 index 0000000000..a85d78ea6c --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade_test.go @@ -0,0 +1,104 @@ +package upgrade + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestValidate(t *testing.T) { + for _, tc := range []struct { + name string + args []string + expectError bool + }{ + { + name: "No arguments", + args: []string{}, + expectError: false, + }, + { + name: "With arguments", + args: []string{"arg1"}, + expectError: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + o := &options{args: tc.args} + err := o.validate() + + if tc.expectError { + assert.Error(t, err) + assert.Contains(t, err.Error(), "no arguments are allowed") + } else { + assert.NoError(t, err) + } + }) + } +} + +func TestExtractClusterName(t *testing.T) { + for _, tc := range []struct { + name string + config map[string]any + expected string + }{ + { + name: "Nil config", + config: nil, + expected: "", + }, + { + name: "Empty config", + config: map[string]any{}, + expected: "", + }, + { + name: "Missing settings key", + config: map[string]any{ + "additionalLabels": map[string]any{}, + }, + expected: "", + }, + { + name: "Settings is wrong type", + config: map[string]any{ + "settings": "not-a-map", + }, + expected: "", + }, + { + name: "Missing clusterName in settings", + config: map[string]any{ + "settings": map[string]any{ + "interruptionQueue": "my-cluster", + }, + }, + expected: "", + }, + { + name: "clusterName is wrong type", + config: map[string]any{ + "settings": map[string]any{ + "clusterName": 42, + }, + }, + expected: "", + }, + { + name: "Valid clusterName", + config: map[string]any{ + "settings": map[string]any{ + "clusterName": "my-eks-cluster", + "interruptionQueue": "my-eks-cluster", + }, + }, + expected: "my-eks-cluster", + }, + } { + t.Run(tc.name, func(t *testing.T) { + result := extractClusterName(tc.config) + assert.Equal(t, tc.expected, result) + }) + } +} From eb759390913159bd72a708f1194620983c189939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9na=C3=AFc=20Huard?= Date: Fri, 10 Apr 2026 18:34:59 +0200 Subject: [PATCH 2/4] Fix govet shadow warnings in upgrade command Avoid variable shadowing of `err` in the upgrade run() method by using plain assignments instead of if-init statements. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../autoscaling/cluster/upgrade/upgrade.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go index 3950ed93c1..1c6e98f5cd 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go @@ -120,18 +120,22 @@ func (o *options) run(cmd *cobra.Command) error { clusterName = extractClusterName(rel.Config) } if clusterName == "" { - if name, err := clients.GetClusterNameFromKubeconfig(ctx, o.ConfigFlags); err != nil { + name, err := clients.GetClusterNameFromKubeconfig(ctx, o.ConfigFlags) + if err != nil { return err - } else if name != "" { + } + if name != "" { clusterName = name } else { return errors.New("cluster name must be specified either via --cluster-name, from the existing installation, or in the current kubeconfig context") } } - if autoModeEnabled, err := guess.IsEKSAutoModeEnabled(o.DiscoveryClient); err != nil { + autoModeEnabled, err := guess.IsEKSAutoModeEnabled(o.DiscoveryClient) + if err != nil { return fmt.Errorf("failed to check for EKS auto-mode: %w", err) - } else if autoModeEnabled { + } + if autoModeEnabled { return fmt.Errorf("EKS auto-mode is active on cluster %s; Karpenter is built into EKS auto-mode and cannot be upgraded separately", clusterName) } From f15baa9e41ed0ef47fbfe5a9904e0e99336a2834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9na=C3=AFc=20Huard?= Date: Fri, 10 Apr 2026 18:42:22 +0200 Subject: [PATCH 3/4] Fix remaining govet shadow warning in upgrade command Use explicit var declaration and assignment instead of short variable declaration to avoid shadowing the outer err variable. Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go index 1c6e98f5cd..8f4c99f614 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go @@ -120,7 +120,8 @@ func (o *options) run(cmd *cobra.Command) error { clusterName = extractClusterName(rel.Config) } if clusterName == "" { - name, err := clients.GetClusterNameFromKubeconfig(ctx, o.ConfigFlags) + var name string + name, err = clients.GetClusterNameFromKubeconfig(ctx, o.ConfigFlags) if err != nil { return err } From 8f871c2868d6181ae038743d881360ed1f7b5ca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9na=C3=AFc=20Huard?= Date: Fri, 10 Apr 2026 19:03:40 +0200 Subject: [PATCH 4/4] Add Helm-release fallback to install Karpenter guard When DetectActiveKarpenter finds no webhooks, fall back to FindKarpenterHelmRelease to detect orphaned Helm releases (e.g. when Karpenter pods have crashed but the release still exists). This prevents creating a duplicate installation. The upgrade command already used this fallback; this aligns the install command's detection logic. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../autoscaling/cluster/install/install.go | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go index 828c89ebf2..3487f48006 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go @@ -226,7 +226,8 @@ func (o *options) run(cmd *cobra.Command) error { return displayEKSAutoModeMessage(cmd, clusterName) } - // Check for existing Karpenter installation + // Check for existing Karpenter installation via webhooks first, + // then fall back to Helm release secrets (handles crashed/absent webhooks). if found, webhookNamespace, err := commonk8s.DetectActiveKarpenter(ctx, o.Clientset); err != nil { return fmt.Errorf("failed to check for existing Karpenter installation: %w", err) } else if found { @@ -243,6 +244,33 @@ func (o *options) run(cmd *cobra.Command) error { } // It's ours — use the detected namespace for the rest of the install karpenterNamespace = webhookNamespace + } else { + // No webhooks found — check for orphaned Helm releases (e.g. pods crashed). + helmFound, namespaces, helmErr := commonk8s.FindKarpenterHelmRelease(ctx, o.Clientset) + if helmErr != nil { + return fmt.Errorf("failed to search for Karpenter Helm releases: %w", helmErr) + } + if helmFound { + foundOurs := false + var externalNS string + for _, ns := range namespaces { + isOurs, _, checkErr := helm.IsOurRelease(ctx, o.ConfigFlags, ns, "karpenter") + if checkErr != nil { + return fmt.Errorf("failed to check Karpenter Helm release ownership in namespace %q: %w", ns, checkErr) + } + if isOurs { + // Our previous install exists but webhooks are gone — reuse its namespace. + karpenterNamespace = ns + foundOurs = true + break + } + externalNS = ns + } + if !foundOurs && externalNS != "" { + // Found Helm release(s) but none are ours — treat as external. + return displayExternalKarpenterMessage(cmd, clusterName, externalNS) + } + } } display.PrintBox(cmd.OutOrStdout(), "Installing Karpenter on cluster "+clusterName+".")