diff --git a/cmd/kubectl-datadog/autoscaling/cluster/cluster.go b/cmd/kubectl-datadog/autoscaling/cluster/cluster.go index ce16cfcf8d..cca92aed9c 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/cluster.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/cluster.go @@ -8,6 +8,7 @@ import ( "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/uninstall" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/upgrade" ) // options provides information required by cluster command @@ -33,6 +34,7 @@ func New(streams genericclioptions.IOStreams) *cobra.Command { cmd.AddCommand(install.New(streams)) cmd.AddCommand(uninstall.New(streams)) + cmd.AddCommand(upgrade.New(streams)) o := newOptions(streams) o.configFlags.AddFlags(cmd.Flags()) diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/helm/helm.go b/cmd/kubectl-datadog/autoscaling/cluster/common/helm/helm.go index ef7821ec8f..71f05a0585 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/common/helm/helm.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/helm/helm.go @@ -128,6 +128,44 @@ func upgrade(ctx context.Context, ac *action.Configuration, releaseName, namespa return nil } +// GetRelease retrieves the full Helm release including config values. +func GetRelease(_ context.Context, ac *action.Configuration, releaseName string) (*release.Release, error) { + getAction := action.NewGet(ac) + rel, err := getAction.Run(releaseName) + if err != nil { + return nil, fmt.Errorf("failed to get Helm release %s: %w", releaseName, err) + } + return rel, nil +} + +// IsOurRelease checks whether a Karpenter Helm release in the given namespace +// was installed by the kubectl-datadog plugin. Returns the release for reuse +// by the caller. +func IsOurRelease(ctx context.Context, configFlags *genericclioptions.ConfigFlags, namespace, releaseName string) (bool, *release.Release, error) { + actionConfig, err := NewActionConfig(configFlags, namespace) + if err != nil { + return false, nil, fmt.Errorf("failed to create Helm action config for namespace %s: %w", namespace, err) + } + + rel, err := GetRelease(ctx, actionConfig, releaseName) + if err != nil { + return false, nil, err + } + + if rel == nil || rel.Config == nil { + return false, nil, nil + } + + labels, ok := rel.Config["additionalLabels"].(map[string]any) + if !ok { + return false, rel, nil + } + + managedBy, ok := labels["app.kubernetes.io/managed-by"].(string) + isOurs := ok && managedBy == "kubectl-datadog" + return isOurs, rel, nil +} + func Uninstall(ctx context.Context, ac *action.Configuration, releaseName string) error { exist, err := Exists(ctx, ac, releaseName) if err != nil { diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter.go b/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter.go new file mode 100644 index 0000000000..ac9c1489ff --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter.go @@ -0,0 +1,94 @@ +package k8s + +import ( + "context" + "fmt" + + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +var karpenterAPIGroups = map[string]bool{ + "karpenter.sh": true, + "karpenter.k8s.aws": true, +} + +const helmReleaseNameKarpenter = "karpenter" + +// DetectActiveKarpenter checks for an active Karpenter installation by looking for +// webhook configurations that reference Karpenter API groups. Returns the namespace +// where Karpenter's webhook service is running. +func DetectActiveKarpenter(ctx context.Context, clientset kubernetes.Interface) (bool, string, error) { + vwcList, err := clientset.AdmissionregistrationV1().ValidatingWebhookConfigurations().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, "", fmt.Errorf("failed to list ValidatingWebhookConfigurations: %w", err) + } + + for _, vwc := range vwcList.Items { + for _, webhook := range vwc.Webhooks { + if ns, ok := extractKarpenterNamespace(webhook.Rules, webhook.ClientConfig); ok { + return true, ns, nil + } + } + } + + mwcList, err := clientset.AdmissionregistrationV1().MutatingWebhookConfigurations().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, "", fmt.Errorf("failed to list MutatingWebhookConfigurations: %w", err) + } + + for _, mwc := range mwcList.Items { + for _, webhook := range mwc.Webhooks { + if ns, ok := extractKarpenterNamespace(webhook.Rules, webhook.ClientConfig); ok { + return true, ns, nil + } + } + } + + return false, "", nil +} + +func extractKarpenterNamespace(rules []admissionregistrationv1.RuleWithOperations, clientConfig admissionregistrationv1.WebhookClientConfig) (string, bool) { + for _, rule := range rules { + for _, group := range rule.APIGroups { + if karpenterAPIGroups[group] { + if clientConfig.Service != nil { + return clientConfig.Service.Namespace, true + } + // URL-based webhook — Karpenter is present but we can't + // determine the namespace (e.g. external/out-of-cluster). + return "", true + } + } + } + return "", false +} + +// FindKarpenterHelmRelease searches for a deployed Helm release named "karpenter" +// across all namespaces by looking at Helm storage secrets. This is a fallback +// for when webhooks are absent (e.g. pods crashed) but the Helm release still exists. +func FindKarpenterHelmRelease(ctx context.Context, clientset kubernetes.Interface) (bool, []string, error) { + secrets, err := clientset.CoreV1().Secrets("").List(ctx, metav1.ListOptions{ + LabelSelector: "owner=helm,name=" + helmReleaseNameKarpenter + ",status=deployed", + }) + if err != nil { + return false, nil, fmt.Errorf("failed to list Helm release secrets: %w", err) + } + + if len(secrets.Items) == 0 { + return false, nil, nil + } + + // Deduplicate namespaces (multiple revisions may exist in the same namespace) + seen := map[string]bool{} + var namespaces []string + for _, s := range secrets.Items { + if !seen[s.Namespace] { + seen[s.Namespace] = true + namespaces = append(namespaces, s.Namespace) + } + } + + return true, namespaces, nil +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter_test.go b/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter_test.go new file mode 100644 index 0000000000..feb6205bb1 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/k8s/karpenter_test.go @@ -0,0 +1,353 @@ +package k8s + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" +) + +func TestDetectActiveKarpenter(t *testing.T) { + for _, tc := range []struct { + name string + objects []runtime.Object + expectedFound bool + expectedNamespace string + }{ + { + name: "No webhook configurations", + objects: nil, + expectedFound: false, + }, + { + name: "Karpenter ValidatingWebhookConfiguration in dd-karpenter", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-validation"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validation.karpenter.sh", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.sh"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "dd-karpenter", + Name: "karpenter", + }, + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "dd-karpenter", + }, + { + name: "Karpenter ValidatingWebhookConfiguration in custom namespace", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-validation"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validation.karpenter.k8s.aws", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.k8s.aws"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "karpenter", + Name: "karpenter", + }, + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "karpenter", + }, + { + name: "Non-Karpenter ValidatingWebhookConfiguration only", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "other-webhook"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validate.something.io", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"apps"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "system", + Name: "webhook-svc", + }, + }, + }, + }, + }, + }, + expectedFound: false, + }, + { + name: "Karpenter MutatingWebhookConfiguration detected", + objects: []runtime.Object{ + &admissionregistrationv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-defaulting"}, + Webhooks: []admissionregistrationv1.MutatingWebhook{ + { + Name: "defaulting.karpenter.sh", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.sh"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "dd-karpenter", + Name: "karpenter", + }, + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "dd-karpenter", + }, + { + name: "Karpenter webhook with URL-based config (no service)", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-validation"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validation.karpenter.sh", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.sh"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + URL: ptrString("https://external-karpenter.example.com/validate"), + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "", + }, + { + name: "Multiple webhooks, only one is Karpenter", + objects: []runtime.Object{ + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "other-webhook"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validate.other.io", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"apps"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "other", + Name: "other-svc", + }, + }, + }, + }, + }, + &admissionregistrationv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: "karpenter-validation"}, + Webhooks: []admissionregistrationv1.ValidatingWebhook{ + { + Name: "validation.karpenter.sh", + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{"karpenter.sh"}, + }, + }, + }, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Namespace: "dd-karpenter", + Name: "karpenter", + }, + }, + }, + }, + }, + }, + expectedFound: true, + expectedNamespace: "dd-karpenter", + }, + } { + t.Run(tc.name, func(t *testing.T) { + clientset := fake.NewSimpleClientset(tc.objects...) + + found, ns, err := DetectActiveKarpenter(context.Background(), clientset) + assert.NoError(t, err) + assert.Equal(t, tc.expectedFound, found) + assert.Equal(t, tc.expectedNamespace, ns) + }) + } +} + +func TestFindKarpenterHelmRelease(t *testing.T) { + for _, tc := range []struct { + name string + objects []runtime.Object + expectedFound bool + expectedNamespaces []string + }{ + { + name: "No Helm secrets", + objects: nil, + expectedFound: false, + }, + { + name: "Karpenter Helm secret in dd-karpenter", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v1", + Namespace: "dd-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "deployed", + }, + }, + }, + }, + expectedFound: true, + expectedNamespaces: []string{"dd-karpenter"}, + }, + { + name: "Karpenter Helm secret in custom namespace", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v3", + Namespace: "my-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "deployed", + }, + }, + }, + }, + expectedFound: true, + expectedNamespaces: []string{"my-karpenter"}, + }, + { + name: "Karpenter Helm secret with superseded status (not deployed)", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v1", + Namespace: "dd-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "superseded", + }, + }, + }, + }, + expectedFound: false, + }, + { + name: "Different Helm release (not karpenter)", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.datadog.v1", + Namespace: "datadog", + Labels: map[string]string{ + "owner": "helm", + "name": "datadog", + "status": "deployed", + }, + }, + }, + }, + expectedFound: false, + }, + { + name: "Multiple revisions in same namespace are deduplicated", + objects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v1", + Namespace: "dd-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "deployed", + }, + }, + }, + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sh.helm.release.v1.karpenter.v2", + Namespace: "dd-karpenter", + Labels: map[string]string{ + "owner": "helm", + "name": "karpenter", + "status": "deployed", + }, + }, + }, + }, + expectedFound: true, + expectedNamespaces: []string{"dd-karpenter"}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + clientset := fake.NewSimpleClientset(tc.objects...) + + found, namespaces, err := FindKarpenterHelmRelease(context.Background(), clientset) + assert.NoError(t, err) + assert.Equal(t, tc.expectedFound, found) + if tc.expectedNamespaces == nil { + assert.Nil(t, namespaces) + } else { + assert.ElementsMatch(t, tc.expectedNamespaces, namespaces) + } + }) + } +} + +func ptrString(s string) *string { + return &s +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go index df410e5d28..3487f48006 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go @@ -28,6 +28,7 @@ import ( "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clients" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/display" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/helm" + commonk8s "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/k8s" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install/guess" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install/k8s" "github.com/DataDog/datadog-operator/pkg/plugin/common" @@ -37,7 +38,8 @@ import ( ) const ( - karpenterOCIRegistry = "oci://public.ecr.aws/karpenter/karpenter" + // KarpenterOCIRegistry is the OCI registry URL for the Karpenter Helm chart. + KarpenterOCIRegistry = "oci://public.ecr.aws/karpenter/karpenter" ) var ( @@ -224,6 +226,53 @@ func (o *options) run(cmd *cobra.Command) error { return displayEKSAutoModeMessage(cmd, clusterName) } + // Check for existing Karpenter installation via webhooks first, + // then fall back to Helm release secrets (handles crashed/absent webhooks). + if found, webhookNamespace, err := commonk8s.DetectActiveKarpenter(ctx, o.Clientset); err != nil { + return fmt.Errorf("failed to check for existing Karpenter installation: %w", err) + } else if found { + if webhookNamespace == "" { + // Karpenter webhook found but uses a URL (external/out-of-cluster) — treat as external + return displayExternalKarpenterMessage(cmd, clusterName, "unknown (URL-based webhook)") + } + isOurs, _, checkErr := helm.IsOurRelease(ctx, o.ConfigFlags, webhookNamespace, "karpenter") + if checkErr != nil { + return fmt.Errorf("failed to check Karpenter Helm release ownership: %w", checkErr) + } + if !isOurs { + return displayExternalKarpenterMessage(cmd, clusterName, webhookNamespace) + } + // It's ours — use the detected namespace for the rest of the install + karpenterNamespace = webhookNamespace + } else { + // No webhooks found — check for orphaned Helm releases (e.g. pods crashed). + helmFound, namespaces, helmErr := commonk8s.FindKarpenterHelmRelease(ctx, o.Clientset) + if helmErr != nil { + return fmt.Errorf("failed to search for Karpenter Helm releases: %w", helmErr) + } + if helmFound { + foundOurs := false + var externalNS string + for _, ns := range namespaces { + isOurs, _, checkErr := helm.IsOurRelease(ctx, o.ConfigFlags, ns, "karpenter") + if checkErr != nil { + return fmt.Errorf("failed to check Karpenter Helm release ownership in namespace %q: %w", ns, checkErr) + } + if isOurs { + // Our previous install exists but webhooks are gone — reuse its namespace. + karpenterNamespace = ns + foundOurs = true + break + } + externalNS = ns + } + if !foundOurs && externalNS != "" { + // Found Helm release(s) but none are ours — treat as external. + return displayExternalKarpenterMessage(cmd, clusterName, externalNS) + } + } + } + display.PrintBox(cmd.OutOrStdout(), "Installing Karpenter on cluster "+clusterName+".") cli, err := clients.Build(ctx, o.ConfigFlags, o.Clientset) @@ -231,11 +280,11 @@ func (o *options) run(cmd *cobra.Command) error { return fmt.Errorf("failed to build clients: %w", err) } - if err = createCloudFormationStacks(ctx, cli, clusterName, karpenterNamespace); err != nil { + if err = CreateCloudFormationStacks(ctx, cli, clusterName, karpenterNamespace); err != nil { return err } - if err = updateAwsAuthConfigMap(ctx, cli, clusterName); err != nil { + if err = UpdateAwsAuthConfigMap(ctx, cli, clusterName); err != nil { return err } @@ -243,14 +292,15 @@ func (o *options) run(cmd *cobra.Command) error { return err } - if err = createNodePoolResources(ctx, cmd, cli, clusterName, createKarpenterResources, inferenceMethod, debug); err != nil { + if err = CreateNodePoolResources(ctx, cmd, cli, clusterName, createKarpenterResources, inferenceMethod, debug); err != nil { return err } return displaySuccessMessage(cmd, clusterName, createKarpenterResources) } -func createCloudFormationStacks(ctx context.Context, cli *clients.Clients, clusterName string, karpenterNamespace string) error { +// CreateCloudFormationStacks creates or updates the Karpenter and DD-Karpenter CloudFormation stacks. +func CreateCloudFormationStacks(ctx context.Context, cli *clients.Clients, clusterName string, karpenterNamespace string) error { if err := aws.CreateOrUpdateStack(ctx, cli.CloudFormation, "dd-karpenter-"+clusterName+"-karpenter", KarpenterCfn, map[string]string{ "ClusterName": clusterName, }); err != nil { @@ -279,7 +329,8 @@ func createCloudFormationStacks(ctx context.Context, cli *clients.Clients, clust return nil } -func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterName string) error { +// UpdateAwsAuthConfigMap ensures the Karpenter node role is in the aws-auth ConfigMap. +func UpdateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterName string) error { awsAuthConfigMapPresent, err := guess.IsAwsAuthConfigMapPresent(ctx, cli.K8sClientset) if err != nil { return fmt.Errorf("failed to check if aws-auth ConfigMap is present: %w", err) @@ -290,7 +341,6 @@ func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterNa return nil } - // Get AWS account ID callerIdentity, err := cli.STS.GetCallerIdentity(ctx, &sts.GetCallerIdentityInput{}) if err != nil { return fmt.Errorf("failed to get identity caller: %w", err) @@ -300,7 +350,6 @@ func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterNa } accountID := *callerIdentity.Account - // Add role mapping in the `aws-auth` ConfigMap if err = aws.EnsureAwsAuthRole(ctx, cli.K8sClientset, aws.RoleMapping{ RoleArn: "arn:aws:iam::" + accountID + ":role/KarpenterNodeRole-" + clusterName, Username: "system:node:{{EC2PrivateDNSName}}", @@ -313,7 +362,12 @@ func updateAwsAuthConfigMap(ctx context.Context, cli *clients.Clients, clusterNa } func (o *options) installHelmChart(ctx context.Context, clusterName string, karpenterNamespace string, karpenterVersion string, debug bool) error { - actionConfig, err := helm.NewActionConfig(o.ConfigFlags, karpenterNamespace) + return InstallOrUpgradeHelmChart(ctx, o.ConfigFlags, clusterName, karpenterNamespace, karpenterVersion, debug) +} + +// InstallOrUpgradeHelmChart installs or upgrades the Karpenter Helm chart with the plugin's standard values. +func InstallOrUpgradeHelmChart(ctx context.Context, configFlags *genericclioptions.ConfigFlags, clusterName, karpenterNamespace, karpenterVersion string, debug bool) error { + actionConfig, err := helm.NewActionConfig(configFlags, karpenterNamespace) if err != nil { return err } @@ -349,14 +403,15 @@ func (o *options) installHelmChart(ctx context.Context, clusterName string, karp }, } - if err = helm.CreateOrUpgrade(ctx, actionConfig, "karpenter", karpenterNamespace, karpenterOCIRegistry, karpenterVersion, values); err != nil { + if err = helm.CreateOrUpgrade(ctx, actionConfig, "karpenter", karpenterNamespace, KarpenterOCIRegistry, karpenterVersion, values); err != nil { return fmt.Errorf("failed to create or update Helm release: %w", err) } return nil } -func createNodePoolResources(ctx context.Context, cmd *cobra.Command, cli *clients.Clients, clusterName string, createResources CreateKarpenterResources, inferenceMethod InferenceMethod, debug bool) error { +// CreateNodePoolResources infers and creates or updates Karpenter NodePool and EC2NodeClass resources. +func CreateNodePoolResources(ctx context.Context, cmd *cobra.Command, cli *clients.Clients, clusterName string, createResources CreateKarpenterResources, inferenceMethod InferenceMethod, debug bool) error { if createResources == CreateKarpenterResourcesNone { return nil } @@ -418,6 +473,24 @@ func openAutoscalingSettingsURL(cmd *cobra.Command, clusterName string) string { return color.New(color.Bold, color.Underline, color.FgBlue).Sprint(autoscalingSettingsURL) } +func displayExternalKarpenterMessage(cmd *cobra.Command, clusterName string, existingNamespace string) error { + coloredURL := openAutoscalingSettingsURL(cmd, clusterName) + + display.PrintBox(cmd.OutOrStdout(), + "Karpenter is already installed on cluster "+clusterName+".", + "", + "An existing Karpenter installation was detected", + "in namespace \""+existingNamespace+"\".", + "This installation was not created by kubectl-datadog.", + "", + "To use Datadog autoscaling with an existing Karpenter,", + "navigate to the Autoscaling settings page:", + coloredURL, + ) + + return fmt.Errorf("existing Karpenter installation detected in namespace %q", existingNamespace) +} + func displayEKSAutoModeMessage(cmd *cobra.Command, clusterName string) error { coloredURL := openAutoscalingSettingsURL(cmd, clusterName) diff --git a/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go new file mode 100644 index 0000000000..8f4c99f614 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade.go @@ -0,0 +1,246 @@ +// Package upgrade provides functionality to upgrade a Karpenter installation +// that was previously deployed by the kubectl-datadog plugin. +package upgrade + +import ( + "context" + "errors" + "fmt" + "log" + "os/signal" + "syscall" + + "github.com/spf13/cobra" + "helm.sh/helm/v3/pkg/release" + "k8s.io/cli-runtime/pkg/genericclioptions" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clients" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/display" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/helm" + commonk8s "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/k8s" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install/guess" + "github.com/DataDog/datadog-operator/pkg/plugin/common" +) + +var ( + clusterName string + karpenterVersion string + updateKarpenterResources bool + inferenceMethod = install.InferenceMethodNodeGroups + debug bool + upgradeExample = ` + # upgrade autoscaling + %[1]s upgrade + + # upgrade to a specific Karpenter version + %[1]s upgrade --karpenter-version 1.1.0 + + # upgrade and re-infer NodePool/EC2NodeClass resources + %[1]s upgrade --update-karpenter-resources +` +) + +type options struct { + genericclioptions.IOStreams + common.Options + args []string +} + +func newOptions(streams genericclioptions.IOStreams) *options { + o := &options{ + IOStreams: streams, + } + o.SetConfigFlags() + return o +} + +// New provides a cobra command for upgrading an existing Karpenter installation. +func New(streams genericclioptions.IOStreams) *cobra.Command { + o := newOptions(streams) + cmd := &cobra.Command{ + Use: "upgrade", + Short: "Upgrade Karpenter on an EKS cluster", + Long: "Upgrade a Karpenter installation that was previously deployed by kubectl-datadog. The installation namespace is auto-detected. Helm values are reset to the plugin's defaults on each upgrade.", + Example: fmt.Sprintf(upgradeExample, "kubectl datadog autoscaling cluster"), + SilenceUsage: true, + RunE: func(c *cobra.Command, args []string) error { + if err := o.complete(c, args); err != nil { + return err + } + if err := o.validate(); err != nil { + return err + } + + return o.run(c) + }, + } + + cmd.Flags().StringVar(&clusterName, "cluster-name", "", "Name of the EKS cluster (auto-detected from existing installation if not specified)") + cmd.Flags().StringVar(&karpenterVersion, "karpenter-version", "", "Version of Karpenter to upgrade to (default to latest)") + cmd.Flags().BoolVar(&updateKarpenterResources, "update-karpenter-resources", false, "Re-infer and update NodePool/EC2NodeClass resources") + cmd.Flags().Var(&inferenceMethod, "inference-method", "Method to infer EC2NodeClass and NodePool properties: nodes, nodegroups (only used with --update-karpenter-resources)") + cmd.Flags().BoolVar(&debug, "debug", false, "Enable debug logs") + + o.ConfigFlags.AddFlags(cmd.Flags()) + + return cmd +} + +func (o *options) complete(cmd *cobra.Command, args []string) error { + o.args = args + return o.Init(cmd) +} + +func (o *options) validate() error { + if len(o.args) > 0 { + return errors.New("no arguments are allowed") + } + + return nil +} + +func (o *options) run(cmd *cobra.Command) error { + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + + log.SetOutput(cmd.OutOrStderr()) + ctrl.SetLogger(zap.New(zap.UseDevMode(false), zap.WriteTo(cmd.ErrOrStderr()))) + + // Detect the existing installation and get the Helm release in one pass + karpenterNamespace, rel, err := o.detectInstallation(ctx) + if err != nil { + return err + } + + // Resolve cluster name: flag > Helm release values > kubeconfig + if clusterName == "" { + clusterName = extractClusterName(rel.Config) + } + if clusterName == "" { + var name string + name, err = clients.GetClusterNameFromKubeconfig(ctx, o.ConfigFlags) + if err != nil { + return err + } + if name != "" { + clusterName = name + } else { + return errors.New("cluster name must be specified either via --cluster-name, from the existing installation, or in the current kubeconfig context") + } + } + + autoModeEnabled, err := guess.IsEKSAutoModeEnabled(o.DiscoveryClient) + if err != nil { + return fmt.Errorf("failed to check for EKS auto-mode: %w", err) + } + if autoModeEnabled { + return fmt.Errorf("EKS auto-mode is active on cluster %s; Karpenter is built into EKS auto-mode and cannot be upgraded separately", clusterName) + } + + display.PrintBox(cmd.OutOrStdout(), "Upgrading Karpenter on cluster "+clusterName+" (namespace: "+karpenterNamespace+").") + + cli, err := clients.Build(ctx, o.ConfigFlags, o.Clientset) + if err != nil { + return fmt.Errorf("failed to build clients: %w", err) + } + + if err = install.CreateCloudFormationStacks(ctx, cli, clusterName, karpenterNamespace); err != nil { + return err + } + + if err = install.UpdateAwsAuthConfigMap(ctx, cli, clusterName); err != nil { + return err + } + + if err = install.InstallOrUpgradeHelmChart(ctx, o.ConfigFlags, clusterName, karpenterNamespace, karpenterVersion, debug); err != nil { + return err + } + + if updateKarpenterResources { + if err = install.CreateNodePoolResources(ctx, cmd, cli, clusterName, install.CreateKarpenterResourcesAll, inferenceMethod, debug); err != nil { + return err + } + } + + display.PrintBox(cmd.OutOrStdout(), "Karpenter upgraded on cluster "+clusterName+".") + + return nil +} + +// detectInstallation finds the existing Karpenter installation namespace and +// returns the Helm release. Checks webhooks first, then falls back to Helm secret scanning. +func (o *options) detectInstallation(ctx context.Context) (string, *release.Release, error) { + found, namespace, err := commonk8s.DetectActiveKarpenter(ctx, o.Clientset) + if err != nil { + return "", nil, fmt.Errorf("failed to detect active Karpenter: %w", err) + } + + if found && namespace == "" { + return "", nil, fmt.Errorf("an external Karpenter installation was detected (URL-based webhook); upgrade is only supported for installations created by kubectl-datadog") + } + + if found { + isOurs, rel, checkErr := helm.IsOurRelease(ctx, o.ConfigFlags, namespace, "karpenter") + if checkErr != nil { + return "", nil, fmt.Errorf("failed to check Karpenter Helm release ownership: %w", checkErr) + } + if !isOurs { + return "", nil, fmt.Errorf("the Karpenter installation in namespace %q was not created by kubectl-datadog", namespace) + } + log.Printf("Found kubectl-datadog Karpenter installation in namespace %q.", namespace) + return namespace, rel, nil + } + + // Fallback: scan for Helm release secrets across namespaces + helmFound, namespaces, err := commonk8s.FindKarpenterHelmRelease(ctx, o.Clientset) + if err != nil { + return "", nil, fmt.Errorf("failed to find Karpenter Helm release: %w", err) + } + + if !helmFound { + return "", nil, errors.New("no Karpenter installation found; use 'kubectl datadog autoscaling cluster install' first") + } + + var ours []string + var oursRelease *release.Release + var checkErrors []error + for _, ns := range namespaces { + isOurs, rel, checkErr := helm.IsOurRelease(ctx, o.ConfigFlags, ns, "karpenter") + if checkErr != nil { + checkErrors = append(checkErrors, fmt.Errorf("namespace %q: %w", ns, checkErr)) + continue + } + if isOurs { + ours = append(ours, ns) + oursRelease = rel + } + } + + switch len(ours) { + case 0: + if len(checkErrors) > 0 { + return "", nil, fmt.Errorf("failed to check Karpenter Helm release ownership: %w", errors.Join(checkErrors...)) + } + return "", nil, fmt.Errorf("found Karpenter Helm release(s) but none were created by kubectl-datadog") + case 1: + log.Printf("Found kubectl-datadog Karpenter installation in namespace %q.", ours[0]) + return ours[0], oursRelease, nil + default: + return "", nil, fmt.Errorf("multiple kubectl-datadog Karpenter installations found in namespaces %v; this is unexpected — please uninstall the extra release(s) first", ours) + } +} + +func extractClusterName(config map[string]any) string { + if config == nil { + return "" + } + settings, ok := config["settings"].(map[string]any) + if !ok { + return "" + } + name, _ := settings["clusterName"].(string) + return name +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade_test.go b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade_test.go new file mode 100644 index 0000000000..a85d78ea6c --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/upgrade/upgrade_test.go @@ -0,0 +1,104 @@ +package upgrade + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestValidate(t *testing.T) { + for _, tc := range []struct { + name string + args []string + expectError bool + }{ + { + name: "No arguments", + args: []string{}, + expectError: false, + }, + { + name: "With arguments", + args: []string{"arg1"}, + expectError: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + o := &options{args: tc.args} + err := o.validate() + + if tc.expectError { + assert.Error(t, err) + assert.Contains(t, err.Error(), "no arguments are allowed") + } else { + assert.NoError(t, err) + } + }) + } +} + +func TestExtractClusterName(t *testing.T) { + for _, tc := range []struct { + name string + config map[string]any + expected string + }{ + { + name: "Nil config", + config: nil, + expected: "", + }, + { + name: "Empty config", + config: map[string]any{}, + expected: "", + }, + { + name: "Missing settings key", + config: map[string]any{ + "additionalLabels": map[string]any{}, + }, + expected: "", + }, + { + name: "Settings is wrong type", + config: map[string]any{ + "settings": "not-a-map", + }, + expected: "", + }, + { + name: "Missing clusterName in settings", + config: map[string]any{ + "settings": map[string]any{ + "interruptionQueue": "my-cluster", + }, + }, + expected: "", + }, + { + name: "clusterName is wrong type", + config: map[string]any{ + "settings": map[string]any{ + "clusterName": 42, + }, + }, + expected: "", + }, + { + name: "Valid clusterName", + config: map[string]any{ + "settings": map[string]any{ + "clusterName": "my-eks-cluster", + "interruptionQueue": "my-eks-cluster", + }, + }, + expected: "my-eks-cluster", + }, + } { + t.Run(tc.name, func(t *testing.T) { + result := extractClusterName(tc.config) + assert.Equal(t, tc.expected, result) + }) + } +}