From 154219267dc9406dfe59e639b71c19371733c20c Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Wed, 1 Apr 2026 17:15:23 -0400 Subject: [PATCH 01/15] initial experiment signals --- cmd/main.go | 2 +- pkg/fleet/daemon.go | 189 ++++++++++-- pkg/fleet/daemon_test.go | 508 ++++++++++++++++++++++++++++++++ pkg/fleet/experiment.go | 118 ++++++++ pkg/fleet/remote_config.go | 30 +- pkg/fleet/remote_config_test.go | 31 +- 6 files changed, 840 insertions(+), 38 deletions(-) create mode 100644 pkg/fleet/daemon_test.go create mode 100644 pkg/fleet/experiment.go diff --git a/cmd/main.go b/cmd/main.go index 8325f079fb..a1bfc6a3f2 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -666,6 +666,6 @@ func setupAndStartHelmMetadataForwarder(logger logr.Logger, mgr manager.Manager, } func setupFleetDaemon(logger logr.Logger, mgr manager.Manager, rcClient remoteconfig.RCClient) error { - daemon := fleet.NewDaemon(logger.WithName("fleet"), rcClient) + daemon := fleet.NewDaemon(rcClient, mgr.GetClient()) return mgr.Add(daemon) } diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 706e4aeb15..1062ec974e 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -11,9 +11,12 @@ import ( "sync" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" - "github.com/go-logr/logr" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/manager" + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" "github.com/DataDog/datadog-operator/pkg/remoteconfig" ) @@ -29,17 +32,17 @@ var _ manager.LeaderElectionRunnable = &Daemon{} // Daemon subscribes to fleet-specific RC products (installer configs and tasks) // and runs after leader election as a controller-runtime Runnable. type Daemon struct { - logger logr.Logger rcClient remoteconfig.RCClient + client client.Client mu sync.RWMutex configs map[string]installerConfig // keyed by config ID; replaced on each RC update } // NewDaemon creates a new Fleet Daemon. -func NewDaemon(logger logr.Logger, rcClient remoteconfig.RCClient) *Daemon { +func NewDaemon(rcClient remoteconfig.RCClient, k8sClient client.Client) *Daemon { return &Daemon{ - logger: logger, rcClient: rcClient, + client: k8sClient, configs: make(map[string]installerConfig), } } @@ -47,13 +50,19 @@ func NewDaemon(logger logr.Logger, rcClient remoteconfig.RCClient) *Daemon { // Start implements manager.Runnable. It subscribes to fleet RC products and // blocks until ctx is cancelled. func (d *Daemon) Start(ctx context.Context) error { - d.logger.Info("Starting Fleet daemon") + logger := ctrl.LoggerFrom(ctx).WithName("fleet-daemon").WithValues("kind", "DatadogAgent") + ctx = ctrl.LoggerInto(ctx, logger) + logger.Info("Starting Fleet daemon") - d.rcClient.Subscribe(state.ProductUpdaterAgent, handleInstallerConfigUpdate(d.handleConfigs)) - d.rcClient.Subscribe(state.ProductUpdaterTask, handleUpdaterTaskUpdate(d.handleRemoteAPIRequest)) + d.rcClient.Subscribe(state.ProductUpdaterAgent, handleInstallerConfigUpdate(func(configs map[string]installerConfig) error { + return d.handleConfigs(ctx, configs) + })) + d.rcClient.Subscribe(state.ProductUpdaterTask, handleUpdaterTaskUpdate(func(req remoteAPIRequest) error { + return d.handleRemoteAPIRequest(ctx, req) + })) <-ctx.Done() - d.logger.Info("Stopping Fleet daemon") + logger.Info("Stopping Fleet daemon") return nil } @@ -65,12 +74,13 @@ func (d *Daemon) NeedLeaderElection() bool { // handleConfigs replaces the stored installer configs with the latest RC update. // Configs are indexed by their ID so they can be retrieved by task handlers. -func (d *Daemon) handleConfigs(configs map[string]installerConfig) error { +func (d *Daemon) handleConfigs(ctx context.Context, configs map[string]installerConfig) error { d.mu.Lock() defer d.mu.Unlock() + logger := ctrl.LoggerFrom(ctx) newConfigs := make(map[string]installerConfig, len(configs)) for _, cfg := range configs { - d.logger.Info("Received installer config", "id", cfg.ID, "file_operations", len(cfg.FileOperations)) + logger.Info("Received installer config", "id", cfg.ID, "operations", len(cfg.Operations)) newConfigs[cfg.ID] = cfg } d.configs = newConfigs @@ -89,39 +99,174 @@ func (d *Daemon) getConfig(id string) (installerConfig, error) { } // handleRemoteAPIRequest dispatches the incoming task to the appropriate handler. -func (d *Daemon) handleRemoteAPIRequest(req remoteAPIRequest) error { - d.logger.Info("Received remote API request", "id", req.ID, "package", req.Package, "method", req.Method) +func (d *Daemon) handleRemoteAPIRequest(ctx context.Context, req remoteAPIRequest) error { + ctrl.LoggerFrom(ctx).Info("Received remote API request", "id", req.ID, "package", req.Package, "method", req.Method) switch req.Method { case methodStartDatadogAgentExperiment: - return d.startDatadogAgentExperiment(req) + return d.startDatadogAgentExperiment(ctx, req) case methodStopDatadogAgentExperiment: - return d.stopDatadogAgentExperiment(req) + return d.stopDatadogAgentExperiment(ctx, req) case methodPromoteDatadogAgentExperiment: - return d.promoteDatadogAgentExperiment(req) + return d.promoteDatadogAgentExperiment(ctx, req) default: return fmt.Errorf("unknown method: %s", req.Method) } } -func (d *Daemon) startDatadogAgentExperiment(req remoteAPIRequest) error { +// resolveOperation looks up the installer config for the request, validates its single +// DatadogAgent operation, and fills in the canonical GVK. Returns the operation ready for use. +func (d *Daemon) resolveOperation(req remoteAPIRequest, signal string) (fleetManagementOperation, error) { cfg, err := d.getConfig(req.ExpectedState.ExperimentConfig) if err != nil { + return fleetManagementOperation{}, fmt.Errorf("%s: %w", signal, err) + } + + if len(cfg.Operations) != 1 { + return fleetManagementOperation{}, fmt.Errorf("%s: config %s must have exactly 1 operation, got %d", signal, cfg.ID, len(cfg.Operations)) + } + op := cfg.Operations[0] + + if err := validateOperation(op); err != nil { + return fleetManagementOperation{}, fmt.Errorf("%s: invalid operation: %w", signal, err) + } + + op.GroupVersionKind = v2alpha1.GroupVersion.WithKind("DatadogAgent") + + return op, nil +} + +// startDatadogAgentExperiment starts a DatadogAgent experiment. +// The first step updates the DDA spec with the experiment configuration. +// The second step updates the DDA status to running, recording the experiment ID and generation. +func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { + op, err := d.resolveOperation(req, "start DatadogAgent experiment") + if err != nil { + return err + } + + logger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name) + ctx = ctrl.LoggerInto(ctx, logger) + + // Fetch current DDA to check signal preconditions. + dda := &v2alpha1.DatadogAgent{} + if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + return fmt.Errorf("start DatadogAgent experiment: failed to get DatadogAgent %s: %w", op.NamespacedName, err) + } + + if err := canStart(getExperimentPhase(dda)); err != nil { return fmt.Errorf("start DatadogAgent experiment: %w", err) } - d.logger.Info("Starting DatadogAgent experiment", "id", req.ID, "package", req.Package, "config_id", cfg.ID) + + // Apply the spec patch. + if err := retryWithBackoff(ctx, func() error { + return d.client.Patch(ctx, dda, client.RawPatch(types.MergePatchType, op.Config)) + }); err != nil { + return fmt.Errorf("start DatadogAgent experiment: failed to patch spec: %w", err) + } + + // Re-fetch to get the generation that was bumped by the spec patch. + if err := retryWithBackoff(ctx, func() error { + return d.client.Get(ctx, op.NamespacedName, dda) + }); err != nil { + return fmt.Errorf("start DatadogAgent experiment: failed to re-fetch DatadogAgent: %w", err) + } + newGeneration := dda.Generation + + // Update status: phase=running, record experiment ID and new generation. + // Re-fetch inside the retry to get the latest ResourceVersion on conflict. + if err := retryWithBackoff(ctx, func() error { + if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + return err + } + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: req.ID, + Generation: newGeneration, + } + return d.client.Status().Update(ctx, dda) + }); err != nil { + return fmt.Errorf("start DatadogAgent experiment: failed to update status: %w", err) + } + + logger.Info("Started DatadogAgent experiment", "generation", newGeneration) return nil } -func (d *Daemon) stopDatadogAgentExperiment(req remoteAPIRequest) error { - d.logger.Info("Stopping DatadogAgent experiment", "id", req.ID, "package", req.Package) +func (d *Daemon) stopDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { + op, err := d.resolveOperation(req, "stop DatadogAgent experiment") + if err != nil { + return err + } + + ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name)) + logger := ctrl.LoggerFrom(ctx) + + dda := &v2alpha1.DatadogAgent{} + if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + return fmt.Errorf("stop DatadogAgent experiment: failed to get DatadogAgent %s: %w", op.NamespacedName, err) + } + + if isNoOp, err := canStop(ctx, getExperimentPhase(dda)); err != nil { + return fmt.Errorf("stop DatadogAgent experiment: %w", err) + } else if isNoOp { + return nil + } + + // Update status: set phase=stopped; leave ID and generation for the reconciler. + if err := retryWithBackoff(ctx, func() error { + if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + return err + } + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhaseStopped + return d.client.Status().Update(ctx, dda) + }); err != nil { + return fmt.Errorf("stop DatadogAgent experiment: failed to update status: %w", err) + } + + logger.Info("Stopped DatadogAgent experiment") return nil } -func (d *Daemon) promoteDatadogAgentExperiment(req remoteAPIRequest) error { - cfg, err := d.getConfig(req.ExpectedState.ExperimentConfig) +func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { + op, err := d.resolveOperation(req, "promote DatadogAgent experiment") if err != nil { + return err + } + + ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name)) + logger := ctrl.LoggerFrom(ctx) + + dda := &v2alpha1.DatadogAgent{} + if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + return fmt.Errorf("promote DatadogAgent experiment: failed to get DatadogAgent %s: %w", op.NamespacedName, err) + } + + if isNoOp, err := canPromote(ctx, getExperimentPhase(dda)); err != nil { return fmt.Errorf("promote DatadogAgent experiment: %w", err) + } else if isNoOp { + return nil } - d.logger.Info("Promoting DatadogAgent experiment", "id", req.ID, "package", req.Package, "config_id", cfg.ID) + + // Update status: set phase=promoted; leave ID and generation intact. + if err := retryWithBackoff(ctx, func() error { + if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + return err + } + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhasePromoted + return d.client.Status().Update(ctx, dda) + }); err != nil { + return fmt.Errorf("promote DatadogAgent experiment: failed to update status: %w", err) + } + + logger.Info("Promoted DatadogAgent experiment") return nil } + +// getExperimentPhase returns the current experiment phase from a DDA's status, +// or empty string if no experiment is active. +func getExperimentPhase(dda *v2alpha1.DatadogAgent) v2alpha1.ExperimentPhase { + if dda.Status.Experiment == nil { + return "" + } + return dda.Status.Experiment.Phase +} diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go new file mode 100644 index 0000000000..af9b5620aa --- /dev/null +++ b/pkg/fleet/daemon_test.go @@ -0,0 +1,508 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package fleet + +import ( + "context" + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" +) + +// --- Test helpers --- + +func testFleetScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = v2alpha1.AddToScheme(s) + return s +} + +func testDaemon(dda *v2alpha1.DatadogAgent, configs map[string]installerConfig) (*Daemon, client.Client) { + s := testFleetScheme() + b := fake.NewClientBuilder().WithScheme(s).WithStatusSubresource(&v2alpha1.DatadogAgent{}) + if dda != nil { + b = b.WithObjects(dda) + } + c := b.Build() + return &Daemon{ + client: c, + configs: configs, + }, c +} + +var testDDAGVK = schema.GroupVersionKind{ + Group: "datadoghq.com", + Version: "v2alpha1", + Kind: "DatadogAgent", +} + +var testDDANSN = types.NamespacedName{Namespace: "datadog", Name: "datadog-agent"} + +func testDDAObject(phase v2alpha1.ExperimentPhase) *v2alpha1.DatadogAgent { + dda := &v2alpha1.DatadogAgent{ + ObjectMeta: metav1.ObjectMeta{ + Name: testDDANSN.Name, + Namespace: testDDANSN.Namespace, + Generation: 1, + }, + } + if phase != "" { + dda.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase, ID: "old-exp", Generation: 1} + } + return dda +} + +func testInstallerConfigWithDDA() map[string]installerConfig { + return map[string]installerConfig{ + "test-config": { + ID: "test-config", + Operations: []fleetManagementOperation{ + { + Operation: OperationUpdate, + GroupVersionKind: testDDAGVK, + NamespacedName: testDDANSN, + Config: json.RawMessage(`{}`), + }, + }, + }, + } +} + +func testStartRequest() remoteAPIRequest { + return remoteAPIRequest{ + ID: "exp-abc", + Method: methodStartDatadogAgentExperiment, + ExpectedState: expectedState{ + ExperimentConfig: "test-config", + }, + } +} + +// --- startDatadogAgentExperiment tests --- + +func TestStartDatadogAgentExperiment_ConfigNotFound(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + req := testStartRequest() + req.ExpectedState.ExperimentConfig = "nonexistent" + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), req)) +} + +func TestStartDatadogAgentExperiment_NoDDAOperation(t *testing.T) { + configs := map[string]installerConfig{ + "no-dda-config": { + ID: "no-dda-config", + Operations: []fleetManagementOperation{ + { + Operation: OperationUpdate, + GroupVersionKind: schema.GroupVersionKind{Group: "other.io", Version: "v1", Kind: "Other"}, + NamespacedName: testDDANSN, + Config: json.RawMessage(`{}`), + }, + }, + }, + } + d, _ := testDaemon(testDDAObject(""), configs) + req := testStartRequest() + req.ExpectedState.ExperimentConfig = "no-dda-config" + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), req)) +} + +func TestStartDatadogAgentExperiment_DDANotFound(t *testing.T) { + d, _ := testDaemon(nil, testInstallerConfigWithDDA()) // no DDA pre-created + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), testStartRequest())) +} + +func TestStartDatadogAgentExperiment_Running(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), testStartRequest())) +} + +func TestStartDatadogAgentExperiment_Stopped(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseStopped), testInstallerConfigWithDDA()) + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), testStartRequest())) +} + +func TestStartDatadogAgentExperiment_Success_NilPhase(t *testing.T) { + d, c := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) + assert.Equal(t, req.ID, dda.Status.Experiment.ID) + assert.NotZero(t, dda.Status.Experiment.Generation) +} + +func TestStartDatadogAgentExperiment_Success_FromRollback(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRollback), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) + assert.Equal(t, req.ID, dda.Status.Experiment.ID) +} + +func TestStartDatadogAgentExperiment_Success_FromTimeout(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseTimeout), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) +} + +func TestStartDatadogAgentExperiment_Success_FromAborted(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseAborted), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) +} + +func TestStartDatadogAgentExperiment_Success_OverwritesPreviousExperiment(t *testing.T) { + // Start a new experiment when a previous one exists (e.g. after rollback). + // The old experiment's ID and generation must be fully replaced. + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRollback), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) + assert.Equal(t, req.ID, dda.Status.Experiment.ID) + assert.NotEqual(t, "old-exp", dda.Status.Experiment.ID) +} + +// --- stopDatadogAgentExperiment tests --- + +func testStopRequest() remoteAPIRequest { + return remoteAPIRequest{ + ID: "exp-abc", + Method: methodStopDatadogAgentExperiment, + ExpectedState: expectedState{ + ExperimentConfig: "test-config", + }, + } +} + +func TestStopDatadogAgentExperiment_ConfigNotFound(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + req := testStopRequest() + req.ExpectedState.ExperimentConfig = "nonexistent" + assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), req)) +} + +func TestStopDatadogAgentExperiment_DDANotFound(t *testing.T) { + d, _ := testDaemon(nil, testInstallerConfigWithDDA()) + assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) +} + +func TestStopDatadogAgentExperiment_NilPhase(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) +} + +func TestStopDatadogAgentExperiment_Aborted(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseAborted), testInstallerConfigWithDDA()) + assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) +} + +func TestStopDatadogAgentExperiment_NoOp_Rollback(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRollback), testInstallerConfigWithDDA()) + require.NoError(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) + + // Status must not change — already rolled back. + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, dda.Status.Experiment.Phase) +} + +func TestStopDatadogAgentExperiment_NoOp_Timeout(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseTimeout), testInstallerConfigWithDDA()) + require.NoError(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, dda.Status.Experiment.Phase) +} + +func TestStopDatadogAgentExperiment_NoOp_Promoted(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhasePromoted), testInstallerConfigWithDDA()) + require.NoError(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhasePromoted, dda.Status.Experiment.Phase) +} + +func TestStopDatadogAgentExperiment_Success_Running(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + require.NoError(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseStopped, dda.Status.Experiment.Phase) + // ID and generation must be preserved. + assert.Equal(t, "old-exp", dda.Status.Experiment.ID) + assert.Equal(t, int64(1), dda.Status.Experiment.Generation) +} + +// --- promoteDatadogAgentExperiment tests --- + +func testPromoteRequest() remoteAPIRequest { + return remoteAPIRequest{ + ID: "exp-abc", + Method: methodPromoteDatadogAgentExperiment, + ExpectedState: expectedState{ + ExperimentConfig: "test-config", + }, + } +} + +func TestPromoteDatadogAgentExperiment_ConfigNotFound(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + req := testPromoteRequest() + req.ExpectedState.ExperimentConfig = "nonexistent" + assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), req)) +} + +func TestPromoteDatadogAgentExperiment_DDANotFound(t *testing.T) { + d, _ := testDaemon(nil, testInstallerConfigWithDDA()) + assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) +} + +func TestPromoteDatadogAgentExperiment_NilPhase(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) +} + +func TestPromoteDatadogAgentExperiment_Aborted(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseAborted), testInstallerConfigWithDDA()) + assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) +} + +func TestPromoteDatadogAgentExperiment_NoOp_Rollback(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRollback), testInstallerConfigWithDDA()) + require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, dda.Status.Experiment.Phase) +} + +func TestPromoteDatadogAgentExperiment_NoOp_Timeout(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseTimeout), testInstallerConfigWithDDA()) + require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, dda.Status.Experiment.Phase) +} + +func TestPromoteDatadogAgentExperiment_NoOp_Promoted(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhasePromoted), testInstallerConfigWithDDA()) + require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhasePromoted, dda.Status.Experiment.Phase) +} + +func TestPromoteDatadogAgentExperiment_Success_Running(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhasePromoted, dda.Status.Experiment.Phase) + // ID and generation must be preserved. + assert.Equal(t, "old-exp", dda.Status.Experiment.ID) + assert.Equal(t, int64(1), dda.Status.Experiment.Generation) +} + +// --- validateOperation tests --- + +func TestValidateOperation_Valid(t *testing.T) { + op := fleetManagementOperation{ + Operation: OperationUpdate, + GroupVersionKind: testDDAGVK, + NamespacedName: testDDANSN, + Config: json.RawMessage(`{}`), + } + assert.NoError(t, validateOperation(op)) +} + +func TestValidateOperation_EmptyName(t *testing.T) { + op := fleetManagementOperation{ + GroupVersionKind: testDDAGVK, + NamespacedName: types.NamespacedName{Namespace: "datadog", Name: ""}, + } + assert.Error(t, validateOperation(op)) +} + +func TestValidateOperation_EmptyNamespace(t *testing.T) { + op := fleetManagementOperation{ + GroupVersionKind: testDDAGVK, + NamespacedName: types.NamespacedName{Namespace: "", Name: "datadog-agent"}, + } + assert.Error(t, validateOperation(op)) +} + +func TestValidateOperation_EmptyGroup_Allowed(t *testing.T) { + // Group is auto-detected from the cluster; empty group is valid input. + op := fleetManagementOperation{ + GroupVersionKind: schema.GroupVersionKind{Group: "", Version: "", Kind: "DatadogAgent"}, + NamespacedName: testDDANSN, + } + assert.NoError(t, validateOperation(op)) +} + +func TestValidateOperation_WrongKind(t *testing.T) { + op := fleetManagementOperation{ + GroupVersionKind: schema.GroupVersionKind{Group: "datadoghq.com", Version: "v2alpha1", Kind: "DatadogMonitor"}, + NamespacedName: testDDANSN, + } + assert.Error(t, validateOperation(op)) +} + +// --- canStart tests --- + +func TestCanStart_NilPhase(t *testing.T) { + assert.NoError(t, canStart("")) +} + +func TestCanStart_Running(t *testing.T) { + assert.Error(t, canStart(v2alpha1.ExperimentPhaseRunning)) +} + +func TestCanStart_Stopped(t *testing.T) { + assert.Error(t, canStart(v2alpha1.ExperimentPhaseStopped)) +} + +func TestCanStart_Rollback(t *testing.T) { + assert.NoError(t, canStart(v2alpha1.ExperimentPhaseRollback)) +} + +func TestCanStart_Timeout(t *testing.T) { + assert.NoError(t, canStart(v2alpha1.ExperimentPhaseTimeout)) +} + +func TestCanStart_Promoted(t *testing.T) { + assert.NoError(t, canStart(v2alpha1.ExperimentPhasePromoted)) +} + +func TestCanStart_Aborted(t *testing.T) { + assert.NoError(t, canStart(v2alpha1.ExperimentPhaseAborted)) +} + +// --- canStop tests --- + +func TestCanStop_NilPhase(t *testing.T) { + isNoOp, err := canStop(context.Background(), "") + assert.False(t, isNoOp) + assert.Error(t, err) +} + +func TestCanStop_Running(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseRunning) + assert.False(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Stopped(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseStopped) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Rollback(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseRollback) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Timeout(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseTimeout) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Promoted(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhasePromoted) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Aborted(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseAborted) + assert.False(t, isNoOp) + assert.Error(t, err) +} + +// --- canPromote tests --- + +func TestCanPromote_NilPhase(t *testing.T) { + isNoOp, err := canPromote(context.Background(), "") + assert.False(t, isNoOp) + assert.Error(t, err) +} + +func TestCanPromote_Running(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseRunning) + assert.False(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Stopped(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseStopped) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Rollback(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseRollback) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Timeout(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseTimeout) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Promoted(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhasePromoted) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Aborted(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseAborted) + assert.False(t, isNoOp) + assert.Error(t, err) +} diff --git a/pkg/fleet/experiment.go b/pkg/fleet/experiment.go new file mode 100644 index 0000000000..149c083e7f --- /dev/null +++ b/pkg/fleet/experiment.go @@ -0,0 +1,118 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package fleet + +import ( + "context" + "fmt" + "math" + "time" + + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" + ctrl "sigs.k8s.io/controller-runtime" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" +) + +// validateOperation checks that a fleetManagementOperation has the fields +// required to locate and act on a DatadogAgent resource. +func validateOperation(op fleetManagementOperation) error { + if op.NamespacedName.Name == "" { + return fmt.Errorf("operation namespaced name must have a non-empty name") + } + if op.NamespacedName.Namespace == "" { + return fmt.Errorf("operation namespaced name must have a non-empty namespace") + } + if op.GroupVersionKind.Kind != "DatadogAgent" { + return fmt.Errorf("operation kind must be DatadogAgent, got %q", op.GroupVersionKind.Kind) + } + return nil +} + +// canStart returns whether a start signal is allowed given the current experiment phase. +// +// Allowed from: nil/empty, rollback, timeout, promoted, aborted (start new experiment). +// Error from: running (experiment already running), stopped (rollback in progress). +func canStart(phase v2alpha1.ExperimentPhase) error { + switch phase { + case v2alpha1.ExperimentPhaseRunning: + return fmt.Errorf("experiment already running (phase=%s); stop it before starting a new one", phase) + case v2alpha1.ExperimentPhaseStopped: + return fmt.Errorf("rollback in progress (phase=%s); wait for rollback to complete before starting a new experiment", phase) + default: + // nil/empty, rollback, timeout, promoted, aborted: start is allowed + return nil + } +} + +// canStop returns whether a stop signal is allowed given the current experiment phase. +// Returns (true, nil) if the signal is a no-op and should be silently ignored, (false, err) if rejected, or (false, nil) to proceed. +// +// Allowed from: running. +// No-op from: stopped, rollback, timeout, promoted (already terminal or in-progress rollback). +// Error from: nil/empty (nothing to stop), aborted (terminal, user-driven state). +func canStop(ctx context.Context, phase v2alpha1.ExperimentPhase) (bool, error) { + switch phase { + case v2alpha1.ExperimentPhaseRunning: + return false, nil + case v2alpha1.ExperimentPhaseStopped, + v2alpha1.ExperimentPhaseRollback, + v2alpha1.ExperimentPhaseTimeout, + v2alpha1.ExperimentPhasePromoted: + ctrl.LoggerFrom(ctx).Info("Stop signal ignored: experiment is not in a stoppable state", "phase", phase) + return true, nil + case v2alpha1.ExperimentPhaseAborted: + return false, fmt.Errorf("experiment was aborted due to a manual spec change (phase=%s)", phase) + default: + // nil/empty: no active experiment to stop + return false, fmt.Errorf("no active experiment to stop (phase=%s)", phase) + } +} + +// canPromote returns whether a promote signal is allowed given the current experiment phase. +// Returns (true, nil) if the signal is a no-op and should be silently ignored, (false, err) if rejected, or (false, nil) to proceed. +// +// Allowed from: running. +// No-op from: stopped, rollback, timeout, promoted (already terminal or rolling back). +// Error from: nil/empty (nothing to promote), aborted (terminal, user-driven state). +func canPromote(ctx context.Context, phase v2alpha1.ExperimentPhase) (bool, error) { + switch phase { + case v2alpha1.ExperimentPhaseRunning: + return false, nil + case v2alpha1.ExperimentPhaseStopped, + v2alpha1.ExperimentPhaseRollback, + v2alpha1.ExperimentPhaseTimeout, + v2alpha1.ExperimentPhasePromoted: + ctrl.LoggerFrom(ctx).Info("Promote signal ignored: experiment is not in a promotable state", "phase", phase) + return true, nil + case v2alpha1.ExperimentPhaseAborted: + return false, fmt.Errorf("experiment was aborted due to a manual spec change (phase=%s)", phase) + default: + // nil/empty: no active experiment to promote + return false, fmt.Errorf("no active experiment to promote (phase=%s)", phase) + } +} + +// experimentBackoff is the retry backoff for k8s operations during experiment signals. +// Retries start at 1s, doubling each attempt up to 10s, for up to 3 minutes total. +var experimentBackoff = wait.Backoff{ + Duration: 1 * time.Second, + Factor: 2.0, + Jitter: 0.1, + Cap: 10 * time.Second, + Steps: math.MaxInt32, +} + +// retryWithBackoff retries fn on any error with exponential backoff. +// The total retry window is bounded by a 3-minute context timeout. +func retryWithBackoff(ctx context.Context, fn func() error) error { + ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) + defer cancel() + return retry.OnError(experimentBackoff, func(err error) bool { + return ctx.Err() == nil + }, fn) +} diff --git a/pkg/fleet/remote_config.go b/pkg/fleet/remote_config.go index 2acee81fd8..951a1607c2 100644 --- a/pkg/fleet/remote_config.go +++ b/pkg/fleet/remote_config.go @@ -10,19 +10,32 @@ import ( "fmt" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" ) // installerConfig is the fleet installer configuration received via RC. type installerConfig struct { - ID string `json:"id"` - FileOperations []installerConfigFileOperation `json:"file_operations"` + ID string `json:"id"` + Operations []fleetManagementOperation `json:"operations"` } -// installerConfigFileOperation is a single file operation in an installerConfig. -type installerConfigFileOperation struct { - FileOperationType string `json:"file_op"` - FilePath string `json:"file_path"` - Patch json.RawMessage `json:"patch"` +// Operation is the type of fleet management operation to perform on a Kubernetes resource. +type Operation string + +const ( + OperationCreate Operation = "create" + OperationUpdate Operation = "update" + OperationDelete Operation = "delete" +) + +// fleetManagementOperation is a single fleet operation for config management of a Kubernetes resource. +// Config is a JSON merge patch (no strategic merge patch). +type fleetManagementOperation struct { + Operation Operation `json:"operation"` + GroupVersionKind schema.GroupVersionKind `json:"group_version_kind"` + NamespacedName types.NamespacedName `json:"namespaced_name"` + Config json.RawMessage `json:"config"` } // remoteAPIRequest is a task sent to the fleet daemon via RC. @@ -92,6 +105,9 @@ func handleUpdaterTaskUpdate(h func(remoteAPIRequest) error) func(map[string]sta continue } + // Signal received and parsed; notify the backend before applying. + applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateUnacknowledged}) + if err := h(req); err != nil { applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateError, Error: err.Error()}) continue diff --git a/pkg/fleet/remote_config_test.go b/pkg/fleet/remote_config_test.go index 09d3ebc670..103a9bf8d9 100644 --- a/pkg/fleet/remote_config_test.go +++ b/pkg/fleet/remote_config_test.go @@ -12,17 +12,27 @@ import ( "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" ) // Test data var testInstallerConfig = installerConfig{ ID: "test", - FileOperations: []installerConfigFileOperation{ + Operations: []fleetManagementOperation{ { - FileOperationType: "write", - FilePath: "/etc/datadog-agent/config.yaml", - Patch: json.RawMessage(`{"key":"value"}`), + Operation: OperationUpdate, + GroupVersionKind: schema.GroupVersionKind{ + Group: "datadoghq.com", + Version: "v2alpha1", + Kind: "DatadogAgent", + }, + NamespacedName: types.NamespacedName{ + Namespace: "datadog", + Name: "datadog-agent", + }, + Config: json.RawMessage(`{"spec":{"features":{"apm":{"enabled":true}}}}`), }, }, } @@ -130,11 +140,13 @@ func TestRemoteAPIRequest(t *testing.T) { updates := map[string]state.RawConfig{"path/to/task": raw} cb.On("handleRemoteAPIRequest", testRemoteAPIRequest).Return(nil) + cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}).Return() cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateAcknowledged}).Return() handler(updates, cb.applyStateCallback) cb.AssertCalled(t, "handleRemoteAPIRequest", testRemoteAPIRequest) + cb.AssertCalled(t, "applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}) cb.AssertCalled(t, "applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateAcknowledged}) } @@ -166,6 +178,7 @@ func TestRemoteAPIRequestError(t *testing.T) { updates := map[string]state.RawConfig{"path/to/task": raw} cb.On("handleRemoteAPIRequest", testRemoteAPIRequest).Return(assert.AnError) + cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}).Return() cb.On("applyStateCallback", "path/to/task", mock.MatchedBy(func(s state.ApplyStatus) bool { return s.State == state.ApplyStateError && s.Error != "" })).Return() @@ -173,6 +186,7 @@ func TestRemoteAPIRequestError(t *testing.T) { handler(updates, cb.applyStateCallback) cb.AssertCalled(t, "handleRemoteAPIRequest", testRemoteAPIRequest) + cb.AssertCalled(t, "applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}) cb.AssertCalled(t, "applyStateCallback", "path/to/task", mock.MatchedBy(func(s state.ApplyStatus) bool { return s.State == state.ApplyStateError })) @@ -186,16 +200,17 @@ func TestRemoteAPIRequestIgnoresAlreadyExecutedRequests(t *testing.T) { updates := map[string]state.RawConfig{"path/to/task": raw} cb.On("handleRemoteAPIRequest", testRemoteAPIRequest).Return(nil) + cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}).Return() cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateAcknowledged}).Return() - // First call — should invoke the handler. + // First call — should invoke the handler; sends Unacknowledged then Acknowledged. handler(updates, cb.applyStateCallback) cb.AssertNumberOfCalls(t, "handleRemoteAPIRequest", 1) - // Second call with same request ID — handler must NOT be called again. + // Second call with same request ID — handler must NOT be called again; sends only Acknowledged. handler(updates, cb.applyStateCallback) cb.AssertNumberOfCalls(t, "handleRemoteAPIRequest", 1) - // applyStateCallback is called both times (acknowledge on repeat). - cb.AssertNumberOfCalls(t, "applyStateCallback", 2) + // First call: Unacknowledged + Acknowledged. Second call: Acknowledged only. + cb.AssertNumberOfCalls(t, "applyStateCallback", 3) } From d0ff10a0e34b2ee4caacd2563ef63ed2d231321e Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Tue, 7 Apr 2026 16:03:32 -0400 Subject: [PATCH 02/15] Review suggestions --- cmd/main.go | 6 ++--- .../controller/datadogagent/experiment.go | 3 +++ pkg/fleet/daemon.go | 27 ++++++++++++------- pkg/fleet/daemon_test.go | 5 ++-- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index a1bfc6a3f2..6457aec462 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -341,7 +341,7 @@ func run(opts *options) error { } if opts.remoteUpdatesEnabled { - if rcErr := setupFleetDaemon(setupLog, mgr, rcUpdater.Client()); rcErr != nil { + if rcErr := setupFleetDaemon(setupLog, mgr, rcUpdater.Client(), opts.createControllerRevisions && opts.datadogAgentInternalEnabled); rcErr != nil { setupErrorf(setupLog, rcErr, "Unable to setup Fleet daemon") } } @@ -665,7 +665,7 @@ func setupAndStartHelmMetadataForwarder(logger logr.Logger, mgr manager.Manager, return mgr.Add(hmf) } -func setupFleetDaemon(logger logr.Logger, mgr manager.Manager, rcClient remoteconfig.RCClient) error { - daemon := fleet.NewDaemon(rcClient, mgr.GetClient()) +func setupFleetDaemon(logger logr.Logger, mgr manager.Manager, rcClient remoteconfig.RCClient, revisionsEnabled bool) error { + daemon := fleet.NewDaemon(rcClient, mgr.GetClient(), revisionsEnabled) return mgr.Add(daemon) } diff --git a/internal/controller/datadogagent/experiment.go b/internal/controller/datadogagent/experiment.go index aa4a59c46c..7becd9a4c4 100644 --- a/internal/controller/datadogagent/experiment.go +++ b/internal/controller/datadogagent/experiment.go @@ -159,6 +159,9 @@ func (r *Reconciler) rollback( // Merge snapshot annotations (Datadog-only keys) on top of current annotations // so that non-Datadog annotations (user metadata, tooling labels, etc.) are preserved. merged := maps.Clone(current.Annotations) + if merged == nil { + merged = make(map[string]string, len(snapshot.Annotations)) + } maps.Copy(merged, snapshot.Annotations) toUpdate := &v2alpha1.DatadogAgent{ diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 1062ec974e..ee8dae4879 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -32,18 +32,22 @@ var _ manager.LeaderElectionRunnable = &Daemon{} // Daemon subscribes to fleet-specific RC products (installer configs and tasks) // and runs after leader election as a controller-runtime Runnable. type Daemon struct { - rcClient remoteconfig.RCClient - client client.Client - mu sync.RWMutex - configs map[string]installerConfig // keyed by config ID; replaced on each RC update + rcClient remoteconfig.RCClient + client client.Client + revisionsEnabled bool + mu sync.RWMutex + configs map[string]installerConfig // keyed by config ID; replaced on each RC update } -// NewDaemon creates a new Fleet Daemon. -func NewDaemon(rcClient remoteconfig.RCClient, k8sClient client.Client) *Daemon { +// NewDaemon creates a new Fleet Daemon. When revisionsEnabled is false, experiment +// signals are rejected because the reconciler cannot process them without the +// ControllerRevision machinery. +func NewDaemon(rcClient remoteconfig.RCClient, k8sClient client.Client, revisionsEnabled bool) *Daemon { return &Daemon{ - rcClient: rcClient, - client: k8sClient, - configs: make(map[string]installerConfig), + rcClient: rcClient, + client: k8sClient, + revisionsEnabled: revisionsEnabled, + configs: make(map[string]installerConfig), } } @@ -101,6 +105,11 @@ func (d *Daemon) getConfig(id string) (installerConfig, error) { // handleRemoteAPIRequest dispatches the incoming task to the appropriate handler. func (d *Daemon) handleRemoteAPIRequest(ctx context.Context, req remoteAPIRequest) error { ctrl.LoggerFrom(ctx).Info("Received remote API request", "id", req.ID, "package", req.Package, "method", req.Method) + + if !d.revisionsEnabled { + return fmt.Errorf("experiment signals require the CreateControllerRevisions and DatadogAgentInternal feature gates") + } + switch req.Method { case methodStartDatadogAgentExperiment: return d.startDatadogAgentExperiment(ctx, req) diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go index af9b5620aa..b898e8e177 100644 --- a/pkg/fleet/daemon_test.go +++ b/pkg/fleet/daemon_test.go @@ -38,8 +38,9 @@ func testDaemon(dda *v2alpha1.DatadogAgent, configs map[string]installerConfig) } c := b.Build() return &Daemon{ - client: c, - configs: configs, + client: c, + revisionsEnabled: true, + configs: configs, }, c } From 120e0c506ab39f620c439f2920d386e4f0493217 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 8 Apr 2026 17:14:08 +0200 Subject: [PATCH 03/15] [FA] Add cluster_name as tag (#2878) * Add cluster_name as tag * Add updater_type --- pkg/remoteconfig/updater.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/remoteconfig/updater.go b/pkg/remoteconfig/updater.go index bca9c5e92d..767022b4d8 100644 --- a/pkg/remoteconfig/updater.go +++ b/pkg/remoteconfig/updater.go @@ -186,9 +186,13 @@ func (r *RemoteConfigUpdater) Start(apiKey string, site string, clusterName stri } r.rcService = rcService + updaterTags := []string{"updater_type:datadog-operator"} + if r.serviceConf.clusterName != "" { + updaterTags = append(updaterTags, "cluster_name:"+r.serviceConf.clusterName) + } rcClient, err := client.NewClient( rcService, - client.WithUpdater(), + client.WithUpdater(updaterTags...), client.WithProducts(state.ProductAgentConfig, state.ProductOrchestratorK8sCRDs), client.WithDirectorRootOverride(r.serviceConf.cfg.GetString("site"), r.serviceConf.cfg.GetString("remote_configuration.director_root")), client.WithPollInterval(pollInterval), From 70edfda72d7c9cb881256e3e3005a5f82228d22c Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:46:24 -0400 Subject: [PATCH 04/15] add test logs --- pkg/fleet/daemon.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index ee8dae4879..459f75d4a6 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -148,14 +148,19 @@ func (d *Daemon) resolveOperation(req remoteAPIRequest, signal string) (fleetMan // The first step updates the DDA spec with the experiment configuration. // The second step updates the DDA status to running, recording the experiment ID and generation. func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { + tempLogger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID) + tempLogger.Info("Starting DatadogAgent experiment", "config", req.ExpectedState.ExperimentConfig) op, err := d.resolveOperation(req, "start DatadogAgent experiment") if err != nil { + tempLogger.Error(err, "Failed to resolve operation") return err } logger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name) ctx = ctrl.LoggerInto(ctx, logger) + logger.Info("Starting k8s resource patch", "config", op.Config) + // Fetch current DDA to check signal preconditions. dda := &v2alpha1.DatadogAgent{} if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { From e0d5b372f7ed6c1419ec309aaa45a0e0a1de5a58 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Thu, 9 Apr 2026 10:57:25 -0400 Subject: [PATCH 05/15] more logs --- pkg/fleet/daemon.go | 9 ++++++--- pkg/fleet/remote_config.go | 12 ++++++++++-- pkg/fleet/remote_config_test.go | 15 ++++++++------- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 459f75d4a6..53e0a3c523 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -58,10 +58,10 @@ func (d *Daemon) Start(ctx context.Context) error { ctx = ctrl.LoggerInto(ctx, logger) logger.Info("Starting Fleet daemon") - d.rcClient.Subscribe(state.ProductUpdaterAgent, handleInstallerConfigUpdate(func(configs map[string]installerConfig) error { + d.rcClient.Subscribe(state.ProductUpdaterAgent, handleInstallerConfigUpdate(ctx, func(configs map[string]installerConfig) error { return d.handleConfigs(ctx, configs) })) - d.rcClient.Subscribe(state.ProductUpdaterTask, handleUpdaterTaskUpdate(func(req remoteAPIRequest) error { + d.rcClient.Subscribe(state.ProductUpdaterTask, handleUpdaterTaskUpdate(ctx, func(req remoteAPIRequest) error { return d.handleRemoteAPIRequest(ctx, req) })) @@ -82,12 +82,14 @@ func (d *Daemon) handleConfigs(ctx context.Context, configs map[string]installer d.mu.Lock() defer d.mu.Unlock() logger := ctrl.LoggerFrom(ctx) + logger.Info("Received installer configs", "configs", configs) newConfigs := make(map[string]installerConfig, len(configs)) for _, cfg := range configs { logger.Info("Received installer config", "id", cfg.ID, "operations", len(cfg.Operations)) newConfigs[cfg.ID] = cfg } d.configs = newConfigs + logger.Info("Updated installer configs", "configs", d.configs) return nil } @@ -104,7 +106,8 @@ func (d *Daemon) getConfig(id string) (installerConfig, error) { // handleRemoteAPIRequest dispatches the incoming task to the appropriate handler. func (d *Daemon) handleRemoteAPIRequest(ctx context.Context, req remoteAPIRequest) error { - ctrl.LoggerFrom(ctx).Info("Received remote API request", "id", req.ID, "package", req.Package, "method", req.Method) + logger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "package", req.Package, "method", req.Method) + logger.Info("Received remote API request") if !d.revisionsEnabled { return fmt.Errorf("experiment signals require the CreateControllerRevisions and DatadogAgentInternal feature gates") diff --git a/pkg/fleet/remote_config.go b/pkg/fleet/remote_config.go index 951a1607c2..b81c6dea96 100644 --- a/pkg/fleet/remote_config.go +++ b/pkg/fleet/remote_config.go @@ -6,12 +6,14 @@ package fleet import ( + "context" "encoding/json" "fmt" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" ) // installerConfig is the fleet installer configuration received via RC. @@ -61,10 +63,13 @@ type expectedState struct { // handleInstallerConfigUpdate returns an RC subscription callback that parses // UPDATER_AGENT payloads and forwards them as a map[configID]installerConfig to h. -func handleInstallerConfigUpdate(h func(map[string]installerConfig) error) func(map[string]state.RawConfig, func(string, state.ApplyStatus)) { +func handleInstallerConfigUpdate(ctx context.Context, h func(map[string]installerConfig) error) func(map[string]state.RawConfig, func(string, state.ApplyStatus)) { return func(updates map[string]state.RawConfig, applyStatus func(string, state.ApplyStatus)) { + logger := ctrl.LoggerFrom(ctx) configs := make(map[string]installerConfig, len(updates)) for cfgPath, raw := range updates { + logger.Info("Received UPDATER_AGENT payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) + var cfg installerConfig if err := json.Unmarshal(raw.Config, &cfg); err != nil { applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateError, Error: fmt.Sprintf("failed to unmarshal installer config: %v", err)}) @@ -89,10 +94,13 @@ func handleInstallerConfigUpdate(h func(map[string]installerConfig) error) func( // handleUpdaterTaskUpdate returns an RC subscription callback that parses a single // UPDATER_TASK payload and forwards it as a remoteAPIRequest to h. // Requests that have already been executed (tracked by seen IDs) are ignored. -func handleUpdaterTaskUpdate(h func(remoteAPIRequest) error) func(map[string]state.RawConfig, func(string, state.ApplyStatus)) { +func handleUpdaterTaskUpdate(ctx context.Context, h func(remoteAPIRequest) error) func(map[string]state.RawConfig, func(string, state.ApplyStatus)) { seen := make(map[string]struct{}) return func(updates map[string]state.RawConfig, applyStatus func(string, state.ApplyStatus)) { + logger := ctrl.LoggerFrom(ctx) for cfgPath, raw := range updates { + logger.Info("Received UPDATER_TASK payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) + var req remoteAPIRequest if err := json.Unmarshal(raw.Config, &req); err != nil { applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateError, Error: fmt.Sprintf("failed to unmarshal remote API request: %v", err)}) diff --git a/pkg/fleet/remote_config_test.go b/pkg/fleet/remote_config_test.go index 103a9bf8d9..a9ba745aec 100644 --- a/pkg/fleet/remote_config_test.go +++ b/pkg/fleet/remote_config_test.go @@ -6,6 +6,7 @@ package fleet import ( + "context" "encoding/json" "testing" @@ -74,7 +75,7 @@ func marshalRawConfig(t *testing.T, v any) state.RawConfig { func TestInstallerConfigUpdate(t *testing.T) { cb := &callbackMock{} - handler := handleInstallerConfigUpdate(cb.handleConfigs) + handler := handleInstallerConfigUpdate(context.Background(), cb.handleConfigs) raw := marshalRawConfig(t, testInstallerConfig) updates := map[string]state.RawConfig{"path/to/config": raw} @@ -91,7 +92,7 @@ func TestInstallerConfigUpdate(t *testing.T) { func TestInstallerConfigUpdateBadConfig(t *testing.T) { cb := &callbackMock{} - handler := handleInstallerConfigUpdate(cb.handleConfigs) + handler := handleInstallerConfigUpdate(context.Background(), cb.handleConfigs) updates := map[string]state.RawConfig{ "path/to/config": {Config: []byte("not json")}, @@ -111,7 +112,7 @@ func TestInstallerConfigUpdateBadConfig(t *testing.T) { func TestInstallerConfigUpdateError(t *testing.T) { cb := &callbackMock{} - handler := handleInstallerConfigUpdate(cb.handleConfigs) + handler := handleInstallerConfigUpdate(context.Background(), cb.handleConfigs) raw := marshalRawConfig(t, testInstallerConfig) updates := map[string]state.RawConfig{"path/to/config": raw} @@ -134,7 +135,7 @@ func TestInstallerConfigUpdateError(t *testing.T) { func TestRemoteAPIRequest(t *testing.T) { cb := &callbackMock{} - handler := handleUpdaterTaskUpdate(cb.handleRemoteAPIRequest) + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) raw := marshalRawConfig(t, testRemoteAPIRequest) updates := map[string]state.RawConfig{"path/to/task": raw} @@ -152,7 +153,7 @@ func TestRemoteAPIRequest(t *testing.T) { func TestRemoteAPIRequestBadConfig(t *testing.T) { cb := &callbackMock{} - handler := handleUpdaterTaskUpdate(cb.handleRemoteAPIRequest) + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) updates := map[string]state.RawConfig{ "path/to/task": {Config: []byte("not json")}, @@ -172,7 +173,7 @@ func TestRemoteAPIRequestBadConfig(t *testing.T) { func TestRemoteAPIRequestError(t *testing.T) { cb := &callbackMock{} - handler := handleUpdaterTaskUpdate(cb.handleRemoteAPIRequest) + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) raw := marshalRawConfig(t, testRemoteAPIRequest) updates := map[string]state.RawConfig{"path/to/task": raw} @@ -194,7 +195,7 @@ func TestRemoteAPIRequestError(t *testing.T) { func TestRemoteAPIRequestIgnoresAlreadyExecutedRequests(t *testing.T) { cb := &callbackMock{} - handler := handleUpdaterTaskUpdate(cb.handleRemoteAPIRequest) + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) raw := marshalRawConfig(t, testRemoteAPIRequest) updates := map[string]state.RawConfig{"path/to/task": raw} From 91623035d596e5e8f6a354a6bc79d72508af8572 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Fri, 10 Apr 2026 13:51:14 -0400 Subject: [PATCH 06/15] use installer config --- api/go.mod | 4 +- api/go.sum | 8 ++-- go.mod | 18 ++++---- go.sum | 36 ++++++++-------- go.work.sum | 8 +++- pkg/fleet/daemon.go | 12 ++++-- pkg/fleet/daemon_test.go | 76 +++++++++++++++++++++++++++------ pkg/fleet/remote_config.go | 21 +++++---- pkg/fleet/remote_config_test.go | 43 ++++++++++++++++++- 9 files changed, 166 insertions(+), 60 deletions(-) diff --git a/api/go.mod b/api/go.mod index 13c9e45ce5..d57e071099 100644 --- a/api/go.mod +++ b/api/go.mod @@ -50,10 +50,10 @@ require ( github.com/x448/float16 v0.8.4 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/net v0.48.0 // indirect + golang.org/x/net v0.49.0 // indirect golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sys v0.42.0 // indirect - golang.org/x/text v0.33.0 // indirect + golang.org/x/text v0.34.0 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/api/go.sum b/api/go.sum index 1468a6904b..7df534a035 100644 --- a/api/go.sum +++ b/api/go.sum @@ -61,13 +61,13 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= -golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI= -golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= +golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= -golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= -golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= diff --git a/go.mod b/go.mod index ab7ae62bce..1ba660fa3b 100644 --- a/go.mod +++ b/go.mod @@ -43,7 +43,7 @@ require ( github.com/DataDog/datadog-agent/pkg/config/model v0.59.0-rc.5 github.com/DataDog/datadog-agent/pkg/config/remote v0.59.0-rc.5 github.com/DataDog/datadog-agent/pkg/proto v0.63.0-rc.1 - github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.59.0-rc.5 + github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.77.3 github.com/DataDog/datadog-operator/api v0.0.0-20250130131115-7f198adcc856 github.com/aws/aws-sdk-go-v2 v1.39.6 github.com/aws/aws-sdk-go-v2/config v1.29.17 @@ -66,7 +66,7 @@ require ( github.com/prometheus/client_golang v1.23.2 github.com/samber/lo v1.52.0 golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 - golang.org/x/text v0.33.0 + golang.org/x/text v0.34.0 helm.sh/helm/v3 v3.18.5 k8s.io/kubectl v0.33.3 k8s.io/utils v0.0.0-20251222233032-718f0e51e6d2 @@ -90,7 +90,7 @@ require ( github.com/DataDog/datadog-go/v5 v5.6.0 // indirect github.com/DataDog/go-libddwaf/v3 v3.3.0 // indirect github.com/DataDog/go-sqllexer v0.0.15 // indirect - github.com/DataDog/go-tuf v1.1.0-0.5.2 // indirect + github.com/DataDog/go-tuf v1.1.1-0.5.2 // indirect github.com/DataDog/gostackparse v0.7.0 // indirect github.com/DataDog/sketches-go v1.4.5 // indirect github.com/DataDog/viper v1.13.5 // indirect @@ -228,7 +228,7 @@ require ( github.com/rubenv/sql-migrate v1.8.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 // indirect - github.com/secure-systems-lab/go-securesystemslib v0.7.0 // indirect + github.com/secure-systems-lab/go-securesystemslib v0.9.0 // indirect github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect github.com/shirou/gopsutil/v3 v3.24.1 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect @@ -264,15 +264,15 @@ require ( go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.46.0 // indirect - golang.org/x/mod v0.31.0 // indirect - golang.org/x/net v0.48.0 // indirect + golang.org/x/crypto v0.48.0 // indirect + golang.org/x/mod v0.32.0 // indirect + golang.org/x/net v0.49.0 // indirect golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/sys v0.42.0 // indirect - golang.org/x/term v0.38.0 // indirect + golang.org/x/term v0.40.0 // indirect golang.org/x/time v0.14.0 // indirect - golang.org/x/tools v0.40.0 // indirect + golang.org/x/tools v0.41.0 // indirect golang.org/x/tools/go/expect v0.1.1-deprecated // indirect golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect diff --git a/go.sum b/go.sum index 83a7b2fd6b..78e4f3f1aa 100644 --- a/go.sum +++ b/go.sum @@ -81,8 +81,8 @@ github.com/DataDog/datadog-agent/pkg/obfuscate v0.59.0-rc.5 h1:Z+vgXHXmSnVRlCpcP github.com/DataDog/datadog-agent/pkg/obfuscate v0.59.0-rc.5/go.mod h1:ATVw8kr3U1Eqz3qBz9kS6WFDKji9XyoAsHKSlj3hPTM= github.com/DataDog/datadog-agent/pkg/proto v0.63.0-rc.1 h1:5Xr7XQoINzwO0PswNKClUW4b/ZfsTefVHxTAjfMY4Qk= github.com/DataDog/datadog-agent/pkg/proto v0.63.0-rc.1/go.mod h1:QOAaPRsuM4WNXtP3Rbw+gWamge9lGnDH8ZKB6HdzoAs= -github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.59.0-rc.5 h1:kCZxrxVVfbYAA8oMB4s7yb8XbO5WMkBmygcgKm3xh7o= -github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.59.0-rc.5/go.mod h1:c4th0IFaP0Q1ofRa0GcPB9hJWN+cmUoEfOI1Ub0O50A= +github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.77.3 h1:cQvV1Qiq0EBdbGxfU4ECDJ8Nl3xYXcNUe6QzXw73pA4= +github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.77.3/go.mod h1:TpW5ZwsQTrlRBPjtZH5/OFwpmOqxV/v2i9BiF4Xfcac= github.com/DataDog/datadog-agent/pkg/util/backoff v0.59.0-rc.5 h1:mAXwj00h3GqpiY7yECDLwLvCOAXswx3mIiU9N6Ki1GU= github.com/DataDog/datadog-agent/pkg/util/backoff v0.59.0-rc.5/go.mod h1:2RMfdYkKyeh8hXs6WgaamkkEyK35Xo55C4rFG4dO1k8= github.com/DataDog/datadog-agent/pkg/util/cache v0.59.0-rc.5 h1:+20QEL2pI+IwOqqesjhzzWkv6GcOe3TrIgUQtg6ZdTE= @@ -111,8 +111,8 @@ github.com/DataDog/go-libddwaf/v3 v3.3.0 h1:jS72fuQpFgJZEdEJDmHJCPAgNTEMZoz1EUvi github.com/DataDog/go-libddwaf/v3 v3.3.0/go.mod h1:Bz/0JkpGf689mzbUjKJeheJINqsyyhM8p9PDuHdK2Ec= github.com/DataDog/go-sqllexer v0.0.15 h1:rUUu52dP8EQhJLnUw0MIAxZp0BQx2fOTuMztr3vtHUU= github.com/DataDog/go-sqllexer v0.0.15/go.mod h1:KwkYhpFEVIq+BfobkTC1vfqm4gTi65skV/DpDBXtexc= -github.com/DataDog/go-tuf v1.1.0-0.5.2 h1:4CagiIekonLSfL8GMHRHcHudo1fQnxELS9g4tiAupQ4= -github.com/DataDog/go-tuf v1.1.0-0.5.2/go.mod h1:zBcq6f654iVqmkk8n2Cx81E1JnNTMOAx1UEO/wZR+P0= +github.com/DataDog/go-tuf v1.1.1-0.5.2 h1:YWvghV4ZvrQsPcUw8IOUMSDpqc3W5ruOIC+KJxPknv0= +github.com/DataDog/go-tuf v1.1.1-0.5.2/go.mod h1:zBcq6f654iVqmkk8n2Cx81E1JnNTMOAx1UEO/wZR+P0= github.com/DataDog/gostackparse v0.7.0 h1:i7dLkXHvYzHV308hnkvVGDL3BR4FWl7IsXNPz/IGQh4= github.com/DataDog/gostackparse v0.7.0/go.mod h1:lTfqcJKqS9KnXQGnyQMCugq3u1FP6UZMfWR0aitKFMM= github.com/DataDog/sketches-go v1.4.5 h1:ki7VfeNz7IcNafq7yI/j5U/YCkO3LJiMDtXz9OMQbyE= @@ -1103,8 +1103,8 @@ github.com/santhosh-tekuri/jsonschema/v6 v6.0.2/go.mod h1:JXeL+ps8p7/KNMjDQk3TCw github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= -github.com/secure-systems-lab/go-securesystemslib v0.7.0 h1:OwvJ5jQf9LnIAS83waAjPbcMsODrTQUpJ02eNLUoxBg= -github.com/secure-systems-lab/go-securesystemslib v0.7.0/go.mod h1:/2gYnlnHVQ6xeGtfIqFy7Do03K4cdCY0A/GlJLDKLHI= +github.com/secure-systems-lab/go-securesystemslib v0.9.0 h1:rf1HIbL64nUpEIZnjLZ3mcNEL9NBPB0iuVjyxvq3LZc= +github.com/secure-systems-lab/go-securesystemslib v0.9.0/go.mod h1:DVHKMcZ+V4/woA/peqr+L0joiRXbPpQ042GgJckkFgw= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/shirou/gopsutil/v3 v3.24.1 h1:R3t6ondCEvmARp3wxODhXMTLC/klMa87h2PHUw5m7QI= @@ -1353,8 +1353,8 @@ golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= -golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU= -golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0= +golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= +golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -1392,8 +1392,8 @@ golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI= -golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg= +golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= +golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1446,8 +1446,8 @@ golang.org/x/net v0.0.0-20210316092652-d523dce5a7f4/go.mod h1:RBQZq4jEuRlivfhVLd golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= -golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= +golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1572,8 +1572,8 @@ golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= -golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= +golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= +golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1582,8 +1582,8 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= -golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1650,8 +1650,8 @@ golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4f golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= -golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA= -golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc= +golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= +golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= golang.org/x/tools/go/expect v0.1.1-deprecated h1:jpBZDwmgPhXsKZC6WhL20P4b/wmnpsEAGHaNy0n/rJM= golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY= golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated h1:1h2MnaIAIXISqTFKdENegdpAgUXz6NrPEsbIeWaBRvM= diff --git a/go.work.sum b/go.work.sum index 7b83ae8981..4ee5910943 100644 --- a/go.work.sum +++ b/go.work.sum @@ -58,7 +58,6 @@ cloud.google.com/go/cloudtasks v1.13.0 h1:rKVSsQwh0CI68n3RalLoGuW7sOtq2eil2gVZK4 cloud.google.com/go/cloudtasks v1.13.0/go.mod h1:O1jFRGb1Vm3sN2u/tBdPiVGVTWIsrsbEs3K3N3nNlEU= cloud.google.com/go/compute v1.28.0/go.mod h1:DEqZBtYrDnD5PvjsKwb3onnhX+qjdCVM7eshj1XdjV4= cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= -cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= cloud.google.com/go/contactcenterinsights v1.14.0 h1:hZSiEb53tyULmOSBlDPhEWPEe+vQ0F2gPQjOka1E5oE= cloud.google.com/go/contactcenterinsights v1.14.0/go.mod h1:APmWYHDN4sASnUBnXs4o68t1EUfnqadA53//CzXZ1xE= cloud.google.com/go/container v1.39.0 h1:Q1oW01ENxkkG3uf1oYoTmHPdvP+yhFCIuCJ4mk2RwkQ= @@ -1304,6 +1303,7 @@ golang.org/x/arch v0.6.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys= golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc= +golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A= golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/image v0.0.0-20190802002840-cff245a6509b h1:+qEpEAPhDZ1o0x3tHzZTQDArnOixOzGD9HUJfcg0mb4= @@ -1331,6 +1331,7 @@ golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= @@ -1357,6 +1358,8 @@ golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/telemetry v0.0.0-20240521205824-bda55230c457 h1:zf5N6UOrA487eEFacMePxjXAJctxKmyjKUsjA11Uzuk= golang.org/x/telemetry v0.0.0-20240521205824-bda55230c457/go.mod h1:pRgIJT+bRLFKnoM1ldnzKoxTIn14Yxz928LQRYYgIN0= golang.org/x/telemetry v0.0.0-20251008203120-078029d740a8 h1:LvzTn0GQhWuvKH/kVRS3R3bVAsdQWI7hvfLHGgh9+lU= @@ -1365,6 +1368,7 @@ golang.org/x/telemetry v0.0.0-20251203150158-8fff8a5912fc h1:bH6xUXay0AIFMElXG2r golang.org/x/telemetry v0.0.0-20251203150158-8fff8a5912fc/go.mod h1:hKdjCMrbv9skySur+Nek8Hd0uJ0GuxJIoIX2payrIdQ= golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= @@ -1373,6 +1377,7 @@ golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= @@ -1383,7 +1388,6 @@ golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0 golang.org/x/tools v0.29.0/go.mod h1:KMQVMRsVxU6nHCFXrBPhDB8XncLNLM0lIy/F14RP588= golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY= golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= -gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/api v0.152.0/go.mod h1:3qNJX5eOmhiWYc67jRA/3GsDw97UFb5ivv7Y2PrriAY= google.golang.org/api v0.169.0 h1:QwWPy71FgMWqJN/l6jVlFHUa29a7dcUy02I8o799nPY= google.golang.org/api v0.169.0/go.mod h1:gpNOiMA2tZ4mf5R9Iwf4rK/Dcz0fbdIgWYWVoxmsyLg= diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 53e0a3c523..81832ae56c 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -58,7 +58,7 @@ func (d *Daemon) Start(ctx context.Context) error { ctx = ctrl.LoggerInto(ctx, logger) logger.Info("Starting Fleet daemon") - d.rcClient.Subscribe(state.ProductUpdaterAgent, handleInstallerConfigUpdate(ctx, func(configs map[string]installerConfig) error { + d.rcClient.Subscribe(state.ProductInstallerConfig, handleInstallerConfigUpdate(ctx, func(configs map[string]installerConfig) error { return d.handleConfigs(ctx, configs) })) d.rcClient.Subscribe(state.ProductUpdaterTask, handleUpdaterTaskUpdate(ctx, func(req remoteAPIRequest) error { @@ -128,7 +128,13 @@ func (d *Daemon) handleRemoteAPIRequest(ctx context.Context, req remoteAPIReques // resolveOperation looks up the installer config for the request, validates its single // DatadogAgent operation, and fills in the canonical GVK. Returns the operation ready for use. func (d *Daemon) resolveOperation(req remoteAPIRequest, signal string) (fleetManagementOperation, error) { - cfg, err := d.getConfig(req.ExpectedState.ExperimentConfig) + // get params version from req + id := req.Params.Version + if id == "" { + return fleetManagementOperation{}, fmt.Errorf("%s: version is required", signal) + } + // match to d.configs[params.version] to get config + cfg, err := d.getConfig(id) if err != nil { return fleetManagementOperation{}, fmt.Errorf("%s: %w", signal, err) } @@ -152,7 +158,7 @@ func (d *Daemon) resolveOperation(req remoteAPIRequest, signal string) (fleetMan // The second step updates the DDA status to running, recording the experiment ID and generation. func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { tempLogger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID) - tempLogger.Info("Starting DatadogAgent experiment", "config", req.ExpectedState.ExperimentConfig) + tempLogger.Info("Starting DatadogAgent experiment", "config", req.Params.Version) op, err := d.resolveOperation(req, "start DatadogAgent experiment") if err != nil { tempLogger.Error(err, "Failed to resolve operation") diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go index b898e8e177..4c688b5849 100644 --- a/pkg/fleet/daemon_test.go +++ b/pkg/fleet/daemon_test.go @@ -86,9 +86,7 @@ func testStartRequest() remoteAPIRequest { return remoteAPIRequest{ ID: "exp-abc", Method: methodStartDatadogAgentExperiment, - ExpectedState: expectedState{ - ExperimentConfig: "test-config", - }, + Params: experimentParams{Version: "test-config"}, } } @@ -97,7 +95,7 @@ func testStartRequest() remoteAPIRequest { func TestStartDatadogAgentExperiment_ConfigNotFound(t *testing.T) { d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) req := testStartRequest() - req.ExpectedState.ExperimentConfig = "nonexistent" + req.Params.Version = "nonexistent" assert.Error(t, d.startDatadogAgentExperiment(context.Background(), req)) } @@ -117,7 +115,7 @@ func TestStartDatadogAgentExperiment_NoDDAOperation(t *testing.T) { } d, _ := testDaemon(testDDAObject(""), configs) req := testStartRequest() - req.ExpectedState.ExperimentConfig = "no-dda-config" + req.Params.Version = "no-dda-config" assert.Error(t, d.startDatadogAgentExperiment(context.Background(), req)) } @@ -196,22 +194,76 @@ func TestStartDatadogAgentExperiment_Success_OverwritesPreviousExperiment(t *tes assert.NotEqual(t, "old-exp", dda.Status.Experiment.ID) } +// --- resolveOperation: params.version matching tests --- + +func TestStartDatadogAgentExperiment_VersionMatchesInstallerConfig(t *testing.T) { + // Simulates the real payload pairing: + // - installer_config has id "cyg9-1ztz-cdnn" with an APM-enabling patch + // - updater_task has params.version "cyg9-1ztz-cdnn" linking them together + configs := map[string]installerConfig{ + "aaaa-bbbb-cccc": { + ID: "aaaa-bbbb-cccc", + Operations: []fleetManagementOperation{ + { + Operation: OperationUpdate, + GroupVersionKind: testDDAGVK, + NamespacedName: testDDANSN, + Config: json.RawMessage(`{"spec":{"features":{"apm":{"enabled":true}}}}`), + }, + }, + }, + } + d, c := testDaemon(testDDAObject(""), configs) + req := remoteAPIRequest{ + ID: "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + Method: methodStartDatadogAgentExperiment, + Params: experimentParams{Version: "aaaa-bbbb-cccc"}, + ExpectedState: expectedState{ + Stable: "0.0.1", + StableConfig: "0.0.1", + ClientID: "aAbBcCdDeEfFgGhHiIjJk", + }, + } + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) + assert.Equal(t, req.ID, dda.Status.Experiment.ID) +} + +func TestStartDatadogAgentExperiment_EmptyVersion(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + req := testStartRequest() + req.Params.Version = "" + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), req)) +} + +func TestStartDatadogAgentExperiment_VersionMismatch(t *testing.T) { + // params.version doesn't match any installer config ID + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + req := testStartRequest() + req.Params.Version = "xxxx-yyyy-zzzz" // no config with this ID + err := d.startDatadogAgentExperiment(context.Background(), req) + assert.Error(t, err) + assert.Contains(t, err.Error(), "not found") +} + // --- stopDatadogAgentExperiment tests --- func testStopRequest() remoteAPIRequest { return remoteAPIRequest{ ID: "exp-abc", Method: methodStopDatadogAgentExperiment, - ExpectedState: expectedState{ - ExperimentConfig: "test-config", - }, + Params: experimentParams{Version: "test-config"}, } } func TestStopDatadogAgentExperiment_ConfigNotFound(t *testing.T) { d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) req := testStopRequest() - req.ExpectedState.ExperimentConfig = "nonexistent" + req.Params.Version = "nonexistent" assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), req)) } @@ -277,16 +329,14 @@ func testPromoteRequest() remoteAPIRequest { return remoteAPIRequest{ ID: "exp-abc", Method: methodPromoteDatadogAgentExperiment, - ExpectedState: expectedState{ - ExperimentConfig: "test-config", - }, + Params: experimentParams{Version: "test-config"}, } } func TestPromoteDatadogAgentExperiment_ConfigNotFound(t *testing.T) { d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) req := testPromoteRequest() - req.ExpectedState.ExperimentConfig = "nonexistent" + req.Params.Version = "nonexistent" assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), req)) } diff --git a/pkg/fleet/remote_config.go b/pkg/fleet/remote_config.go index b81c6dea96..44c95e0014 100644 --- a/pkg/fleet/remote_config.go +++ b/pkg/fleet/remote_config.go @@ -42,13 +42,13 @@ type fleetManagementOperation struct { // remoteAPIRequest is a task sent to the fleet daemon via RC. type remoteAPIRequest struct { - ID string `json:"id"` - Package string `json:"package_name"` - TraceID string `json:"trace_id"` - ParentSpanID string `json:"parent_span_id"` - ExpectedState expectedState `json:"expected_state"` - Method string `json:"method"` - Params json.RawMessage `json:"params"` + ID string `json:"id"` + Package string `json:"package_name"` + TraceID string `json:"trace_id"` + ParentSpanID string `json:"parent_span_id"` + ExpectedState expectedState `json:"expected_state"` + Method string `json:"method"` + Params experimentParams `json:"params"` } // expectedState describes the package state expected before executing the request. @@ -61,6 +61,11 @@ type expectedState struct { ClientID string `json:"client_id"` } +// experimentParams holds the parsed params for experiment methods. +type experimentParams struct { + Version string `json:"version"` +} + // handleInstallerConfigUpdate returns an RC subscription callback that parses // UPDATER_AGENT payloads and forwards them as a map[configID]installerConfig to h. func handleInstallerConfigUpdate(ctx context.Context, h func(map[string]installerConfig) error) func(map[string]state.RawConfig, func(string, state.ApplyStatus)) { @@ -68,7 +73,7 @@ func handleInstallerConfigUpdate(ctx context.Context, h func(map[string]installe logger := ctrl.LoggerFrom(ctx) configs := make(map[string]installerConfig, len(updates)) for cfgPath, raw := range updates { - logger.Info("Received UPDATER_AGENT payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) + logger.Info("Received INSTALLER_CONFIG payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) var cfg installerConfig if err := json.Unmarshal(raw.Config, &cfg); err != nil { diff --git a/pkg/fleet/remote_config_test.go b/pkg/fleet/remote_config_test.go index a9ba745aec..883e39edb6 100644 --- a/pkg/fleet/remote_config_test.go +++ b/pkg/fleet/remote_config_test.go @@ -41,7 +41,7 @@ var testInstallerConfig = installerConfig{ var testRemoteAPIRequest = remoteAPIRequest{ ID: "test", Method: "some_method", - Params: json.RawMessage(`{}`), + Params: experimentParams{}, } // callbackMock records calls made by the RC handler callbacks. @@ -151,6 +151,47 @@ func TestRemoteAPIRequest(t *testing.T) { cb.AssertCalled(t, "applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateAcknowledged}) } +func TestRemoteAPIRequestParsesParamsVersion(t *testing.T) { + // Verifies that a real UPDATER_TASK payload has params.version correctly parsed. + cb := &callbackMock{} + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) + + rawPayload := []byte(`{ + "id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + "package_name": "datadog-operator", + "trace_id": "12345678901234567890", + "parent_span_id": "11111111111111111111", + "expected_state": { + "stable": "0.0.1", + "experiment": "", + "stable_config": "0.0.1", + "client_id": "aAbBcCdDeEfFgGhHiIjJk" + }, + "method": "operator/start_datadogagent_experiment", + "params": { + "version": "aaaa-bbbb-cccc" + } + }`) + + updates := map[string]state.RawConfig{ + "datadog/2/UPDATER_TASK/a1b2c3d4/1234567890abcdef": {Config: rawPayload}, + } + + cb.On("handleRemoteAPIRequest", mock.MatchedBy(func(req remoteAPIRequest) bool { + return req.ID == "a1b2c3d4-e5f6-7890-abcd-ef1234567890" && + req.Method == "operator/start_datadogagent_experiment" && + req.Params.Version == "aaaa-bbbb-cccc" && + req.ExpectedState.StableConfig == "0.0.1" + })).Return(nil) + cb.On("applyStateCallback", mock.Anything, mock.Anything).Return() + + handler(updates, cb.applyStateCallback) + + cb.AssertCalled(t, "handleRemoteAPIRequest", mock.MatchedBy(func(req remoteAPIRequest) bool { + return req.Params.Version == "aaaa-bbbb-cccc" + })) +} + func TestRemoteAPIRequestBadConfig(t *testing.T) { cb := &callbackMock{} handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) From 6a42afc6331923ec57fd4c161c052bf4f1eadb1c Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:25:43 -0400 Subject: [PATCH 07/15] Logging + race --- api/datadoghq/v2alpha1/datadogagent_types.go | 7 -- .../v2alpha1/zz_generated.openapi.go | 7 -- .../bases/v1/datadoghq.com_datadogagents.yaml | 9 -- .../datadoghq.com_datadogagents_v2alpha1.json | 5 -- .../controller_experiment_integration_test.go | 79 +++++++--------- .../controller/datadogagent/experiment.go | 41 ++++++++- .../datadogagent/experiment_test.go | 89 +++++++++++++++---- pkg/fleet/daemon.go | 27 ++---- pkg/fleet/daemon_test.go | 14 ++- pkg/fleet/experiment.go | 5 ++ pkg/fleet/remote_config.go | 3 + 11 files changed, 160 insertions(+), 126 deletions(-) diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index 3929616e1d..21667f6d28 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -2395,13 +2395,6 @@ type ExperimentStatus struct { // ID is the unique experiment ID sent by Fleet Automation. // +optional ID string `json:"id,omitempty"` - // Generation is the DDA metadata.generation recorded when the experiment started. - // Used to detect manual spec changes while the experiment is running: if the - // current DDA generation differs from this value, the operator aborts the experiment. - // - // This value must be recorded after the DDA is patched for a startExperiment signal. - // +optional - Generation int64 `json:"generation,omitempty"` } // DatadogAgentStatus defines the observed state of DatadogAgent. diff --git a/api/datadoghq/v2alpha1/zz_generated.openapi.go b/api/datadoghq/v2alpha1/zz_generated.openapi.go index e35c857471..a5aa89253d 100644 --- a/api/datadoghq/v2alpha1/zz_generated.openapi.go +++ b/api/datadoghq/v2alpha1/zz_generated.openapi.go @@ -1142,13 +1142,6 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_ExperimentStatus(ref common. Format: "", }, }, - "generation": { - SchemaProps: spec.SchemaProps{ - Description: "Generation is the DDA metadata.generation recorded when the experiment started. Used to detect manual spec changes while the experiment is running: if the current DDA generation differs from this value, the operator aborts the experiment.\n\nThis value must be recorded after the DDA is patched for a startExperiment signal.", - Type: []string{"integer"}, - Format: "int64", - }, - }, }, }, }, diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index 22ecfd013a..09d72a666d 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -8492,15 +8492,6 @@ spec: experiment: description: Experiment tracks the state of an active or recent Fleet Automation experiment. properties: - generation: - description: |- - Generation is the DDA metadata.generation recorded when the experiment started. - Used to detect manual spec changes while the experiment is running: if the - current DDA generation differs from this value, the operator aborts the experiment. - - This value must be recorded after the DDA is patched for a startExperiment signal. - format: int64 - type: integer id: description: ID is the unique experiment ID sent by Fleet Automation. type: string diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index 84ba9c4e7d..ab6b77eb9d 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -8194,11 +8194,6 @@ "additionalProperties": false, "description": "Experiment tracks the state of an active or recent Fleet Automation experiment.", "properties": { - "generation": { - "description": "Generation is the DDA metadata.generation recorded when the experiment started.\nUsed to detect manual spec changes while the experiment is running: if the\ncurrent DDA generation differs from this value, the operator aborts the experiment.\n\nThis value must be recorded after the DDA is patched for a startExperiment signal.", - "format": "int64", - "type": "integer" - }, "id": { "description": "ID is the unique experiment ID sent by Fleet Automation.", "type": "string" diff --git a/internal/controller/datadogagent/controller_experiment_integration_test.go b/internal/controller/datadogagent/controller_experiment_integration_test.go index 399685ee25..139bcb5b01 100644 --- a/internal/controller/datadogagent/controller_experiment_integration_test.go +++ b/internal/controller/datadogagent/controller_experiment_integration_test.go @@ -79,9 +79,8 @@ func Test_Experiment_StoppedRollback(t *testing.T) { // RC writes phase=stopped. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseStopped, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) @@ -123,9 +122,8 @@ func Test_Experiment_TimeoutRollback(t *testing.T) { // RC writes phase=running. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) @@ -163,39 +161,31 @@ func Test_Experiment_AbortOnManualChange(t *testing.T) { assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) - // The fake client does not auto-increment Generation on Update, so revisions - // end up with a zero CreationTimestamp (fake-client limitation). Patch them to - // a recent time so the timeout path in handleRollback is not accidentally - // triggered before the abort check runs. + // Patch revision timestamps to a recent time so the timeout path in + // handleRollback is not accidentally triggered before the abort check runs. for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { rev.CreationTimestamp = metav1.Now() assert.NoError(t, r.client.Update(context.TODO(), &rev)) } assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - // The fake client never increments Generation, so dda.Generation is always 0. - // Set experiment.Generation to a non-zero sentinel so the abortExperiment - // generation-mismatch check (instance.Generation != experiment.Generation) fires. - const experimentGen = int64(2) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: experimentGen, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) - // User manually changes the spec — in a real cluster this bumps Generation past - // experimentGen. In the fake client Generation stays at 0, which differs from - // the experimentGen sentinel above, so the mismatch is already in place. + // User manually changes the spec — the new spec won't match any known revision, + // so abortExperiment detects it as a manual change. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = ptr.To("datadoghq.com") + dda.Spec.Global.Site = ptr.To("manual-change.example.com") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) // Spec should be the user's manual change, not rolled back. - assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.Equal(t, "manual-change.example.com", *dda.Spec.Global.Site) assert.NotNil(t, dda.Status.Experiment) assert.Equal(t, v2alpha1.ExperimentPhaseAborted, dda.Status.Experiment.Phase) } @@ -232,9 +222,8 @@ func Test_Experiment_TimeoutPhase_IsStable(t *testing.T) { assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) time.Sleep(2 * timeout) @@ -269,9 +258,8 @@ func Test_Experiment_RollbackPhase_IsStable(t *testing.T) { assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseStopped, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 2) @@ -286,12 +274,12 @@ func Test_Experiment_RollbackPhase_IsStable(t *testing.T) { assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) } -// Test_Experiment_RunningAfterTimeout_StaleGeneration verifies that if RC -// writes phase=running after a timeout rollback has completed, the operator -// fires timeout again idempotently: the pre-experiment revision is old enough -// to exceed the timeout threshold, rollback is a no-op (spec already correct), -// and phase=timeout is written again. -func Test_Experiment_RunningAfterTimeout_StaleGeneration(t *testing.T) { +// Test_Experiment_RunningAfterTimeout verifies that if RC writes phase=running +// after a timeout rollback has completed, the operator fires timeout again +// idempotently: the pre-experiment revision is old enough to exceed the timeout +// threshold, rollback is a no-op (spec already correct), and phase=timeout is +// written again. +func Test_Experiment_RunningAfterTimeout(t *testing.T) { const ns, name = "default", "test-dda" const uid = types.UID("uid-1") const timeout = 50 * time.Millisecond @@ -309,22 +297,19 @@ func Test_Experiment_RunningAfterTimeout_StaleGeneration(t *testing.T) { assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) time.Sleep(2 * timeout) reconcileN(t, r, ns, name, 2) assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) - // RC writes phase=running again with the old (pre-rollback) generation — stale. + // RC writes phase=running again after the rollback already completed. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - const staleGen = int64(99) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: staleGen, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) @@ -355,9 +340,8 @@ func Test_Experiment_StoppedAfterRollback(t *testing.T) { assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseStopped, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 2) @@ -397,9 +381,8 @@ func Test_Experiment_AbortDoesNotRollback(t *testing.T) { // Manually force phase=aborted (as if abort already happened). assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseAborted, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseAborted, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) diff --git a/internal/controller/datadogagent/experiment.go b/internal/controller/datadogagent/experiment.go index 7becd9a4c4..045565b720 100644 --- a/internal/controller/datadogagent/experiment.go +++ b/internal/controller/datadogagent/experiment.go @@ -43,19 +43,20 @@ func (r *Reconciler) manageExperiment( if err := r.handleRollback(ctx, instance, newStatus, now, revList); err != nil { return err } - abortExperiment(ctx, instance.Generation, experiment, newStatus) + abortExperiment(ctx, instance, experiment, newStatus, revList) return nil } // abortExperiment marks the experiment as aborted in newStatus if a manual spec -// change is detected (DDA generation differs from the recorded experiment generation). +// change is detected (current spec doesn't match any known ControllerRevision). // It is a no-op if handleRollback has already set a terminal phase (e.g. timeout), // preventing spurious abort logs and phase overwrites. func abortExperiment( ctx context.Context, - instanceGeneration int64, + instance *v2alpha1.DatadogAgent, experiment *v2alpha1.ExperimentStatus, newStatus *v2alpha1.DatadogAgentStatus, + revisions []appsv1.ControllerRevision, ) { if experiment.Phase != v2alpha1.ExperimentPhaseRunning { return @@ -64,7 +65,22 @@ func abortExperiment( // handleRollback already determined a terminal phase (e.g. timeout); don't overwrite or log. return } - if experiment.Generation == 0 || instanceGeneration == experiment.Generation { + // On the first reconcile after experiment start, the new revision hasn't + // been created yet (manageExperiment runs before manageRevision). With + // only one revision (the pre-experiment baseline), the current spec won't + // match it — but that's expected, not a manual change. Skip the check + // when fewer than 2 revisions exist. + if len(revisions) < 2 { + return + } + if findMostRecentMatchingRevision(revisions, instance) != nil { + // Spec matches a known revision — no manual change detected. + // Edge case: if the user manually reverts to the pre-experiment spec, it + // matches the baseline revision, so abort does not fire. The experiment + // still terminates via timeout (the baseline revision's old timestamp + // exceeds the timeout threshold), and the rollback is a no-op because + // the spec already matches the target. The phase will read "timeout" + // rather than "aborted", but the end state is correct. return } ctrl.LoggerFrom(ctx).Info("Aborting experiment due to manual spec change") @@ -93,6 +109,12 @@ func (r *Reconciler) handleRollback( return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseRollback) case phase == v2alpha1.ExperimentPhaseRunning: rev := findMostRecentMatchingRevision(revisions, instance) + if rev == nil && len(revisions) >= 2 { + // Spec was manually changed — no revision matches the current spec. + // Fall back to the highest-numbered revision (the experiment revision) + // so we can still detect timeout even after a manual spec change. + rev = highestRevision(revisions) + } if rev != nil { elapsed := now.Sub(rev.CreationTimestamp.Time) if elapsed >= getExperimentTimeout(r.options.ExperimentTimeout) { @@ -220,6 +242,17 @@ func findMostRecentMatchingRevision(revisions []appsv1.ControllerRevision, insta return result } +// highestRevision returns the revision with the largest Revision number. +func highestRevision(revisions []appsv1.ControllerRevision) *appsv1.ControllerRevision { + var result *appsv1.ControllerRevision + for i := range revisions { + if result == nil || revisions[i].Revision > result.Revision { + result = &revisions[i] + } + } + return result +} + func getExperimentTimeout(timeout time.Duration) time.Duration { if timeout == 0 { return ExperimentDefaultTimeout diff --git a/internal/controller/datadogagent/experiment_test.go b/internal/controller/datadogagent/experiment_test.go index be255f20b8..233344796e 100644 --- a/internal/controller/datadogagent/experiment_test.go +++ b/internal/controller/datadogagent/experiment_test.go @@ -21,24 +21,78 @@ import ( func TestManageExperiment_AbortsOnManualChange(t *testing.T) { r, _ := newRevisionTestReconciler(t) - instance := newRevisionTestOwner("test-dda", "default") - instance.Generation = 3 - instance.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - instance.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: 2, + // Create two revisions: pre-experiment (specA) and experiment (specB). + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + + // Simulate a manual spec change: specC doesn't match any revision. + instanceC := newRevisionTestOwner("test-dda", "default") + manualSite := "manual-change.example.com" + instanceC.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &manualSite}} + instanceC.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + } + + // Set recent timestamps so the timeout path in handleRollback does not fire. + revList := mustListRevisions(t, r, instanceC) + for i := range revList { + revList[i].CreationTimestamp = metav1.Now() } status := &v2alpha1.DatadogAgentStatus{ - Experiment: instance.Status.Experiment.DeepCopy(), + Experiment: instanceC.Status.Experiment.DeepCopy(), } - err := r.manageExperiment(context.Background(), instance, status, metav1.Now(), mustListRevisions(t, r, instance)) + err := r.manageExperiment(context.Background(), instanceC, status, metav1.Now(), revList) require.NoError(t, err) require.NotNil(t, status.Experiment) assert.Equal(t, v2alpha1.ExperimentPhaseAborted, status.Experiment.Phase) } +// TestManageExperiment_ManualRevertToBaselineTerminatesViaTimeout verifies that +// when the user manually reverts the spec to the pre-experiment value during a +// running experiment, the experiment terminates via timeout rather than abort. +// The revision-based abort check sees the spec matching the baseline revision +// and treats it as a known state. The timeout path fires because the baseline +// revision's old timestamp exceeds the threshold. The rollback is a no-op +// (spec already matches target), and the phase is set to "timeout". +func TestManageExperiment_ManualRevertToBaselineTerminatesViaTimeout(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Rev1: pre-experiment spec (specA). + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + // Rev2: experiment spec (specB). + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + require.NoError(t, c.Create(context.Background(), instanceA)) + + // User manually reverts to specA. The spec matches rev1 (the baseline), + // so abortExperiment won't fire. Instead, handleRollback detects timeout + // from rev1's old CreationTimestamp. + instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + } + + revList := mustListRevisions(t, r, instanceA) + // Ensure the baseline revision's timestamp is old enough to trigger timeout. + for i := range revList { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Minute)) + } + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceA.Status.Experiment.DeepCopy()} + require.NoError(t, r.manageExperiment(context.Background(), instanceA, newStatus, metav1.Now(), revList)) + // Abort does not fire — spec matches a known revision. Timeout fires instead + // because the matching revision's timestamp exceeds the threshold. + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, newStatus.Experiment.Phase) +} + func TestRollback_RestoresSpec(t *testing.T) { r, c := newRevisionTestReconciler(t) @@ -172,14 +226,14 @@ func TestManageExperiment_TimeoutWinsOverSpuriousAbort(t *testing.T) { require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) require.NoError(t, c.Create(context.Background(), instanceB)) - // Simulate a post-409 reconcile: phase=running, but generation was bumped by the - // rollback spec update (instanceB.Generation != experiment.Generation), AND timeout - // has elapsed. abortExperiment would fire for the generation mismatch, but - // handleRollback must win and persist phase=timeout. - instanceB.Generation = 99 + // Simulate a post-409 reconcile: phase=running, the spec was manually changed + // so it doesn't match any revision, AND timeout has elapsed. abortExperiment + // would fire for the revision mismatch, but handleRollback must win and + // persist phase=timeout. + manualSite := "manual-change.example.com" + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &manualSite}} instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: 1, + Phase: v2alpha1.ExperimentPhaseRunning, } revList := mustListRevisions(t, r, instanceB) @@ -236,11 +290,8 @@ func TestHandleRollback_PostRollbackSetsTimeout(t *testing.T) { // The DDA in the cluster already has the rolled-back spec (instanceA's spec), // as if reconcile-1 restored it but its status write 409'd. - // Generation is set to a realistic non-zero value: RC would have recorded - // instanceA.Generation when the experiment started. instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: instanceA.Generation, + Phase: v2alpha1.ExperimentPhaseRunning, } require.NoError(t, c.Create(context.Background(), instanceA)) diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 81832ae56c..05edda8475 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -155,17 +155,17 @@ func (d *Daemon) resolveOperation(req remoteAPIRequest, signal string) (fleetMan // startDatadogAgentExperiment starts a DatadogAgent experiment. // The first step updates the DDA spec with the experiment configuration. -// The second step updates the DDA status to running, recording the experiment ID and generation. +// The second step updates the DDA status to running, recording the experiment ID. func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { - tempLogger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID) - tempLogger.Info("Starting DatadogAgent experiment", "config", req.Params.Version) + logger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID) + logger.Info("Starting DatadogAgent experiment", "config", req.Params.Version) op, err := d.resolveOperation(req, "start DatadogAgent experiment") if err != nil { - tempLogger.Error(err, "Failed to resolve operation") + logger.Error(err, "Failed to resolve operation") return err } - logger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name) + logger = logger.WithValues("namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name) ctx = ctrl.LoggerInto(ctx, logger) logger.Info("Starting k8s resource patch", "config", op.Config) @@ -187,31 +187,22 @@ func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIR return fmt.Errorf("start DatadogAgent experiment: failed to patch spec: %w", err) } - // Re-fetch to get the generation that was bumped by the spec patch. - if err := retryWithBackoff(ctx, func() error { - return d.client.Get(ctx, op.NamespacedName, dda) - }); err != nil { - return fmt.Errorf("start DatadogAgent experiment: failed to re-fetch DatadogAgent: %w", err) - } - newGeneration := dda.Generation - - // Update status: phase=running, record experiment ID and new generation. + // Update status: phase=running, record experiment ID. // Re-fetch inside the retry to get the latest ResourceVersion on conflict. if err := retryWithBackoff(ctx, func() error { if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { return err } dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - ID: req.ID, - Generation: newGeneration, + Phase: v2alpha1.ExperimentPhaseRunning, + ID: req.ID, } return d.client.Status().Update(ctx, dda) }); err != nil { return fmt.Errorf("start DatadogAgent experiment: failed to update status: %w", err) } - logger.Info("Started DatadogAgent experiment", "generation", newGeneration) + logger.Info("Started DatadogAgent experiment") return nil } diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go index 4c688b5849..e07156b71d 100644 --- a/pkg/fleet/daemon_test.go +++ b/pkg/fleet/daemon_test.go @@ -55,13 +55,12 @@ var testDDANSN = types.NamespacedName{Namespace: "datadog", Name: "datadog-agent func testDDAObject(phase v2alpha1.ExperimentPhase) *v2alpha1.DatadogAgent { dda := &v2alpha1.DatadogAgent{ ObjectMeta: metav1.ObjectMeta{ - Name: testDDANSN.Name, - Namespace: testDDANSN.Namespace, - Generation: 1, + Name: testDDANSN.Name, + Namespace: testDDANSN.Namespace, }, } if phase != "" { - dda.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase, ID: "old-exp", Generation: 1} + dda.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase, ID: "old-exp"} } return dda } @@ -144,7 +143,6 @@ func TestStartDatadogAgentExperiment_Success_NilPhase(t *testing.T) { require.NotNil(t, dda.Status.Experiment) assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) assert.Equal(t, req.ID, dda.Status.Experiment.ID) - assert.NotZero(t, dda.Status.Experiment.Generation) } func TestStartDatadogAgentExperiment_Success_FromRollback(t *testing.T) { @@ -318,9 +316,8 @@ func TestStopDatadogAgentExperiment_Success_Running(t *testing.T) { require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) require.NotNil(t, dda.Status.Experiment) assert.Equal(t, v2alpha1.ExperimentPhaseStopped, dda.Status.Experiment.Phase) - // ID and generation must be preserved. + // ID must be preserved. assert.Equal(t, "old-exp", dda.Status.Experiment.ID) - assert.Equal(t, int64(1), dda.Status.Experiment.Generation) } // --- promoteDatadogAgentExperiment tests --- @@ -390,9 +387,8 @@ func TestPromoteDatadogAgentExperiment_Success_Running(t *testing.T) { require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) require.NotNil(t, dda.Status.Experiment) assert.Equal(t, v2alpha1.ExperimentPhasePromoted, dda.Status.Experiment.Phase) - // ID and generation must be preserved. + // ID must be preserved. assert.Equal(t, "old-exp", dda.Status.Experiment.ID) - assert.Equal(t, int64(1), dda.Status.Experiment.Generation) } // --- validateOperation tests --- diff --git a/pkg/fleet/experiment.go b/pkg/fleet/experiment.go index 149c083e7f..64bdd26b32 100644 --- a/pkg/fleet/experiment.go +++ b/pkg/fleet/experiment.go @@ -110,9 +110,14 @@ var experimentBackoff = wait.Backoff{ // retryWithBackoff retries fn on any error with exponential backoff. // The total retry window is bounded by a 3-minute context timeout. func retryWithBackoff(ctx context.Context, fn func() error) error { + logger := ctrl.LoggerFrom(ctx) + logger.Info("Retrying with backoff", "backoff", experimentBackoff) ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) defer cancel() + logger.Info("Context with timeout", "timeout", 3*time.Minute) return retry.OnError(experimentBackoff, func(err error) bool { + logger.Info("Error", "error", err) + logger.Info("Context error", "error", ctx.Err()) return ctx.Err() == nil }, fn) } diff --git a/pkg/fleet/remote_config.go b/pkg/fleet/remote_config.go index 44c95e0014..248ad0a9b5 100644 --- a/pkg/fleet/remote_config.go +++ b/pkg/fleet/remote_config.go @@ -108,12 +108,14 @@ func handleUpdaterTaskUpdate(ctx context.Context, h func(remoteAPIRequest) error var req remoteAPIRequest if err := json.Unmarshal(raw.Config, &req); err != nil { + logger.Error(err, "Failed to unmarshal remote API request") applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateError, Error: fmt.Sprintf("failed to unmarshal remote API request: %v", err)}) continue } if _, ok := seen[req.ID]; ok { // Already executed; acknowledge without re-running. + logger.Info("Remote API request already executed", "id", req.ID) applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateAcknowledged}) continue } @@ -122,6 +124,7 @@ func handleUpdaterTaskUpdate(ctx context.Context, h func(remoteAPIRequest) error applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateUnacknowledged}) if err := h(req); err != nil { + logger.Error(err, "Failed to handle remote API request") applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateError, Error: err.Error()}) continue } From b6d8b7f93b2e549b771fc3d6f894090452544aac Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Mon, 13 Apr 2026 18:44:25 -0400 Subject: [PATCH 08/15] More edge cases --- .../controller_experiment_integration_test.go | 264 ++++++++++++++++++ .../controller/datadogagent/experiment.go | 59 +++- .../datadogagent/experiment_test.go | 158 +++++++++-- internal/controller/datadogagent/revision.go | 120 ++++---- .../controller/datadogagent/revision_test.go | 140 +++++----- 5 files changed, 593 insertions(+), 148 deletions(-) diff --git a/internal/controller/datadogagent/controller_experiment_integration_test.go b/internal/controller/datadogagent/controller_experiment_integration_test.go index 139bcb5b01..9d28e1ac5e 100644 --- a/internal/controller/datadogagent/controller_experiment_integration_test.go +++ b/internal/controller/datadogagent/controller_experiment_integration_test.go @@ -396,3 +396,267 @@ func Test_Experiment_AbortDoesNotRollback(t *testing.T) { // Also verify the revision timestamp is not used as a proxy for time.Now() comparison. _ = metav1.Now() } + +// Test_Experiment_Abort_AnnotatesOnlyExperimentRevision verifies that when an +// experiment is aborted due to a manual spec change: +// - The experiment revision (highest at the time of abort) is annotated. +// - The new revision created by manageRevision for the manual-change spec +// is NOT annotated. +// - Subsequent reconciles do not re-annotate or spread the annotation. +func Test_Experiment_Abort_AnnotatesOnlyExperimentRevision(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: baseline. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // Record the experiment revision name (highest Revision number). + var experimentRevName string + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if experimentRevName == "" || rev.Revision > 0 { + experimentRevName = rev.Name + } + } + maxRev := int64(0) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Revision > maxRev { + maxRev = rev.Revision + experimentRevName = rev.Name + } + } + + // Patch timestamps so timeout doesn't fire before abort. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + + // Set phase=running. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // User manually changes the spec — triggers abort. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("manual-change.example.com") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + + // First reconcile: abort fires (manageExperiment), then manageRevision + // creates rev3 for the manual-change spec. + reconcileN(t, r, ns, name, 1) + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, mustGetExperimentPhase(t, r, ns, name)) + + // Verify: only the experiment revision is annotated, not the new one. + revs := listOwnedRevisions(t, r.client, ns, uid) + for _, rev := range revs { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentRollback], + "experiment revision %s should be annotated", rev.Name) + } else { + assert.NotEqual(t, "true", rev.Annotations[annotationExperimentRollback], + "non-experiment revision %s should NOT be annotated", rev.Name) + } + } + + // Run additional reconciles — annotation state must be stable. + reconcileN(t, r, ns, name, 3) + + revs = listOwnedRevisions(t, r.client, ns, uid) + annotatedNames := []string{} + for _, rev := range revs { + if rev.Annotations[annotationExperimentRollback] == "true" { + annotatedNames = append(annotatedNames, rev.Name) + } + } + assert.Equal(t, []string{experimentRevName}, annotatedNames, + "after multiple reconciles, only the original experiment revision should remain annotated") +} + +// Test_Experiment_StoppedRollback_AnnotatesOnlyExperimentRevision verifies +// that when an experiment is rolled back via phase=stopped: +// - The experiment revision is annotated with experiment-rollback. +// - The rollback target (baseline) revision is NOT annotated. +// - Subsequent reconciles do not spread or remove the annotation. +func Test_Experiment_StoppedRollback_AnnotatesOnlyExperimentRevision(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: baseline. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Record baseline revision name. + baselineRevName := listOwnedRevisions(t, r.client, ns, uid)[0].Name + + // Rev2: experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // Identify experiment revision. + var experimentRevName string + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Name != baselineRevName { + experimentRevName = rev.Name + } + } + assert.NotEmpty(t, experimentRevName, "should have an experiment revision") + + // RC writes phase=stopped → triggers rollback. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 2) + + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) + + // Verify: experiment revision annotated, baseline NOT annotated. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentRollback], + "experiment revision %s should be annotated after rollback", rev.Name) + } else { + assert.NotEqual(t, "true", rev.Annotations[annotationExperimentRollback], + "baseline revision %s should NOT be annotated", rev.Name) + } + } + + // Run additional reconciles — annotation state must be stable. + reconcileN(t, r, ns, name, 3) + + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentRollback], + "experiment revision %s should still be annotated after extra reconciles", rev.Name) + } else { + assert.NotEqual(t, "true", rev.Annotations[annotationExperimentRollback], + "baseline revision %s should still NOT be annotated after extra reconciles", rev.Name) + } + } +} + +// Test_Experiment_ReapplySameSpec_NoImmediateTimeout verifies the full +// annotation-based revision recreate flow end-to-end: +// +// 1. Baseline spec → experiment spec → timeout rollback. +// 2. Rollback annotates the experiment revision with experiment-rollback=true. +// 3. Re-apply the same experiment spec. +// 4. ensureRevision creates a new revision for the experiment spec (the content +// hash may differ from the original due to defaulting, but if it matches, +// the annotated revision is deleted+recreated with a fresh timestamp). +// 5. A subsequent reconcile with phase=running does NOT immediately timeout. +// +// NOTE: The fake client doesn't set CreationTimestamp on Create (unlike the +// real API server), so we manually patch timestamps to simulate fresh +// revisions after the re-apply reconcile. +func Test_Experiment_ReapplySameSpec_NoImmediateTimeout(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + // Use a short timeout for the initial experiment so it times out quickly, + // then switch to a long timeout for the re-applied experiment so we can + // assert it does NOT timeout within a single reconcile. + const shortTimeout = 50 * time.Millisecond + const longTimeout = 5 * time.Second + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, shortTimeout) + + // Step 1: Create baseline (rev1). + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Step 2: Apply experiment spec (rev2). + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // Step 3: Set phase=running and let it timeout. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + time.Sleep(2 * shortTimeout) + reconcileN(t, r, ns, name, 2) + + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site, "spec should be rolled back") + + // Verify the experiment revision is annotated as rolled back. + revs := listOwnedRevisions(t, r.client, ns, uid) + var annotatedCount int + for _, rev := range revs { + if rev.Annotations[annotationExperimentRollback] == "true" { + annotatedCount++ + } + } + assert.Equal(t, 1, annotatedCount, "exactly one revision should have the rollback annotation") + + // Switch to a long timeout so the re-applied experiment doesn't timeout + // within the reconcile's own execution time. + r.options.ExperimentTimeout = longTimeout + + // Step 4: Clear experiment status (simulating fleet daemon acknowledging the + // rollback) and re-apply the same experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = nil + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + + // This reconcile triggers ensureRevision which either: + // - matches the annotated revision (same content hash) → delete+recreate, or + // - creates a new revision (different hash due to defaulting differences). + // Either way, the current revision for this spec has no rollback annotation. + reconcileN(t, r, ns, name, 1) + + // Step 5: Set phase=running again. Patch all revision timestamps to now so + // the timeout check works correctly with the fake client (which doesn't set + // CreationTimestamp on Create like the real API server). + revs = listOwnedRevisions(t, r.client, ns, uid) + assert.Len(t, revs, 2) + for i := range revs { + revs[i].CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &revs[i])) + } + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-2", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Reconcile — should NOT timeout because the revision is fresh. + reconcileN(t, r, ns, name, 1) + + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, mustGetExperimentPhase(t, r, ns, name), + "experiment should still be running — no immediate timeout after reapply") +} diff --git a/internal/controller/datadogagent/experiment.go b/internal/controller/datadogagent/experiment.go index 045565b720..ce7c189eae 100644 --- a/internal/controller/datadogagent/experiment.go +++ b/internal/controller/datadogagent/experiment.go @@ -17,6 +17,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" ) @@ -24,6 +25,13 @@ import ( // ExperimentDefaultTimeout is the duration after which a running experiment is automatically rolled back. const ExperimentDefaultTimeout = 15 * time.Minute +// annotationExperimentRollback marks a ControllerRevision whose experiment was +// rejected (rollback, timeout, or abort). The annotation tells handleRollback +// to skip the timeout check (the CreationTimestamp is stale from the previous +// experiment) and tells ensureRevision to delete+recreate the revision so it +// gets a fresh timestamp when the same spec is re-applied. +const annotationExperimentRollback = "operator.datadoghq.com/experiment-rollback" + // manageExperiment handles all experiment state transitions for a reconcile cycle. // Must be called before manageRevision. func (r *Reconciler) manageExperiment( @@ -43,7 +51,7 @@ func (r *Reconciler) manageExperiment( if err := r.handleRollback(ctx, instance, newStatus, now, revList); err != nil { return err } - abortExperiment(ctx, instance, experiment, newStatus, revList) + r.abortExperiment(ctx, instance, experiment, newStatus, revList) return nil } @@ -51,7 +59,7 @@ func (r *Reconciler) manageExperiment( // change is detected (current spec doesn't match any known ControllerRevision). // It is a no-op if handleRollback has already set a terminal phase (e.g. timeout), // preventing spurious abort logs and phase overwrites. -func abortExperiment( +func (r *Reconciler) abortExperiment( ctx context.Context, instance *v2alpha1.DatadogAgent, experiment *v2alpha1.ExperimentStatus, @@ -85,6 +93,11 @@ func abortExperiment( } ctrl.LoggerFrom(ctx).Info("Aborting experiment due to manual spec change") newStatus.Experiment.Phase = v2alpha1.ExperimentPhaseAborted + // Mark the experiment revision (highest-numbered) so its stale timestamp + // doesn't cause an immediate timeout if the same spec is re-applied. + if rev := highestRevision(revisions); rev != nil { + r.markRevisionRolledBack(ctx, rev) + } } // handleRollback checks if the experiment needs rollback (explicit stop or timeout). @@ -115,7 +128,7 @@ func (r *Reconciler) handleRollback( // so we can still detect timeout even after a manual spec change. rev = highestRevision(revisions) } - if rev != nil { + if rev != nil && rev.Annotations[annotationExperimentRollback] != "true" { elapsed := now.Sub(rev.CreationTimestamp.Time) if elapsed >= getExperimentTimeout(r.options.ExperimentTimeout) { logger.Info("Experiment timed out, rolling back", "elapsed", elapsed.String()) @@ -128,7 +141,10 @@ func (r *Reconciler) handleRollback( } // restorePreviousSpec restores the DDA spec from the previous ControllerRevision -// and, on success, sets the terminal experiment phase. +// and, on success, sets the terminal experiment phase. It also marks the +// experiment revision (the highest-numbered, non-rollback-target revision) with +// the rollback annotation so its stale CreationTimestamp doesn't cause an +// immediate timeout if the same spec is re-applied later. func (r *Reconciler) restorePreviousSpec( ctx context.Context, instanceMeta metav1.ObjectMeta, @@ -136,10 +152,20 @@ func (r *Reconciler) restorePreviousSpec( revisions []appsv1.ControllerRevision, terminalPhase v2alpha1.ExperimentPhase, ) error { - if err := r.rollback(ctx, instanceMeta, findRollbackTarget(revisions)); err != nil { + rollbackTarget := findRollbackTarget(revisions) + if err := r.rollback(ctx, instanceMeta, rollbackTarget); err != nil { return err } newStatus.Experiment.Phase = terminalPhase + // Mark the experiment revision (highest-numbered) so its stale timestamp + // doesn't cause an immediate timeout if the same spec is re-applied. + // Only annotate the highest revision rather than all non-rollback-target + // revisions: if GC failed on a prior reconcile there may be 3+ revisions, + // and annotating old baselines would cause needless delete+recreate in + // ensureRevision if those specs are ever re-applied. + if rev := highestRevision(revisions); rev != nil && rev.Name != rollbackTarget { + r.markRevisionRolledBack(ctx, rev) + } return nil } @@ -150,7 +176,8 @@ func (r *Reconciler) rollback( rollbackTarget string, ) error { if rollbackTarget == "" { - return fmt.Errorf("no previous revision to roll back to") + ctrl.LoggerFrom(ctx).Info("No previous revision to roll back to, skipping spec restore") + return nil } cr := &appsv1.ControllerRevision{} @@ -253,6 +280,26 @@ func highestRevision(revisions []appsv1.ControllerRevision) *appsv1.ControllerRe return result } +// markRevisionRolledBack annotates a ControllerRevision with the rollback +// annotation. This is best-effort: if the patch fails, the stale timestamp +// remains but is no worse than the existing behavior. +func (r *Reconciler) markRevisionRolledBack(ctx context.Context, rev *appsv1.ControllerRevision) { + if rev.Annotations[annotationExperimentRollback] == "true" { + return // already annotated + } + logger := ctrl.LoggerFrom(ctx).WithValues( + "object.kind", "ControllerRevision", + "object.namespace", rev.Namespace, + "object.name", rev.Name, + ) + patch := []byte(`{"metadata":{"annotations":{"` + annotationExperimentRollback + `":"true"}}}`) + if err := r.client.Patch(ctx, rev, client.RawPatch(types.MergePatchType, patch)); err != nil { + logger.Error(err, "Failed to annotate experiment revision as rolled back") + return + } + logger.Info("Marked experiment revision as rolled back") +} + func getExperimentTimeout(timeout time.Duration) time.Duration { if timeout == 0 { return ExperimentDefaultTimeout diff --git a/internal/controller/datadogagent/experiment_test.go b/internal/controller/datadogagent/experiment_test.go index 233344796e..bc62995e91 100644 --- a/internal/controller/datadogagent/experiment_test.go +++ b/internal/controller/datadogagent/experiment_test.go @@ -123,8 +123,7 @@ func TestRollback_NoPreviousRevision(t *testing.T) { instance := newRevisionTestOwner("test-dda", "default") err := r.rollback(context.Background(), instance.ObjectMeta, "") - require.Error(t, err) - assert.Contains(t, err.Error(), "no previous revision") + require.NoError(t, err) } func TestHandleRollback_StoppedPhase(t *testing.T) { @@ -311,16 +310,13 @@ func TestHandleRollback_PostRollbackSetsTimeout(t *testing.T) { // TestReapplySameSpecAfterRollback_NoImmediateTimeout is the end-to-end // regression test for the stale-revision bug. // -// Without the fix: gcOldRevisions kept the experiment revision (rev2/B) after -// rollback. When RC later re-applied spec B as a new experiment and set -// phase=Running, findMostRecentMatchingRevision found the stale rev2 — its -// CreationTimestamp predated the current experiment — so elapsed >= timeout -// fired immediately and the operator rolled back again, making the re-apply -// appear to have no effect. +// Without the fix: the stale experiment revision's old CreationTimestamp caused +// an immediate timeout when the same spec was re-applied as a new experiment. // -// With the fix: gcOldRevisions deletes rev2 once phase=Rollback is persisted. -// Re-applying spec B creates a fresh revision with a current timestamp, and -// the timeout clock starts correctly. +// With the fix: restorePreviousSpec annotates the experiment revision with the +// rollback annotation. handleRollback skips the timeout check for annotated +// revisions. ensureRevision deletes+recreates the annotated revision with a +// fresh timestamp when the spec is re-applied. func TestReapplySameSpecAfterRollback_NoImmediateTimeout(t *testing.T) { r, c := newRevisionTestReconciler(t) @@ -343,35 +339,147 @@ func TestReapplySameSpecAfterRollback_NoImmediateTimeout(t *testing.T) { } // Rollback: RC sets phase=Stopped; operator restores spec A. + // restorePreviousSpec annotates the experiment revision (B) as rolled back. instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped} rollbackStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} require.NoError(t, r.handleRollback(context.Background(), instanceB, rollbackStatus, metav1.Now(), revList)) require.Equal(t, v2alpha1.ExperimentPhaseRollback, rollbackStatus.Experiment.Phase) - // Next reconcile: Rollback phase is now persisted in instance.Status. - // gcOldRevisions must delete the stale rev2 (B). - instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRollback} - rollbackNewStatus := &v2alpha1.DatadogAgentStatus{Experiment: &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRollback}} - require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), rollbackNewStatus)) - + // Verify the experiment revision was annotated. remaining := mustListRevisions(t, r, instanceA) - require.Len(t, remaining, 1, "stale experiment revision (spec B) must be deleted once Rollback phase is persisted") + require.Len(t, remaining, 2, "both revisions should be kept (no aggressive GC)") + var annotatedCount int + for _, rev := range remaining { + if rev.Annotations[annotationExperimentRollback] == "true" { + annotatedCount++ + } + } + assert.Equal(t, 1, annotatedCount, "exactly one revision should have the rollback annotation") // RC re-applies spec B as a new experiment (sets spec=B, phase=Running). - // In the real reconcile loop manageExperiment (handleRollback) runs before - // manageRevision, so no revision for spec B exists at the time of the check. - // findMostRecentMatchingRevision returns nil → timeout check is skipped. + // handleRollback finds the annotated revision → skips timeout. instanceB2 := newRevisionTestOwner("test-dda", "default") instanceB2.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} instanceB2.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} - revListBeforeNewRevision := mustListRevisions(t, r, instanceB2) - require.Len(t, revListBeforeNewRevision, 1, "only rev1 (A) should exist before the new experiment's revision is created") - + revListForNewExp := mustListRevisions(t, r, instanceB2) newStatus2 := &v2alpha1.DatadogAgentStatus{Experiment: instanceB2.Status.Experiment.DeepCopy()} - require.NoError(t, r.handleRollback(context.Background(), instanceB2, newStatus2, metav1.Now(), revListBeforeNewRevision)) + require.NoError(t, r.handleRollback(context.Background(), instanceB2, newStatus2, metav1.Now(), revListForNewExp)) assert.Equal(t, v2alpha1.ExperimentPhaseRunning, newStatus2.Experiment.Phase, "re-applying the same spec after rollback must not immediately time out") + + // ensureRevision recreates the annotated revision with a fresh timestamp. + _, err := r.ensureRevision(context.Background(), instanceB2, mustListRevisions(t, r, instanceB2), false) + require.NoError(t, err) + + finalRevs := mustListRevisions(t, r, instanceB2) + for _, rev := range finalRevs { + assert.Empty(t, rev.Annotations[annotationExperimentRollback], + "rollback annotation should be cleared after recreate") + } +} + +// TestRestorePreviousSpec_ThreeRevisions_AnnotatesOnlyHighest verifies that +// when 3+ revisions exist (e.g. GC failed on a prior reconcile), only the +// highest-numbered revision (the experiment) is annotated — not older baselines. +func TestRestorePreviousSpec_ThreeRevisions_AnnotatesOnlyHighest(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Build 3 revisions using ensureRevision directly (bypasses GC). + instanceA := newRevisionTestOwner("test-dda", "default") + rev1Name, err := r.ensureRevision(context.Background(), instanceA, nil, false) + require.NoError(t, err) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + rev2Name, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + experimentSite := "datadoghq.eu" + instanceC := newRevisionTestOwner("test-dda", "default") + instanceC.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &experimentSite}} + rev3Name, err := r.ensureRevision(context.Background(), instanceC, mustListRevisions(t, r, instanceC), false) + require.NoError(t, err) + + revList := mustListRevisions(t, r, instanceA) + require.Len(t, revList, 3, "need 3 revisions to test this scenario") + + // rollback fetches the current DDA; create it with the experiment spec. + require.NoError(t, c.Create(context.Background(), instanceC)) + + // Trigger rollback via phase=stopped. + instanceC.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped} + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceC.Status.Experiment.DeepCopy()} + require.NoError(t, r.restorePreviousSpec(context.Background(), instanceC.ObjectMeta, newStatus, revList, v2alpha1.ExperimentPhaseRollback)) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase) + + // Verify: only rev3 (experiment, highest) is annotated. + // Rev1 (old baseline) and rev2 (rollback target) must NOT be annotated. + for _, rev := range mustListRevisions(t, r, instanceA) { + hasAnnotation := rev.Annotations[annotationExperimentRollback] == "true" + switch rev.Name { + case rev3Name: + assert.True(t, hasAnnotation, "rev3 (experiment, highest) should be annotated") + case rev2Name: + assert.False(t, hasAnnotation, "rev2 (rollback target) should NOT be annotated") + case rev1Name: + assert.False(t, hasAnnotation, "rev1 (old baseline) should NOT be annotated") + } + } +} + +// TestAbortExperiment_ThreeRevisions_AnnotatesOnlyHighest verifies that when +// 3+ revisions exist and abort fires, only the highest-numbered revision (the +// experiment) is annotated — not older baselines. +func TestAbortExperiment_ThreeRevisions_AnnotatesOnlyHighest(t *testing.T) { + r, _ := newRevisionTestReconciler(t) + + // Build 3 revisions using ensureRevision directly (bypasses GC). + instanceA := newRevisionTestOwner("test-dda", "default") + rev1Name, err := r.ensureRevision(context.Background(), instanceA, nil, false) + require.NoError(t, err) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + rev2Name, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + experimentSite := "datadoghq.eu" + instanceC := newRevisionTestOwner("test-dda", "default") + instanceC.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &experimentSite}} + rev3Name, err := r.ensureRevision(context.Background(), instanceC, mustListRevisions(t, r, instanceC), false) + require.NoError(t, err) + + revList := mustListRevisions(t, r, instanceA) + require.Len(t, revList, 3) + + // Set recent timestamps so timeout doesn't fire first. + for i := range revList { + revList[i].CreationTimestamp = metav1.Now() + } + + // Simulate manual spec change (specD) — doesn't match any revision. + manualSite := "manual-change.example.com" + instanceD := newRevisionTestOwner("test-dda", "default") + instanceD.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &manualSite}} + instanceD.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceD.Status.Experiment.DeepCopy()} + r.abortExperiment(context.Background(), instanceD, instanceD.Status.Experiment, newStatus, revList) + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, newStatus.Experiment.Phase) + + // Verify: only rev3 (experiment, highest) is annotated. + for _, rev := range mustListRevisions(t, r, instanceA) { + hasAnnotation := rev.Annotations[annotationExperimentRollback] == "true" + switch rev.Name { + case rev3Name: + assert.True(t, hasAnnotation, "rev3 (experiment, highest) should be annotated") + case rev2Name: + assert.False(t, hasAnnotation, "rev2 should NOT be annotated") + case rev1Name: + assert.False(t, hasAnnotation, "rev1 (old baseline) should NOT be annotated") + } + } } func TestHandleRollback_Timeout(t *testing.T) { diff --git a/internal/controller/datadogagent/revision.go b/internal/controller/datadogagent/revision.go index 1ec5abe75c..fbd02bd9ad 100644 --- a/internal/controller/datadogagent/revision.go +++ b/internal/controller/datadogagent/revision.go @@ -15,6 +15,7 @@ import ( appsv1 "k8s.io/api/apps/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -51,7 +52,7 @@ func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.Data if err != nil { return err } - if err := r.gcOldRevisions(ctx, instance, revName, revList); err != nil { + if err := r.gcOldRevisions(ctx, revName, revList); err != nil { ctrl.LoggerFrom(ctx).Error(err, "Failed to garbage collect old ControllerRevisions, will retry on next reconcile") } return nil @@ -107,6 +108,11 @@ func (r *Reconciler) ensureRevision( return "", fmt.Errorf("failed to get GVK for owner: %w", err) } + data := runtime.RawExtension{Raw: specBytes} + labels := map[string]string{ + apicommon.DatadogAgentNameLabelKey: instance.GetName(), + } + // Find any existing revision with identical data, and track the max Revision. var matchingRev *appsv1.ControllerRevision maxRevision := int64(0) @@ -121,6 +127,16 @@ func (r *Reconciler) ensureRevision( } if matchingRev != nil { + objLogger := logger.WithValues( + "object.kind", "ControllerRevision", + "object.namespace", matchingRev.Namespace, + "object.name", matchingRev.Name, + ) + + if matchingRev.Annotations[annotationExperimentRollback] == "true" && !skipBump { + return r.recreateRevision(ctx, matchingRev, instance, gvks[0], labels, data, maxRevision) + } + // Identical content already snapshotted. Bump Revision to max+1 if it // has been superseded (e.g. after a revert) so ordering stays correct. // Skip the bump during experiment rollback: bumping the pre-experiment @@ -128,11 +144,6 @@ func (r *Reconciler) ensureRevision( // to select the experiment revision as the rollback target on the next // stopped signal, reversing the rollback. if matchingRev.Revision < maxRevision && !skipBump { - objLogger := logger.WithValues( - "object.kind", "ControllerRevision", - "object.namespace", matchingRev.Namespace, - "object.name", matchingRev.Name, - ) objLogger.Info("Bumping ControllerRevision to latest") patch := fmt.Appendf(nil, `{"revision":%d}`, maxRevision+1) if err := r.client.Patch(ctx, matchingRev, client.RawPatch(types.MergePatchType, patch)); err != nil && !apierrors.IsConflict(err) { @@ -143,10 +154,6 @@ func (r *Reconciler) ensureRevision( } nextRevision := maxRevision + 1 - data := runtime.RawExtension{Raw: specBytes} - labels := map[string]string{ - apicommon.DatadogAgentNameLabelKey: instance.GetName(), - } rev := controllerrevisions.NewControllerRevision(instance, gvks[0], labels, data, nextRevision, nil) // Check for a name conflict before creating. @@ -179,6 +186,46 @@ func (r *Reconciler) ensureRevision( return rev.Name, nil } +// recreateRevision deletes a rolled-back ControllerRevision and creates a +// fresh one with the same content but a new CreationTimestamp. This prevents +// an immediate timeout when the same experiment spec is re-applied, since +// CreationTimestamp is immutable in Kubernetes. +// +// Failure recovery: +// - Delete fails: error returned, next reconcile retries. +// - Delete succeeds, Create fails (or operator crashes): the revision is +// gone, so the next reconcile's ensureRevision takes the normal "no +// matching revision" path and creates a fresh one. +func (r *Reconciler) recreateRevision( + ctx context.Context, + old *appsv1.ControllerRevision, + instance *v2alpha1.DatadogAgent, + gvk schema.GroupVersionKind, + labels map[string]string, + data runtime.RawExtension, + maxRevision int64, +) (string, error) { + logger := ctrl.LoggerFrom(ctx).WithValues( + "object.kind", "ControllerRevision", + "object.namespace", old.Namespace, + "object.name", old.Name, + ) + logger.Info("Recreating rolled-back ControllerRevision with fresh timestamp") + + if err := r.client.Delete(ctx, old); err != nil && !apierrors.IsNotFound(err) { + return "", fmt.Errorf("failed to delete rolled-back ControllerRevision %s: %w", old.Name, err) + } + + fresh := controllerrevisions.NewControllerRevision(instance, gvk, labels, data, maxRevision+1, nil) + if err := r.client.Create(ctx, fresh); err != nil { + if apierrors.IsAlreadyExists(err) { + return fresh.Name, nil + } + return "", fmt.Errorf("failed to recreate ControllerRevision %s: %w", fresh.Name, err) + } + return fresh.Name, nil +} + // datadogAnnotations returns a copy of annotations filtered to only those // with `.datadoghq.com/` in the key, which are used for preview features. func datadogAnnotations(all map[string]string) map[string]string { @@ -195,41 +242,27 @@ func datadogAnnotations(all map[string]string) map[string]string { } // gcOldRevisions deletes all but the two most recent ControllerRevisions: -// the current and previous. -// -// Exception: after a rejected experiment (rollback, timeout, or abort) the -// experiment revision carries a stale CreationTimestamp. If that revision were -// kept and the same spec were re-applied as a new experiment, -// findMostRecentMatchingRevision would find it, compute an elapsed time based -// on the old timestamp, and immediately time out. To prevent this, all -// non-current revisions are deleted once the rejected phase is persisted in -// instance.Status. The check uses the persisted status (not the in-flight -// newStatus) so that the revision survives any status-write conflicts during -// the rollback itself — the deletion is intentionally deferred to the next -// reconcile cycle after the terminal phase is confirmed. +// the current and previous. Stale experiment revisions (marked with the +// rollback annotation) are kept here — they are handled by ensureRevision +// which recreates them with a fresh timestamp when the same spec is re-applied. func (r *Reconciler) gcOldRevisions( ctx context.Context, - instance *v2alpha1.DatadogAgent, current string, revList []appsv1.ControllerRevision, ) error { logger := ctrl.LoggerFrom(ctx) // Identify the most recent non-current revision to keep as previous. - // Skip this when the persisted experiment phase is a rejected terminal - // phase — in that case we want to delete the stale experiment revision. previous := "" - if !isRejectedExperimentPhase(instance) { - previousRevision := int64(-1) - for i := range revList { - rev := &revList[i] - if rev.Name == current { - continue - } - if rev.Revision > previousRevision { - previousRevision = rev.Revision - previous = rev.Name - } + previousRevision := int64(-1) + for i := range revList { + rev := &revList[i] + if rev.Name == current { + continue + } + if rev.Revision > previousRevision { + previousRevision = rev.Revision + previous = rev.Name } } @@ -251,18 +284,3 @@ func (r *Reconciler) gcOldRevisions( return nil } - -// isRejectedExperimentPhase returns true when the persisted experiment phase -// indicates the experiment spec was rejected — rolled back, timed out, or -// aborted. These are the cases where keeping the experiment revision would -// cause an immediate timeout if the same spec were re-applied. -func isRejectedExperimentPhase(instance *v2alpha1.DatadogAgent) bool { - if instance.Status.Experiment == nil { - return false - } - switch instance.Status.Experiment.Phase { - case v2alpha1.ExperimentPhaseRollback, v2alpha1.ExperimentPhaseTimeout, v2alpha1.ExperimentPhaseAborted: - return true - } - return false -} diff --git a/internal/controller/datadogagent/revision_test.go b/internal/controller/datadogagent/revision_test.go index 2689f6db70..2c6140edf9 100644 --- a/internal/controller/datadogagent/revision_test.go +++ b/internal/controller/datadogagent/revision_test.go @@ -174,7 +174,7 @@ func TestGCOldRevisions_KeepsCurrentAndPrevious(t *testing.T) { names[i] = name } - err := r.gcOldRevisions(context.Background(), instances[2], names[2], mustListRevisions(t, r, instances[2])) + err := r.gcOldRevisions(context.Background(), names[2], mustListRevisions(t, r, instances[2])) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -223,7 +223,7 @@ func TestGCOldRevisions_KeepsTwoRevisions(t *testing.T) { name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) - err = r.gcOldRevisions(context.Background(), instanceB, name2, mustListRevisions(t, r, instanceB)) + err = r.gcOldRevisions(context.Background(), name2, mustListRevisions(t, r, instanceB)) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -268,7 +268,7 @@ func TestGCOldRevisions_NoPreviousWhenOnlyCurrent(t *testing.T) { revName, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) - err = r.gcOldRevisions(context.Background(), instance, revName, mustListRevisions(t, r, instance)) + err = r.gcOldRevisions(context.Background(), revName, mustListRevisions(t, r, instance)) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -291,7 +291,7 @@ func TestGCOldRevisions_DeletesMultipleOld(t *testing.T) { } current := newRevisionTestOwner("test-dda", "default") - err := r.gcOldRevisions(context.Background(), current, names[4], mustListRevisions(t, r, current)) + err := r.gcOldRevisions(context.Background(), names[4], mustListRevisions(t, r, current)) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -408,81 +408,89 @@ func TestManageRevision_PreviousDeletedContinuesNormally(t *testing.T) { assert.Equal(t, int64(2), revList[0].Revision) } -// TestGCOldRevisions_DeletesPreviousAfterRejectedExperiment verifies that when -// the persisted experiment phase is a rejected terminal phase (Rollback, Timeout, -// or Aborted), gcOldRevisions deletes the stale experiment revision instead of -// keeping it as "previous". This prevents an immediate timeout if the same spec -// is re-applied as a new experiment. -func TestGCOldRevisions_DeletesPreviousAfterRejectedExperiment(t *testing.T) { - rejectedPhases := []v2alpha1.ExperimentPhase{ - v2alpha1.ExperimentPhaseRollback, - v2alpha1.ExperimentPhaseTimeout, - v2alpha1.ExperimentPhaseAborted, - } +// TestEnsureRevision_RecreatesRolledBackRevision verifies that when a revision +// has the experiment-rollback annotation, ensureRevision deletes and recreates +// it to get a fresh CreationTimestamp. This prevents an immediate timeout when +// the same experiment spec is re-applied after a previous experiment was rejected. +func TestEnsureRevision_RecreatesRolledBackRevision(t *testing.T) { + r, c := newRevisionTestReconciler(t) - for _, phase := range rejectedPhases { - t.Run(string(phase), func(t *testing.T) { - r, c := newRevisionTestReconciler(t) + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - instanceA := newRevisionTestOwner("test-dda", "default") - instanceB := newRevisionTestOwner("test-dda", "default") - instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) - nameA, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) - require.NoError(t, err) - _, err = r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) - require.NoError(t, err) + // Annotate revB as rolled back (simulating what restorePreviousSpec does). + revB := fetchRevisionByName(t, c, "default", nameB) + revB.Annotations = map[string]string{annotationExperimentRollback: "true"} + require.NoError(t, c.Update(context.Background(), revB)) + + // Re-apply the same experiment spec (instanceB). ensureRevision should + // delete+recreate the annotated revision with a fresh timestamp. + // (In real Kubernetes the CreationTimestamp is set server-side on create; + // the fake client doesn't refresh it, so we verify the annotation and + // revision number instead.) + nameB2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + assert.Equal(t, nameB, nameB2, "should reuse same name (same data hash)") - // Simulate the persisted status having a rejected terminal phase. - // current is instanceA (spec restored after rollback). - instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase} + revB2 := fetchRevisionByName(t, c, "default", nameB2) + assert.Empty(t, revB2.Annotations[annotationExperimentRollback], "rollback annotation should be cleared") + assert.Equal(t, int64(3), revB2.Revision, "revision number should be max+1") +} - err = r.gcOldRevisions(context.Background(), instanceA, nameA, mustListRevisions(t, r, instanceA)) - require.NoError(t, err) +// TestEnsureRevision_SkipsRecreateWhenSkipBump verifies that rolled-back +// revisions are NOT recreated when skipBump is true (during experiment rollback). +func TestEnsureRevision_SkipsRecreateWhenSkipBump(t *testing.T) { + r, c := newRevisionTestReconciler(t) - revList := &appsv1.ControllerRevisionList{} - require.NoError(t, c.List(context.Background(), revList)) - assert.Len(t, revList.Items, 1, "experiment revision should be deleted after rejected phase") - assert.Equal(t, nameA, revList.Items[0].Name, "only the current (pre-experiment) revision should remain") - }) - } -} + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} -// TestGCOldRevisions_KeepsPreviousForNonRejectedPhases verifies that for -// non-rejected experiment phases (Running, Promoted, or nil), the normal -// "keep current + previous" behavior is preserved. -func TestGCOldRevisions_KeepsPreviousForNonRejectedPhases(t *testing.T) { - nonRejectedPhases := []v2alpha1.ExperimentPhase{ - v2alpha1.ExperimentPhaseRunning, - v2alpha1.ExperimentPhasePromoted, - "", // nil experiment - } + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) - for _, phase := range nonRejectedPhases { - t.Run(string(phase)+"_or_nil", func(t *testing.T) { - r, c := newRevisionTestReconciler(t) + // Annotate revB as rolled back. + revB := fetchRevisionByName(t, c, "default", nameB) + revB.Annotations = map[string]string{annotationExperimentRollback: "true"} + require.NoError(t, c.Update(context.Background(), revB)) - instanceA := newRevisionTestOwner("test-dda", "default") - instanceB := newRevisionTestOwner("test-dda", "default") - instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + // With skipBump=true, the annotated revision should be returned as-is. + nameB2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), true) + require.NoError(t, err) + assert.Equal(t, nameB, nameB2) - _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) - require.NoError(t, err) - nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) - require.NoError(t, err) + revB2 := fetchRevisionByName(t, c, "default", nameB2) + assert.Equal(t, "true", revB2.Annotations[annotationExperimentRollback], "annotation should still be present") +} - if phase != "" { - instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase} - } +// TestGCOldRevisions_AlwaysKeepsPrevious verifies that GC always keeps the +// current and previous revisions regardless of experiment phase. +func TestGCOldRevisions_AlwaysKeepsPrevious(t *testing.T) { + r, c := newRevisionTestReconciler(t) - err = r.gcOldRevisions(context.Background(), instanceB, nameB, mustListRevisions(t, r, instanceB)) - require.NoError(t, err) + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - revList := &appsv1.ControllerRevisionList{} - require.NoError(t, c.List(context.Background(), revList)) - assert.Len(t, revList.Items, 2, "current and previous should both be kept for non-rejected phase") - }) - } + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + err = r.gcOldRevisions(context.Background(), nameB, mustListRevisions(t, r, instanceB)) + require.NoError(t, err) + + revList := &appsv1.ControllerRevisionList{} + require.NoError(t, c.List(context.Background(), revList)) + assert.Len(t, revList.Items, 2, "current and previous should both be kept") } // TestListRevisions_ExcludesForeignOwner verifies that a ControllerRevision From bc752d323f8102553774a979d000df20f04bb8fb Mon Sep 17 00:00:00 2001 From: Paul Coignet Date: Tue, 14 Apr 2026 14:21:09 +0200 Subject: [PATCH 09/15] Add package state pop --- pkg/fleet/daemon.go | 47 +++++++++++++++- pkg/fleet/daemon_test.go | 108 ++++++++++++++++++++++++++++++++++++ pkg/remoteconfig/updater.go | 2 + 3 files changed, 156 insertions(+), 1 deletion(-) diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 05edda8475..f86cc405ea 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -10,6 +10,7 @@ import ( "fmt" "sync" + pbgo "github.com/DataDog/datadog-agent/pkg/proto/pbgo/core" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" @@ -62,7 +63,14 @@ func (d *Daemon) Start(ctx context.Context) error { return d.handleConfigs(ctx, configs) })) d.rcClient.Subscribe(state.ProductUpdaterTask, handleUpdaterTaskUpdate(ctx, func(req remoteAPIRequest) error { - return d.handleRemoteAPIRequest(ctx, req) + d.setTaskState(req.Package, req.ID, pbgo.TaskState_RUNNING, nil) + err := d.handleRemoteAPIRequest(ctx, req) + if err != nil { + d.setTaskState(req.Package, req.ID, pbgo.TaskState_ERROR, err) + } else { + d.setTaskState(req.Package, req.ID, pbgo.TaskState_DONE, nil) + } + return err })) <-ctx.Done() @@ -276,6 +284,43 @@ func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAP return nil } +// setTaskState updates the Task field of the package entry in the RC installer state. +// If the package is not yet present in the state, a new entry is added. +// This is a no-op when rcClient is nil (e.g. in unit tests that construct Daemon directly). +func (d *Daemon) setTaskState(pkgName, taskID string, taskState pbgo.TaskState, taskErr error) { + if d.rcClient == nil { + return + } + task := &pbgo.PackageStateTask{ + Id: taskID, + State: taskState, + } + if taskErr != nil { + task.Error = &pbgo.TaskError{Message: taskErr.Error()} + } + + current := d.rcClient.GetInstallerState() + updated := make([]*pbgo.PackageState, 0, len(current)+1) + found := false + for _, pkg := range current { + if pkg.Package == pkgName { + cloned := *pkg + cloned.Task = task + updated = append(updated, &cloned) + found = true + } else { + updated = append(updated, pkg) + } + } + if !found { + updated = append(updated, &pbgo.PackageState{ + Package: pkgName, + Task: task, + }) + } + d.rcClient.SetInstallerState(updated) +} + // getExperimentPhase returns the current experiment phase from a DDA's status, // or empty string if no experiment is active. func getExperimentPhase(dda *v2alpha1.DatadogAgent) v2alpha1.ExperimentPhase { diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go index e07156b71d..d083999a9a 100644 --- a/pkg/fleet/daemon_test.go +++ b/pkg/fleet/daemon_test.go @@ -8,8 +8,11 @@ package fleet import ( "context" "encoding/json" + "errors" "testing" + pbgo "github.com/DataDog/datadog-agent/pkg/proto/pbgo/core" + "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -553,3 +556,108 @@ func TestCanPromote_Aborted(t *testing.T) { assert.False(t, isNoOp) assert.Error(t, err) } + +// --- mockRCClient --- + +// mockRCClient is a minimal RCClient used to test package state updates. +type mockRCClient struct { + state []*pbgo.PackageState +} + +func (m *mockRCClient) Subscribe(_ string, _ func(map[string]state.RawConfig, func(string, state.ApplyStatus))) { +} + +func (m *mockRCClient) GetInstallerState() []*pbgo.PackageState { + return m.state +} + +func (m *mockRCClient) SetInstallerState(packages []*pbgo.PackageState) { + m.state = packages +} + +func testDaemonWithRC(initialState []*pbgo.PackageState) (*Daemon, *mockRCClient) { + rc := &mockRCClient{state: initialState} + d := &Daemon{ + rcClient: rc, + configs: make(map[string]installerConfig), + } + return d, rc +} + +// --- setTaskState tests --- + +func TestSetTaskState_Running(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableVersion: "1.0.0"}, + }) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_RUNNING, nil) + + require.Len(t, rc.state, 1) + pkg := rc.state[0] + assert.Equal(t, "datadog-operator", pkg.Package) + assert.Equal(t, "1.0.0", pkg.StableVersion) + require.NotNil(t, pkg.Task) + assert.Equal(t, "task-1", pkg.Task.Id) + assert.Equal(t, pbgo.TaskState_RUNNING, pkg.Task.State) + assert.Nil(t, pkg.Task.Error) +} + +func TestSetTaskState_Done(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableVersion: "1.0.0"}, + }) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_DONE, nil) + + require.Len(t, rc.state, 1) + require.NotNil(t, rc.state[0].Task) + assert.Equal(t, pbgo.TaskState_DONE, rc.state[0].Task.State) + assert.Nil(t, rc.state[0].Task.Error) +} + +func TestSetTaskState_Error(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableVersion: "1.0.0"}, + }) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_ERROR, errors.New("something went wrong")) + + require.Len(t, rc.state, 1) + require.NotNil(t, rc.state[0].Task) + assert.Equal(t, pbgo.TaskState_ERROR, rc.state[0].Task.State) + require.NotNil(t, rc.state[0].Task.Error) + assert.Equal(t, "something went wrong", rc.state[0].Task.Error.Message) +} + +func TestSetTaskState_PackageNotInState(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{}) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_RUNNING, nil) + + require.Len(t, rc.state, 1) + assert.Equal(t, "datadog-operator", rc.state[0].Package) + require.NotNil(t, rc.state[0].Task) + assert.Equal(t, pbgo.TaskState_RUNNING, rc.state[0].Task.State) +} + +func TestSetTaskState_PreservesOtherPackages(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "other-package", StableVersion: "2.0.0"}, + {Package: "datadog-operator", StableVersion: "1.0.0"}, + }) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_DONE, nil) + + require.Len(t, rc.state, 2) + // other-package must be unchanged + assert.Equal(t, "other-package", rc.state[0].Package) + assert.Nil(t, rc.state[0].Task) + // datadog-operator has task set + assert.Equal(t, "datadog-operator", rc.state[1].Package) + require.NotNil(t, rc.state[1].Task) + assert.Equal(t, pbgo.TaskState_DONE, rc.state[1].Task.State) +} + +func TestSetTaskState_NilClient(t *testing.T) { + d := &Daemon{} + // Must not panic when rcClient is nil. + assert.NotPanics(t, func() { + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_RUNNING, nil) + }) +} diff --git a/pkg/remoteconfig/updater.go b/pkg/remoteconfig/updater.go index 767022b4d8..3374dee90a 100644 --- a/pkg/remoteconfig/updater.go +++ b/pkg/remoteconfig/updater.go @@ -62,6 +62,8 @@ type RcServiceConfiguration struct { // RCClient is the interface for subscribing to RC product updates. type RCClient interface { Subscribe(product string, fn func(update map[string]state.RawConfig, applyStateCallback func(string, state.ApplyStatus))) + GetInstallerState() []*pbgo.PackageState + SetInstallerState(packages []*pbgo.PackageState) } // Client returns the underlying RC client. From 1d84dc23a32d0a4bb05345f068c6a4784d3d8a38 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:14:46 -0400 Subject: [PATCH 10/15] Fix lint --- pkg/fleet/daemon.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index f86cc405ea..16ab2db868 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -303,10 +303,15 @@ func (d *Daemon) setTaskState(pkgName, taskID string, taskState pbgo.TaskState, updated := make([]*pbgo.PackageState, 0, len(current)+1) found := false for _, pkg := range current { - if pkg.Package == pkgName { - cloned := *pkg - cloned.Task = task - updated = append(updated, &cloned) + if pkg.GetPackage() == pkgName { + updated = append(updated, &pbgo.PackageState{ + Package: pkg.GetPackage(), + StableVersion: pkg.GetStableVersion(), + ExperimentVersion: pkg.GetExperimentVersion(), + Task: task, + StableConfigVersion: pkg.GetStableConfigVersion(), + ExperimentConfigVersion: pkg.GetExperimentConfigVersion(), + }) found = true } else { updated = append(updated, pkg) From acb73f4cf7926764c74307a45fade28ca14d355d Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Tue, 14 Apr 2026 19:01:56 -0400 Subject: [PATCH 11/15] package state --- pkg/fleet/daemon.go | 109 +++++++++++++++++++++++++++++++++++++++ pkg/fleet/daemon_test.go | 65 ++++++++++++++++++----- 2 files changed, 162 insertions(+), 12 deletions(-) diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 16ab2db868..262a9b51ce 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -185,6 +185,18 @@ func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIR } if err := canStart(getExperimentPhase(dda)); err != nil { + // If an experiment is already running, treat repeat start signals as idempotent. + // The backend retries with new task UUIDs until it sees an ack, and in-memory + // state (ExperimentConfigVersion) doesn't survive operator restarts, so we + // cannot reliably match on config version. The backend is responsible for not + // sending a start for a different experiment while one is already running. + if getExperimentPhase(dda) == v2alpha1.ExperimentPhaseRunning { + logger.Info("Experiment already running, acknowledging start signal as idempotent", "experimentID", dda.Status.Experiment.ID) + // Restore the experiment config version — it may have been lost on restart. + stable, _ := d.getPackageConfigVersions(req.Package) + d.setPackageConfigVersions(req.Package, stable, req.Params.Version) + return nil + } return fmt.Errorf("start DatadogAgent experiment: %w", err) } @@ -210,6 +222,10 @@ func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIR return fmt.Errorf("start DatadogAgent experiment: failed to update status: %w", err) } + // Report the experiment config version to the backend. + stable, _ := d.getPackageConfigVersions(req.Package) + d.setPackageConfigVersions(req.Package, stable, req.Params.Version) + logger.Info("Started DatadogAgent experiment") return nil } @@ -245,6 +261,10 @@ func (d *Daemon) stopDatadogAgentExperiment(ctx context.Context, req remoteAPIRe return fmt.Errorf("stop DatadogAgent experiment: failed to update status: %w", err) } + // Clear the experiment config version. + stable, _ := d.getPackageConfigVersions(req.Package) + d.setPackageConfigVersions(req.Package, stable, "") + logger.Info("Stopped DatadogAgent experiment") return nil } @@ -258,6 +278,12 @@ func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAP ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name)) logger := ctrl.LoggerFrom(ctx) + // Verify there is an active experiment to promote. + _, experiment := d.getPackageConfigVersions(req.Package) + if experiment == "" { + return fmt.Errorf("promote DatadogAgent experiment: no experiment config version set") + } + dda := &v2alpha1.DatadogAgent{} if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { return fmt.Errorf("promote DatadogAgent experiment: failed to get DatadogAgent %s: %w", op.NamespacedName, err) @@ -280,6 +306,9 @@ func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAP return fmt.Errorf("promote DatadogAgent experiment: failed to update status: %w", err) } + // Promote: stable = old experiment, experiment = "" + d.setPackageConfigVersions(req.Package, experiment, "") + logger.Info("Promoted DatadogAgent experiment") return nil } @@ -324,6 +353,86 @@ func (d *Daemon) setTaskState(pkgName, taskID string, taskState pbgo.TaskState, }) } d.rcClient.SetInstallerState(updated) + d.logInstallerState("setTaskState") +} + +// getPackageConfigVersions returns the current stable and experiment config versions +// for the given package from the RC installer state. +func (d *Daemon) getPackageConfigVersions(pkgName string) (stable, experiment string) { + if d.rcClient == nil { + return "", "" + } + for _, pkg := range d.rcClient.GetInstallerState() { + if pkg.GetPackage() == pkgName { + return pkg.GetStableConfigVersion(), pkg.GetExperimentConfigVersion() + } + } + return "", "" +} + +// setPackageConfigVersions updates the stable and experiment version fields of the +// package entry in the RC installer state. Both the config variants +// (StableConfigVersion/ExperimentConfigVersion) and the package variants +// (StableVersion/ExperimentVersion) are set to the same values. +func (d *Daemon) setPackageConfigVersions(pkgName, stable, experiment string) { + if d.rcClient == nil { + return + } + current := d.rcClient.GetInstallerState() + updated := make([]*pbgo.PackageState, 0, len(current)+1) + found := false + for _, pkg := range current { + if pkg.GetPackage() == pkgName { + updated = append(updated, &pbgo.PackageState{ + Package: pkg.GetPackage(), + StableVersion: stable, + ExperimentVersion: experiment, + Task: pkg.GetTask(), + StableConfigVersion: stable, + ExperimentConfigVersion: experiment, + }) + found = true + } else { + updated = append(updated, pkg) + } + } + if !found { + updated = append(updated, &pbgo.PackageState{ + Package: pkgName, + StableVersion: stable, + ExperimentVersion: experiment, + StableConfigVersion: stable, + ExperimentConfigVersion: experiment, + }) + } + d.rcClient.SetInstallerState(updated) + d.logInstallerState("setPackageConfigVersions") +} + +// logInstallerState logs the full installer state for debugging. +func (d *Daemon) logInstallerState(caller string) { + if d.rcClient == nil { + return + } + logger := ctrl.Log.WithName("fleet-daemon") + for _, pkg := range d.rcClient.GetInstallerState() { + var taskID string + var taskState pbgo.TaskState + if pkg.GetTask() != nil { + taskID = pkg.GetTask().GetId() + taskState = pkg.GetTask().GetState() + } + logger.Info("Installer state", + "caller", caller, + "package", pkg.GetPackage(), + "stableVersion", pkg.GetStableVersion(), + "experimentVersion", pkg.GetExperimentVersion(), + "stableConfigVersion", pkg.GetStableConfigVersion(), + "experimentConfigVersion", pkg.GetExperimentConfigVersion(), + "taskID", taskID, + "taskState", taskState, + ) + } } // getExperimentPhase returns the current experiment phase from a DDA's status, diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go index d083999a9a..c1eef377c2 100644 --- a/pkg/fleet/daemon_test.go +++ b/pkg/fleet/daemon_test.go @@ -86,9 +86,10 @@ func testInstallerConfigWithDDA() map[string]installerConfig { func testStartRequest() remoteAPIRequest { return remoteAPIRequest{ - ID: "exp-abc", - Method: methodStartDatadogAgentExperiment, - Params: experimentParams{Version: "test-config"}, + ID: "exp-abc", + Package: "datadog-operator", + Method: methodStartDatadogAgentExperiment, + Params: experimentParams{Version: "test-config"}, } } @@ -126,9 +127,20 @@ func TestStartDatadogAgentExperiment_DDANotFound(t *testing.T) { assert.Error(t, d.startDatadogAgentExperiment(context.Background(), testStartRequest())) } -func TestStartDatadogAgentExperiment_Running(t *testing.T) { - d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) - assert.Error(t, d.startDatadogAgentExperiment(context.Background(), testStartRequest())) +func TestStartDatadogAgentExperiment_Running_Idempotent(t *testing.T) { + dda := testDDAObject(v2alpha1.ExperimentPhaseRunning) + d, c := testDaemon(dda, testInstallerConfigWithDDA()) + // Backend retries with a new task UUID; should be treated as idempotent. + req := testStartRequest() + req.ID = "retry-task-uuid" + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + // DDA should be unchanged — no re-patch, no status update. + got := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, got)) + require.NotNil(t, got.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, got.Status.Experiment.Phase) + assert.Equal(t, "old-exp", got.Status.Experiment.ID) } func TestStartDatadogAgentExperiment_Stopped(t *testing.T) { @@ -255,9 +267,10 @@ func TestStartDatadogAgentExperiment_VersionMismatch(t *testing.T) { func testStopRequest() remoteAPIRequest { return remoteAPIRequest{ - ID: "exp-abc", - Method: methodStopDatadogAgentExperiment, - Params: experimentParams{Version: "test-config"}, + ID: "exp-abc", + Package: "datadog-operator", + Method: methodStopDatadogAgentExperiment, + Params: experimentParams{Version: "test-config"}, } } @@ -327,9 +340,10 @@ func TestStopDatadogAgentExperiment_Success_Running(t *testing.T) { func testPromoteRequest() remoteAPIRequest { return remoteAPIRequest{ - ID: "exp-abc", - Method: methodPromoteDatadogAgentExperiment, - Params: experimentParams{Version: "test-config"}, + ID: "exp-abc", + Package: "datadog-operator", + Method: methodPromoteDatadogAgentExperiment, + Params: experimentParams{Version: "test-config"}, } } @@ -355,8 +369,19 @@ func TestPromoteDatadogAgentExperiment_Aborted(t *testing.T) { assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) } +func TestPromoteDatadogAgentExperiment_NoExperimentVersion(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1"}, + }} + assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) +} + func TestPromoteDatadogAgentExperiment_NoOp_Rollback(t *testing.T) { d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRollback), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1", ExperimentConfigVersion: "exp-1"}, + }} require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) dda := &v2alpha1.DatadogAgent{} @@ -366,6 +391,9 @@ func TestPromoteDatadogAgentExperiment_NoOp_Rollback(t *testing.T) { func TestPromoteDatadogAgentExperiment_NoOp_Timeout(t *testing.T) { d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseTimeout), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1", ExperimentConfigVersion: "exp-1"}, + }} require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) dda := &v2alpha1.DatadogAgent{} @@ -375,6 +403,9 @@ func TestPromoteDatadogAgentExperiment_NoOp_Timeout(t *testing.T) { func TestPromoteDatadogAgentExperiment_NoOp_Promoted(t *testing.T) { d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhasePromoted), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1", ExperimentConfigVersion: "exp-1"}, + }} require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) dda := &v2alpha1.DatadogAgent{} @@ -384,6 +415,10 @@ func TestPromoteDatadogAgentExperiment_NoOp_Promoted(t *testing.T) { func TestPromoteDatadogAgentExperiment_Success_Running(t *testing.T) { d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + rc := &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1", ExperimentConfigVersion: "exp-1"}, + }} + d.rcClient = rc require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) dda := &v2alpha1.DatadogAgent{} @@ -392,6 +427,12 @@ func TestPromoteDatadogAgentExperiment_Success_Running(t *testing.T) { assert.Equal(t, v2alpha1.ExperimentPhasePromoted, dda.Status.Experiment.Phase) // ID must be preserved. assert.Equal(t, "old-exp", dda.Status.Experiment.ID) + // Stable should now be the old experiment, experiment should be cleared. + require.Len(t, rc.state, 1) + assert.Equal(t, "exp-1", rc.state[0].StableVersion) + assert.Equal(t, "", rc.state[0].ExperimentVersion) + assert.Equal(t, "exp-1", rc.state[0].StableConfigVersion) + assert.Equal(t, "", rc.state[0].ExperimentConfigVersion) } // --- validateOperation tests --- From c7fffcd5c8efae88f079ca74cb84b2faf1ff460e Mon Sep 17 00:00:00 2001 From: Paul Coignet Date: Wed, 15 Apr 2026 11:19:44 +0200 Subject: [PATCH 12/15] Check expected state --- pkg/fleet/daemon.go | 39 ++++++++++++++++++++++- pkg/fleet/daemon_test.go | 68 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 262a9b51ce..0ac98f95c1 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -7,6 +7,7 @@ package fleet import ( "context" + "errors" "fmt" "sync" @@ -21,6 +22,14 @@ import ( "github.com/DataDog/datadog-operator/pkg/remoteconfig" ) +// errStateDoesntMatch is returned by verifyExpectedState when the task's expected +// state doesn't match the operator's current installer state. +type errStateDoesntMatch struct { + msg string +} + +func (e *errStateDoesntMatch) Error() string { return e.msg } + const ( methodStartDatadogAgentExperiment = "operator/start_datadogagent_experiment" methodStopDatadogAgentExperiment = "operator/stop_datadogagent_experiment" @@ -66,7 +75,12 @@ func (d *Daemon) Start(ctx context.Context) error { d.setTaskState(req.Package, req.ID, pbgo.TaskState_RUNNING, nil) err := d.handleRemoteAPIRequest(ctx, req) if err != nil { - d.setTaskState(req.Package, req.ID, pbgo.TaskState_ERROR, err) + var stateErr *errStateDoesntMatch + if errors.As(err, &stateErr) { + d.setTaskState(req.Package, req.ID, pbgo.TaskState_INVALID_STATE, err) + } else { + d.setTaskState(req.Package, req.ID, pbgo.TaskState_ERROR, err) + } } else { d.setTaskState(req.Package, req.ID, pbgo.TaskState_DONE, nil) } @@ -112,6 +126,24 @@ func (d *Daemon) getConfig(id string) (installerConfig, error) { return cfg, nil } +// verifyExpectedState checks that the config versions in the task's expected_state +// match the operator's current installer state for the given package. +// Returns *errStateDoesntMatch when they don't match. +func (d *Daemon) verifyExpectedState(req remoteAPIRequest) error { + stable, experiment := d.getPackageConfigVersions(req.Package) + if req.ExpectedState.StableConfig != stable || req.ExpectedState.ExperimentConfig != experiment { + return &errStateDoesntMatch{ + msg: fmt.Sprintf( + "state mismatch for package %s: expected stable_config=%q experiment_config=%q, got stable_config=%q experiment_config=%q", + req.Package, + req.ExpectedState.StableConfig, req.ExpectedState.ExperimentConfig, + stable, experiment, + ), + } + } + return nil +} + // handleRemoteAPIRequest dispatches the incoming task to the appropriate handler. func (d *Daemon) handleRemoteAPIRequest(ctx context.Context, req remoteAPIRequest) error { logger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "package", req.Package, "method", req.Method) @@ -121,6 +153,11 @@ func (d *Daemon) handleRemoteAPIRequest(ctx context.Context, req remoteAPIReques return fmt.Errorf("experiment signals require the CreateControllerRevisions and DatadogAgentInternal feature gates") } + if err := d.verifyExpectedState(req); err != nil { + logger.Error(err, "Expected state mismatch") + return err + } + switch req.Method { case methodStartDatadogAgentExperiment: return d.startDatadogAgentExperiment(ctx, req) diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go index c1eef377c2..144b5b7472 100644 --- a/pkg/fleet/daemon_test.go +++ b/pkg/fleet/daemon_test.go @@ -435,6 +435,74 @@ func TestPromoteDatadogAgentExperiment_Success_Running(t *testing.T) { assert.Equal(t, "", rc.state[0].ExperimentConfigVersion) } +// --- verifyExpectedState tests --- + +func TestVerifyExpectedState_Match(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "1.0.0", ExperimentConfigVersion: "2.0.0"}, + }) + d.rcClient = rc + req := remoteAPIRequest{ + Package: "datadog-operator", + ExpectedState: expectedState{StableConfig: "1.0.0", ExperimentConfig: "2.0.0"}, + } + assert.NoError(t, d.verifyExpectedState(req)) +} + +func TestVerifyExpectedState_StableMismatch(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "1.0.0", ExperimentConfigVersion: ""}, + }) + d.rcClient = rc + req := remoteAPIRequest{ + Package: "datadog-operator", + ExpectedState: expectedState{StableConfig: "wrong", ExperimentConfig: ""}, + } + err := d.verifyExpectedState(req) + require.Error(t, err) + var stateErr *errStateDoesntMatch + assert.True(t, errors.As(err, &stateErr)) +} + +func TestVerifyExpectedState_ExperimentMismatch(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "1.0.0", ExperimentConfigVersion: "2.0.0"}, + }) + d.rcClient = rc + req := remoteAPIRequest{ + Package: "datadog-operator", + ExpectedState: expectedState{StableConfig: "1.0.0", ExperimentConfig: "wrong"}, + } + err := d.verifyExpectedState(req) + require.Error(t, err) + var stateErr *errStateDoesntMatch + assert.True(t, errors.As(err, &stateErr)) +} + +func TestVerifyExpectedState_NilClient(t *testing.T) { + // When rcClient is nil, getPackageConfigVersions returns ("", ""). + // A request with empty expected state should pass. + d := &Daemon{} + req := remoteAPIRequest{ + Package: "datadog-operator", + ExpectedState: expectedState{StableConfig: "", ExperimentConfig: ""}, + } + assert.NoError(t, d.verifyExpectedState(req)) +} + +func TestHandleRemoteAPIRequest_InvalidState(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "1.0.0", ExperimentConfigVersion: ""}, + }} + req := testStartRequest() + req.ExpectedState = expectedState{StableConfig: "stale", ExperimentConfig: ""} + err := d.handleRemoteAPIRequest(context.Background(), req) + require.Error(t, err) + var stateErr *errStateDoesntMatch + assert.True(t, errors.As(err, &stateErr)) +} + // --- validateOperation tests --- func TestValidateOperation_Valid(t *testing.T) { From 475d4166ff3c78518c28263483efe201111f55a2 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Wed, 15 Apr 2026 10:22:54 -0400 Subject: [PATCH 13/15] no nsn for stop/promote --- pkg/fleet/daemon.go | 49 ++++++++++++++++++++++++---------------- pkg/fleet/daemon_test.go | 21 ++++------------- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 0ac98f95c1..2fba8233a2 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -22,13 +22,13 @@ import ( "github.com/DataDog/datadog-operator/pkg/remoteconfig" ) -// errStateDoesntMatch is returned by verifyExpectedState when the task's expected +// stateDoesntMatchError is returned by verifyExpectedState when the task's expected // state doesn't match the operator's current installer state. -type errStateDoesntMatch struct { +type stateDoesntMatchError struct { msg string } -func (e *errStateDoesntMatch) Error() string { return e.msg } +func (e *stateDoesntMatchError) Error() string { return e.msg } const ( methodStartDatadogAgentExperiment = "operator/start_datadogagent_experiment" @@ -47,6 +47,7 @@ type Daemon struct { revisionsEnabled bool mu sync.RWMutex configs map[string]installerConfig // keyed by config ID; replaced on each RC update + experimentTarget types.NamespacedName // DDA targeted by the current experiment; set on startExperiment } // NewDaemon creates a new Fleet Daemon. When revisionsEnabled is false, experiment @@ -75,7 +76,7 @@ func (d *Daemon) Start(ctx context.Context) error { d.setTaskState(req.Package, req.ID, pbgo.TaskState_RUNNING, nil) err := d.handleRemoteAPIRequest(ctx, req) if err != nil { - var stateErr *errStateDoesntMatch + var stateErr *stateDoesntMatchError if errors.As(err, &stateErr) { d.setTaskState(req.Package, req.ID, pbgo.TaskState_INVALID_STATE, err) } else { @@ -128,11 +129,11 @@ func (d *Daemon) getConfig(id string) (installerConfig, error) { // verifyExpectedState checks that the config versions in the task's expected_state // match the operator's current installer state for the given package. -// Returns *errStateDoesntMatch when they don't match. +// Returns *stateDoesntMatchError when they don't match. func (d *Daemon) verifyExpectedState(req remoteAPIRequest) error { stable, experiment := d.getPackageConfigVersions(req.Package) if req.ExpectedState.StableConfig != stable || req.ExpectedState.ExperimentConfig != experiment { - return &errStateDoesntMatch{ + return &stateDoesntMatchError{ msg: fmt.Sprintf( "state mismatch for package %s: expected stable_config=%q experiment_config=%q, got stable_config=%q experiment_config=%q", req.Package, @@ -210,11 +211,19 @@ func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIR return err } + // Store the target DDA for promote/stop signals (which don't carry a config). + d.experimentTarget = op.NamespacedName + logger = logger.WithValues("namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name) ctx = ctrl.LoggerInto(ctx, logger) logger.Info("Starting k8s resource patch", "config", op.Config) + // Check the operation + if op.Operation != OperationUpdate { + return fmt.Errorf("start DatadogAgent experiment: invalid operation: %s", op.Operation) + } + // Fetch current DDA to check signal preconditions. dda := &v2alpha1.DatadogAgent{} if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { @@ -268,17 +277,17 @@ func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIR } func (d *Daemon) stopDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { - op, err := d.resolveOperation(req, "stop DatadogAgent experiment") - if err != nil { - return err + nsn := d.experimentTarget + if nsn.Name == "" { + return fmt.Errorf("stop DatadogAgent experiment: no experiment target set (was start called first?)") } - ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name)) + ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", nsn.Namespace, "name", nsn.Name)) logger := ctrl.LoggerFrom(ctx) dda := &v2alpha1.DatadogAgent{} - if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { - return fmt.Errorf("stop DatadogAgent experiment: failed to get DatadogAgent %s: %w", op.NamespacedName, err) + if err := d.client.Get(ctx, nsn, dda); err != nil { + return fmt.Errorf("stop DatadogAgent experiment: failed to get DatadogAgent %s: %w", nsn, err) } if isNoOp, err := canStop(ctx, getExperimentPhase(dda)); err != nil { @@ -289,7 +298,7 @@ func (d *Daemon) stopDatadogAgentExperiment(ctx context.Context, req remoteAPIRe // Update status: set phase=stopped; leave ID and generation for the reconciler. if err := retryWithBackoff(ctx, func() error { - if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + if err := d.client.Get(ctx, nsn, dda); err != nil { return err } dda.Status.Experiment.Phase = v2alpha1.ExperimentPhaseStopped @@ -307,12 +316,12 @@ func (d *Daemon) stopDatadogAgentExperiment(ctx context.Context, req remoteAPIRe } func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { - op, err := d.resolveOperation(req, "promote DatadogAgent experiment") - if err != nil { - return err + nsn := d.experimentTarget + if nsn.Name == "" { + return fmt.Errorf("promote DatadogAgent experiment: no experiment target set (was start called first?)") } - ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name)) + ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", nsn.Namespace, "name", nsn.Name)) logger := ctrl.LoggerFrom(ctx) // Verify there is an active experiment to promote. @@ -322,8 +331,8 @@ func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAP } dda := &v2alpha1.DatadogAgent{} - if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { - return fmt.Errorf("promote DatadogAgent experiment: failed to get DatadogAgent %s: %w", op.NamespacedName, err) + if err := d.client.Get(ctx, nsn, dda); err != nil { + return fmt.Errorf("promote DatadogAgent experiment: failed to get DatadogAgent %s: %w", nsn, err) } if isNoOp, err := canPromote(ctx, getExperimentPhase(dda)); err != nil { @@ -334,7 +343,7 @@ func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAP // Update status: set phase=promoted; leave ID and generation intact. if err := retryWithBackoff(ctx, func() error { - if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + if err := d.client.Get(ctx, nsn, dda); err != nil { return err } dda.Status.Experiment.Phase = v2alpha1.ExperimentPhasePromoted diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go index 144b5b7472..cadf6878bf 100644 --- a/pkg/fleet/daemon_test.go +++ b/pkg/fleet/daemon_test.go @@ -44,6 +44,7 @@ func testDaemon(dda *v2alpha1.DatadogAgent, configs map[string]installerConfig) client: c, revisionsEnabled: true, configs: configs, + experimentTarget: testDDANSN, }, c } @@ -274,13 +275,6 @@ func testStopRequest() remoteAPIRequest { } } -func TestStopDatadogAgentExperiment_ConfigNotFound(t *testing.T) { - d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) - req := testStopRequest() - req.Params.Version = "nonexistent" - assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), req)) -} - func TestStopDatadogAgentExperiment_DDANotFound(t *testing.T) { d, _ := testDaemon(nil, testInstallerConfigWithDDA()) assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) @@ -347,13 +341,6 @@ func testPromoteRequest() remoteAPIRequest { } } -func TestPromoteDatadogAgentExperiment_ConfigNotFound(t *testing.T) { - d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) - req := testPromoteRequest() - req.Params.Version = "nonexistent" - assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), req)) -} - func TestPromoteDatadogAgentExperiment_DDANotFound(t *testing.T) { d, _ := testDaemon(nil, testInstallerConfigWithDDA()) assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) @@ -460,7 +447,7 @@ func TestVerifyExpectedState_StableMismatch(t *testing.T) { } err := d.verifyExpectedState(req) require.Error(t, err) - var stateErr *errStateDoesntMatch + var stateErr *stateDoesntMatchError assert.True(t, errors.As(err, &stateErr)) } @@ -475,7 +462,7 @@ func TestVerifyExpectedState_ExperimentMismatch(t *testing.T) { } err := d.verifyExpectedState(req) require.Error(t, err) - var stateErr *errStateDoesntMatch + var stateErr *stateDoesntMatchError assert.True(t, errors.As(err, &stateErr)) } @@ -499,7 +486,7 @@ func TestHandleRemoteAPIRequest_InvalidState(t *testing.T) { req.ExpectedState = expectedState{StableConfig: "stale", ExperimentConfig: ""} err := d.handleRemoteAPIRequest(context.Background(), req) require.Error(t, err) - var stateErr *errStateDoesntMatch + var stateErr *stateDoesntMatchError assert.True(t, errors.As(err, &stateErr)) } From 8397e6c113265eb72e471bbe328faf54e341a3d9 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Wed, 15 Apr 2026 12:58:58 -0400 Subject: [PATCH 14/15] Tweak logging --- pkg/fleet/daemon.go | 17 ++++++++--------- pkg/fleet/experiment.go | 5 ----- pkg/fleet/remote_config.go | 4 ++-- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 2fba8233a2..8aa0dee282 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -105,14 +105,13 @@ func (d *Daemon) handleConfigs(ctx context.Context, configs map[string]installer d.mu.Lock() defer d.mu.Unlock() logger := ctrl.LoggerFrom(ctx) - logger.Info("Received installer configs", "configs", configs) newConfigs := make(map[string]installerConfig, len(configs)) for _, cfg := range configs { - logger.Info("Received installer config", "id", cfg.ID, "operations", len(cfg.Operations)) + logger.V(2).Info("Received installer config", "id", cfg.ID, "operations", len(cfg.Operations)) newConfigs[cfg.ID] = cfg } d.configs = newConfigs - logger.Info("Updated installer configs", "configs", d.configs) + logger.V(2).Info("Updated installer configs", "configs", d.configs) return nil } @@ -204,7 +203,7 @@ func (d *Daemon) resolveOperation(req remoteAPIRequest, signal string) (fleetMan // The second step updates the DDA status to running, recording the experiment ID. func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { logger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID) - logger.Info("Starting DatadogAgent experiment", "config", req.Params.Version) + logger.V(1).Info("Starting DatadogAgent experiment", "config", req.Params.Version) op, err := d.resolveOperation(req, "start DatadogAgent experiment") if err != nil { logger.Error(err, "Failed to resolve operation") @@ -217,8 +216,6 @@ func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIR logger = logger.WithValues("namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name) ctx = ctrl.LoggerInto(ctx, logger) - logger.Info("Starting k8s resource patch", "config", op.Config) - // Check the operation if op.Operation != OperationUpdate { return fmt.Errorf("start DatadogAgent experiment: invalid operation: %s", op.Operation) @@ -279,11 +276,12 @@ func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIR func (d *Daemon) stopDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { nsn := d.experimentTarget if nsn.Name == "" { - return fmt.Errorf("stop DatadogAgent experiment: no experiment target set (was start called first?)") + return fmt.Errorf("stop DatadogAgent experiment: no experiment target set") } ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", nsn.Namespace, "name", nsn.Name)) logger := ctrl.LoggerFrom(ctx) + logger.V(1).Info("Stopping DatadogAgent experiment") dda := &v2alpha1.DatadogAgent{} if err := d.client.Get(ctx, nsn, dda); err != nil { @@ -318,11 +316,12 @@ func (d *Daemon) stopDatadogAgentExperiment(ctx context.Context, req remoteAPIRe func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { nsn := d.experimentTarget if nsn.Name == "" { - return fmt.Errorf("promote DatadogAgent experiment: no experiment target set (was start called first?)") + return fmt.Errorf("promote DatadogAgent experiment: no experiment target set") } ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", nsn.Namespace, "name", nsn.Name)) logger := ctrl.LoggerFrom(ctx) + logger.V(1).Info("Promoting DatadogAgent experiment") // Verify there is an active experiment to promote. _, experiment := d.getPackageConfigVersions(req.Package) @@ -468,7 +467,7 @@ func (d *Daemon) logInstallerState(caller string) { taskID = pkg.GetTask().GetId() taskState = pkg.GetTask().GetState() } - logger.Info("Installer state", + logger.V(1).Info("Installer state", "caller", caller, "package", pkg.GetPackage(), "stableVersion", pkg.GetStableVersion(), diff --git a/pkg/fleet/experiment.go b/pkg/fleet/experiment.go index 64bdd26b32..149c083e7f 100644 --- a/pkg/fleet/experiment.go +++ b/pkg/fleet/experiment.go @@ -110,14 +110,9 @@ var experimentBackoff = wait.Backoff{ // retryWithBackoff retries fn on any error with exponential backoff. // The total retry window is bounded by a 3-minute context timeout. func retryWithBackoff(ctx context.Context, fn func() error) error { - logger := ctrl.LoggerFrom(ctx) - logger.Info("Retrying with backoff", "backoff", experimentBackoff) ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) defer cancel() - logger.Info("Context with timeout", "timeout", 3*time.Minute) return retry.OnError(experimentBackoff, func(err error) bool { - logger.Info("Error", "error", err) - logger.Info("Context error", "error", ctx.Err()) return ctx.Err() == nil }, fn) } diff --git a/pkg/fleet/remote_config.go b/pkg/fleet/remote_config.go index 248ad0a9b5..47555ae076 100644 --- a/pkg/fleet/remote_config.go +++ b/pkg/fleet/remote_config.go @@ -73,7 +73,7 @@ func handleInstallerConfigUpdate(ctx context.Context, h func(map[string]installe logger := ctrl.LoggerFrom(ctx) configs := make(map[string]installerConfig, len(updates)) for cfgPath, raw := range updates { - logger.Info("Received INSTALLER_CONFIG payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) + logger.V(1).Info("Received INSTALLER_CONFIG payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) var cfg installerConfig if err := json.Unmarshal(raw.Config, &cfg); err != nil { @@ -104,7 +104,7 @@ func handleUpdaterTaskUpdate(ctx context.Context, h func(remoteAPIRequest) error return func(updates map[string]state.RawConfig, applyStatus func(string, state.ApplyStatus)) { logger := ctrl.LoggerFrom(ctx) for cfgPath, raw := range updates { - logger.Info("Received UPDATER_TASK payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) + logger.V(1).Info("Received UPDATER_TASK payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) var req remoteAPIRequest if err := json.Unmarshal(raw.Config, &req); err != nil { From a816a2ac60bcafd4ac389e4e3e0ea854cb6d98b8 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Thu, 16 Apr 2026 12:45:53 -0400 Subject: [PATCH 15/15] Review suggestions + 2x promote fix --- .../controller_experiment_integration_test.go | 397 +++++++++++++++++- .../controller/datadogagent/experiment.go | 43 +- .../datadogagent/experiment_test.go | 31 +- pkg/fleet/daemon.go | 21 +- pkg/fleet/daemon_test.go | 3 +- pkg/fleet/experiment.go | 14 +- 6 files changed, 476 insertions(+), 33 deletions(-) diff --git a/internal/controller/datadogagent/controller_experiment_integration_test.go b/internal/controller/datadogagent/controller_experiment_integration_test.go index 9d28e1ac5e..e30ad5b499 100644 --- a/internal/controller/datadogagent/controller_experiment_integration_test.go +++ b/internal/controller/datadogagent/controller_experiment_integration_test.go @@ -520,6 +520,20 @@ func Test_Experiment_StoppedRollback_AnnotatesOnlyExperimentRevision(t *testing. } assert.NotEmpty(t, experimentRevName, "should have an experiment revision") + // Add a pre-existing annotation to the experiment revision to verify the + // merge patch preserves it when annotateRevision adds its own annotation. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Name == experimentRevName { + rev := rev + if rev.Annotations == nil { + rev.Annotations = map[string]string{} + } + rev.Annotations["custom-annotation"] = "should-survive" + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + break + } + } + // RC writes phase=stopped → triggers rollback. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ @@ -531,11 +545,14 @@ func Test_Experiment_StoppedRollback_AnnotatesOnlyExperimentRevision(t *testing. assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) - // Verify: experiment revision annotated, baseline NOT annotated. + // Verify: experiment revision annotated, baseline NOT annotated, + // and pre-existing annotations are preserved by the merge patch. for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { if rev.Name == experimentRevName { assert.Equal(t, "true", rev.Annotations[annotationExperimentRollback], "experiment revision %s should be annotated after rollback", rev.Name) + assert.Equal(t, "should-survive", rev.Annotations["custom-annotation"], + "pre-existing annotation on experiment revision %s should be preserved", rev.Name) } else { assert.NotEqual(t, "true", rev.Annotations[annotationExperimentRollback], "baseline revision %s should NOT be annotated", rev.Name) @@ -556,6 +573,384 @@ func Test_Experiment_StoppedRollback_AnnotatesOnlyExperimentRevision(t *testing. } } +// Test_Experiment_PromoteThenNewExperiment_NoImmediateTimeout verifies that +// after an experiment is promoted, a subsequent new experiment does not +// immediately timeout due to a stale revision timestamp. +// +// Regression test: the promoted experiment's revision was not annotated, so +// handleRollback fell back to its stale timestamp and fired an immediate +// timeout on the first reconcile of the new experiment. +func Test_Experiment_PromoteThenNewExperiment_NoImmediateTimeout(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + const longTimeout = 5 * time.Second + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, longTimeout) + + // Rev1: baseline. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: first experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // RC writes phase=promoted (experiment succeeded, keep the new spec). + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhasePromoted, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Reconcile processes the promoted phase: manageExperiment annotates the + // revision, then ensureRevision sees the annotation and recreates it with + // a fresh timestamp (consuming the annotation in the process). + reconcileN(t, r, ns, name, 1) + assert.Equal(t, v2alpha1.ExperimentPhasePromoted, mustGetExperimentPhase(t, r, ns, name)) + + // New experiment: daemon patches the spec first, which triggers a reconcile + // while still phase=promoted (abortExperiment is a no-op for non-running + // phases). This reconcile creates a revision for the new spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.jp") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // Daemon then updates status to running with a new experiment ID. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-2", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Patch all revision timestamps to now so fresh revisions have fresh timestamps. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + + // Reconcile: the new experiment's revision exists with a fresh timestamp, + // so neither timeout nor abort should fire. + reconcileN(t, r, ns, name, 1) + + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, mustGetExperimentPhase(t, r, ns, name), + "new experiment should still be running — no false timeout after promotion") +} + +// Test_Experiment_Promoted_DoesNotRecreateRevision verifies that a promoted +// experiment's revision is annotated with experiment-promoted (not +// experiment-rollback), and ensureRevision does NOT delete+recreate it. +func Test_Experiment_Promoted_DoesNotRecreateRevision(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: baseline. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // Record the experiment revision name (highest Revision number). + var experimentRevName string + maxRev := int64(0) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Revision > maxRev { + maxRev = rev.Revision + experimentRevName = rev.Name + } + } + + // RC writes phase=promoted. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhasePromoted, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Reconcile processes promoted phase. + reconcileN(t, r, ns, name, 1) + + // The experiment revision should have the promoted annotation, NOT rollback. + revs := listOwnedRevisions(t, r.client, ns, uid) + for _, rev := range revs { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentPromoted], + "experiment revision should have promoted annotation") + assert.NotEqual(t, "true", rev.Annotations[annotationExperimentRollback], + "experiment revision should NOT have rollback annotation") + } + } + + // Run additional reconciles — the revision should NOT be recreated. + // If it were recreated, the name would stay the same but the promoted + // annotation would be cleared. Verify it persists. + reconcileN(t, r, ns, name, 3) + + revs = listOwnedRevisions(t, r.client, ns, uid) + for _, rev := range revs { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentPromoted], + "promoted annotation should persist — revision must not be recreated") + } + } +} + +// Test_Experiment_StateTransitions verifies that after any terminal state, +// a new experiment can start and reach any terminal state correctly — with +// no false timeouts from stale revision timestamps. +// +// Matrix: 4 previous states × 3 new outcomes × 2 (fresh / stale old revision) = 24 subtests. +// +// Each sub-test follows the same 5-phase flow: +// +// Phase 1: Set up baseline + first experiment, reach terminal state. +// Phase 2: (stale variant only) Age all existing revision timestamps past timeout. +// Phase 3: Start a new experiment (mimics daemon: patch spec, reconcile, set running). +// Phase 4: Assert no false timeout — the new experiment must stay running. +// Phase 5: Drive the new experiment to its target outcome (stop/promote/timeout). +func Test_Experiment_StateTransitions(t *testing.T) { + type terminalSetup struct { + name string + reach func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) + } + + type newOutcome struct { + name string + action func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) + expect v2alpha1.ExperimentPhase + } + + // --------------------------------------------------------------- + // Terminal states: how to get the first experiment into each one. + // --------------------------------------------------------------- + terminalStates := []terminalSetup{ + { + // promoted: RC signals success, spec stays as-is. + name: "promoted", + reach: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhasePromoted, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + // Reconcile annotates the experiment revision with experiment-promoted. + reconcileN(t, r, ns, name, 1) + }, + }, + { + // rollback: RC signals stop, operator restores previous spec. + name: "rollback", + reach: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + // Two reconciles: first restores spec (status conflicts), + // second persists phase=rollback. + reconcileN(t, r, ns, name, 2) + }, + }, + { + // timeout: experiment runs past the deadline, operator rolls back. + name: "timeout", + reach: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + r.options.ExperimentTimeout = 50 * time.Millisecond + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + time.Sleep(100 * time.Millisecond) + // Two reconciles: first rolls back spec (status conflicts), + // second persists phase=timeout. + reconcileN(t, r, ns, name, 2) + }, + }, + { + // aborted: user manually changes spec while experiment is running. + name: "aborted", + reach: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + // Fresh timestamps so timeout doesn't race abort. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + // Manual spec change — doesn't match any revision → abort. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("manual-change.example.com") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + }, + }, + } + + // --------------------------------------------------------------- + // Outcomes: how to drive the new experiment to its terminal state. + // --------------------------------------------------------------- + newOutcomes := []newOutcome{ + { + // stop: RC signals stop → operator rolls back → phase=rollback. + name: "stop", + action: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhaseStopped + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 2) + }, + expect: v2alpha1.ExperimentPhaseRollback, + }, + { + // promote: RC signals success → phase=promoted. + name: "promote", + action: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhasePromoted + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + }, + expect: v2alpha1.ExperimentPhasePromoted, + }, + { + // timeout: age the new experiment's revision past the deadline + // so the reconciler triggers a real timeout. + name: "timeout", + action: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + r.options.ExperimentTimeout = 50 * time.Millisecond + // Only age unannotated revisions (the new experiment's revision). + // Annotated revisions belong to the old experiment and must not + // be touched — they're already handled by the fallback skip. + staleTime := metav1.NewTime(time.Now().Add(-time.Minute)) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Annotations[annotationExperimentRollback] != "true" && + rev.Annotations[annotationExperimentPromoted] != "true" { + rev.CreationTimestamp = staleTime + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + } + reconcileN(t, r, ns, name, 2) + }, + expect: v2alpha1.ExperimentPhaseTimeout, + }, + } + + // --------------------------------------------------------------- + // Test loop: 4 previous × 3 outcomes × 2 (fresh/stale) = 24 subtests. + // --------------------------------------------------------------- + for _, prev := range terminalStates { + for _, next := range newOutcomes { + for _, staleOldRevision := range []bool{false, true} { + suffix := "" + if staleOldRevision { + suffix = "/stale_old_revision" + } + testName := prev.name + "_then_" + next.name + suffix + t.Run(testName, func(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 5*time.Second) + + // -- Phase 1: set up first experiment and reach terminal state -- + + // Baseline spec (rev1). + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // First experiment spec (rev2). + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // Drive exp-1 to its terminal state (promoted/rollback/timeout/aborted). + prev.reach(t, r, ns, name, uid, nsName, dda) + + // -- Phase 2: (stale variant) age old revisions past timeout -- + + if staleOldRevision { + staleTime := metav1.NewTime(time.Now().Add(-10 * time.Minute)) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = staleTime + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + } + + // -- Phase 3: start new experiment (mimics daemon) -- + + // Daemon step 1: patch spec to the new experiment config. + r.options.ExperimentTimeout = 5 * time.Second + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.jp") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + // This reconcile runs while the old terminal phase is still set, + // so handleRollback is a no-op. manageRevision creates a revision + // for the new spec. + reconcileN(t, r, ns, name, 1) + + // Daemon step 2: set phase=running with new experiment ID. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-2", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Give the new experiment's revision a fresh timestamp. + // (Fake client doesn't set CreationTimestamp on create.) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + + // -- Phase 4: assert no false timeout -- + + reconcileN(t, r, ns, name, 1) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, mustGetExperimentPhase(t, r, ns, name), + "new experiment should be running — no false timeout") + + // -- Phase 5: drive new experiment to target outcome -- + + next.action(t, r, ns, name, uid, nsName, dda) + assert.Equal(t, next.expect, mustGetExperimentPhase(t, r, ns, name)) + }) + } + } + } +} + // Test_Experiment_ReapplySameSpec_NoImmediateTimeout verifies the full // annotation-based revision recreate flow end-to-end: // diff --git a/internal/controller/datadogagent/experiment.go b/internal/controller/datadogagent/experiment.go index ce7c189eae..4e7e581e17 100644 --- a/internal/controller/datadogagent/experiment.go +++ b/internal/controller/datadogagent/experiment.go @@ -32,6 +32,12 @@ const ExperimentDefaultTimeout = 15 * time.Minute // gets a fresh timestamp when the same spec is re-applied. const annotationExperimentRollback = "operator.datadoghq.com/experiment-rollback" +// annotationExperimentPromoted marks a ControllerRevision whose experiment was +// promoted. The annotation tells handleRollback to skip the timeout check so +// the stale CreationTimestamp doesn't cause a false timeout when a subsequent +// experiment starts before its own revision is created. +const annotationExperimentPromoted = "operator.datadoghq.com/experiment-promoted" + // manageExperiment handles all experiment state transitions for a reconcile cycle. // Must be called before manageRevision. func (r *Reconciler) manageExperiment( @@ -51,6 +57,14 @@ func (r *Reconciler) manageExperiment( if err := r.handleRollback(ctx, instance, newStatus, now, revList); err != nil { return err } + // Mark the highest revision when promoted so its stale timestamp doesn't + // cause a false timeout if a new experiment starts before manageRevision + // creates a fresh revision. + if experiment.Phase == v2alpha1.ExperimentPhasePromoted { + if rev := highestRevision(revList); rev != nil { + r.annotateRevision(ctx, rev, annotationExperimentPromoted) + } + } r.abortExperiment(ctx, instance, experiment, newStatus, revList) return nil } @@ -96,7 +110,7 @@ func (r *Reconciler) abortExperiment( // Mark the experiment revision (highest-numbered) so its stale timestamp // doesn't cause an immediate timeout if the same spec is re-applied. if rev := highestRevision(revisions); rev != nil { - r.markRevisionRolledBack(ctx, rev) + r.annotateRevision(ctx, rev, annotationExperimentRollback) } } @@ -127,8 +141,15 @@ func (r *Reconciler) handleRollback( // Fall back to the highest-numbered revision (the experiment revision) // so we can still detect timeout even after a manual spec change. rev = highestRevision(revisions) + // Only skip for annotations in the fallback path: the spec doesn't + // match any revision (e.g. new experiment before manageRevision runs). + // Annotations on fallback revisions indicate a prior experiment's + // terminal state — their stale timestamps must not cause false timeouts. + if rev != nil && (rev.Annotations[annotationExperimentRollback] == "true" || rev.Annotations[annotationExperimentPromoted] == "true") { + break + } } - if rev != nil && rev.Annotations[annotationExperimentRollback] != "true" { + if rev != nil { elapsed := now.Sub(rev.CreationTimestamp.Time) if elapsed >= getExperimentTimeout(r.options.ExperimentTimeout) { logger.Info("Experiment timed out, rolling back", "elapsed", elapsed.String()) @@ -164,7 +185,7 @@ func (r *Reconciler) restorePreviousSpec( // and annotating old baselines would cause needless delete+recreate in // ensureRevision if those specs are ever re-applied. if rev := highestRevision(revisions); rev != nil && rev.Name != rollbackTarget { - r.markRevisionRolledBack(ctx, rev) + r.annotateRevision(ctx, rev, annotationExperimentRollback) } return nil } @@ -280,11 +301,11 @@ func highestRevision(revisions []appsv1.ControllerRevision) *appsv1.ControllerRe return result } -// markRevisionRolledBack annotates a ControllerRevision with the rollback -// annotation. This is best-effort: if the patch fails, the stale timestamp -// remains but is no worse than the existing behavior. -func (r *Reconciler) markRevisionRolledBack(ctx context.Context, rev *appsv1.ControllerRevision) { - if rev.Annotations[annotationExperimentRollback] == "true" { +// annotateRevision sets the given annotation on a ControllerRevision. +// This is best-effort: if the patch fails, the stale timestamp remains +// but is no worse than the existing behavior. +func (r *Reconciler) annotateRevision(ctx context.Context, rev *appsv1.ControllerRevision, annotation string) { + if rev.Annotations[annotation] == "true" { return // already annotated } logger := ctrl.LoggerFrom(ctx).WithValues( @@ -292,12 +313,12 @@ func (r *Reconciler) markRevisionRolledBack(ctx context.Context, rev *appsv1.Con "object.namespace", rev.Namespace, "object.name", rev.Name, ) - patch := []byte(`{"metadata":{"annotations":{"` + annotationExperimentRollback + `":"true"}}}`) + patch := []byte(`{"metadata":{"annotations":{"` + annotation + `":"true"}}}`) if err := r.client.Patch(ctx, rev, client.RawPatch(types.MergePatchType, patch)); err != nil { - logger.Error(err, "Failed to annotate experiment revision as rolled back") + logger.Error(err, "Failed to annotate experiment revision", "annotation", annotation) return } - logger.Info("Marked experiment revision as rolled back") + logger.Info("Annotated experiment revision", "annotation", annotation) } func getExperimentTimeout(timeout time.Duration) time.Duration { diff --git a/internal/controller/datadogagent/experiment_test.go b/internal/controller/datadogagent/experiment_test.go index bc62995e91..541d8ee12f 100644 --- a/internal/controller/datadogagent/experiment_test.go +++ b/internal/controller/datadogagent/experiment_test.go @@ -356,19 +356,15 @@ func TestReapplySameSpecAfterRollback_NoImmediateTimeout(t *testing.T) { } assert.Equal(t, 1, annotatedCount, "exactly one revision should have the rollback annotation") - // RC re-applies spec B as a new experiment (sets spec=B, phase=Running). - // handleRollback finds the annotated revision → skips timeout. + // RC re-applies spec B as a new experiment. + // In the real flow, the daemon patches the spec first, then a reconcile runs + // (with no experiment phase set) where ensureRevision recreates the annotated + // revision with a fresh timestamp. Only then does the daemon set phase=Running + // and the next reconcile calls handleRollback. instanceB2 := newRevisionTestOwner("test-dda", "default") instanceB2.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - instanceB2.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} - - revListForNewExp := mustListRevisions(t, r, instanceB2) - newStatus2 := &v2alpha1.DatadogAgentStatus{Experiment: instanceB2.Status.Experiment.DeepCopy()} - require.NoError(t, r.handleRollback(context.Background(), instanceB2, newStatus2, metav1.Now(), revListForNewExp)) - assert.Equal(t, v2alpha1.ExperimentPhaseRunning, newStatus2.Experiment.Phase, - "re-applying the same spec after rollback must not immediately time out") - // ensureRevision recreates the annotated revision with a fresh timestamp. + // Step 1: ensureRevision recreates the annotated revision (fresh, no annotation). _, err := r.ensureRevision(context.Background(), instanceB2, mustListRevisions(t, r, instanceB2), false) require.NoError(t, err) @@ -377,6 +373,21 @@ func TestReapplySameSpecAfterRollback_NoImmediateTimeout(t *testing.T) { assert.Empty(t, rev.Annotations[annotationExperimentRollback], "rollback annotation should be cleared after recreate") } + + // Fake client doesn't set CreationTimestamp on Create, so patch all + // revision timestamps to now to simulate fresh revisions. + for i := range finalRevs { + finalRevs[i].CreationTimestamp = metav1.Now() + require.NoError(t, c.Update(context.Background(), &finalRevs[i])) + } + + // Step 2: daemon sets phase=Running, next reconcile calls handleRollback. + instanceB2.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} + revListForNewExp := mustListRevisions(t, r, instanceB2) + newStatus2 := &v2alpha1.DatadogAgentStatus{Experiment: instanceB2.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceB2, newStatus2, metav1.Now(), revListForNewExp)) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, newStatus2.Experiment.Phase, + "re-applying the same spec after rollback must not immediately time out") } // TestRestorePreviousSpec_ThreeRevisions_AnnotatesOnlyHighest verifies that diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 8aa0dee282..d3a1e20c51 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -299,6 +299,9 @@ func (d *Daemon) stopDatadogAgentExperiment(ctx context.Context, req remoteAPIRe if err := d.client.Get(ctx, nsn, dda); err != nil { return err } + if dda.Status.Experiment == nil { + return fmt.Errorf("experiment status was cleared unexpectedly") + } dda.Status.Experiment.Phase = v2alpha1.ExperimentPhaseStopped return d.client.Status().Update(ctx, dda) }); err != nil { @@ -345,6 +348,9 @@ func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAP if err := d.client.Get(ctx, nsn, dda); err != nil { return err } + if dda.Status.Experiment == nil { + return fmt.Errorf("experiment status was cleared unexpectedly") + } dda.Status.Experiment.Phase = v2alpha1.ExperimentPhasePromoted return d.client.Status().Update(ctx, dda) }); err != nil { @@ -415,10 +421,11 @@ func (d *Daemon) getPackageConfigVersions(pkgName string) (stable, experiment st return "", "" } -// setPackageConfigVersions updates the stable and experiment version fields of the -// package entry in the RC installer state. Both the config variants -// (StableConfigVersion/ExperimentConfigVersion) and the package variants -// (StableVersion/ExperimentVersion) are set to the same values. +// setPackageConfigVersions updates the config version fields of the +// package entry in the RC installer state. Only the config variants +// (StableConfigVersion/ExperimentConfigVersion) are set; the package variants +// (StableVersion/ExperimentVersion) are preserved since this is a config +// experiment, not a package upgrade. func (d *Daemon) setPackageConfigVersions(pkgName, stable, experiment string) { if d.rcClient == nil { return @@ -430,8 +437,8 @@ func (d *Daemon) setPackageConfigVersions(pkgName, stable, experiment string) { if pkg.GetPackage() == pkgName { updated = append(updated, &pbgo.PackageState{ Package: pkg.GetPackage(), - StableVersion: stable, - ExperimentVersion: experiment, + StableVersion: pkg.GetStableVersion(), + ExperimentVersion: pkg.GetExperimentVersion(), Task: pkg.GetTask(), StableConfigVersion: stable, ExperimentConfigVersion: experiment, @@ -444,8 +451,6 @@ func (d *Daemon) setPackageConfigVersions(pkgName, stable, experiment string) { if !found { updated = append(updated, &pbgo.PackageState{ Package: pkgName, - StableVersion: stable, - ExperimentVersion: experiment, StableConfigVersion: stable, ExperimentConfigVersion: experiment, }) diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go index cadf6878bf..6708253f5e 100644 --- a/pkg/fleet/daemon_test.go +++ b/pkg/fleet/daemon_test.go @@ -416,7 +416,8 @@ func TestPromoteDatadogAgentExperiment_Success_Running(t *testing.T) { assert.Equal(t, "old-exp", dda.Status.Experiment.ID) // Stable should now be the old experiment, experiment should be cleared. require.Len(t, rc.state, 1) - assert.Equal(t, "exp-1", rc.state[0].StableVersion) + // StableVersion/ExperimentVersion are not set by config experiments. + assert.Equal(t, "", rc.state[0].StableVersion) assert.Equal(t, "", rc.state[0].ExperimentVersion) assert.Equal(t, "exp-1", rc.state[0].StableConfigVersion) assert.Equal(t, "", rc.state[0].ExperimentConfigVersion) diff --git a/pkg/fleet/experiment.go b/pkg/fleet/experiment.go index 149c083e7f..ec4bf24369 100644 --- a/pkg/fleet/experiment.go +++ b/pkg/fleet/experiment.go @@ -11,6 +11,7 @@ import ( "math" "time" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" @@ -107,12 +108,21 @@ var experimentBackoff = wait.Backoff{ Steps: math.MaxInt32, } -// retryWithBackoff retries fn on any error with exponential backoff. +// isRetryable returns true for errors that are worth retrying (i.e. not permanent). +func isRetryable(err error) bool { + return !apierrors.IsNotFound(err) && + !apierrors.IsForbidden(err) && + !apierrors.IsInvalid(err) && + !apierrors.IsMethodNotSupported(err) +} + +// retryWithBackoff retries fn on transient errors with exponential backoff. // The total retry window is bounded by a 3-minute context timeout. +// Permanent errors (not-found, forbidden, invalid, method-not-supported) are not retried. func retryWithBackoff(ctx context.Context, fn func() error) error { ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) defer cancel() return retry.OnError(experimentBackoff, func(err error) bool { - return ctx.Err() == nil + return ctx.Err() == nil && isRetryable(err) }, fn) }