diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index 3929616e1d..21667f6d28 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -2395,13 +2395,6 @@ type ExperimentStatus struct { // ID is the unique experiment ID sent by Fleet Automation. // +optional ID string `json:"id,omitempty"` - // Generation is the DDA metadata.generation recorded when the experiment started. - // Used to detect manual spec changes while the experiment is running: if the - // current DDA generation differs from this value, the operator aborts the experiment. - // - // This value must be recorded after the DDA is patched for a startExperiment signal. - // +optional - Generation int64 `json:"generation,omitempty"` } // DatadogAgentStatus defines the observed state of DatadogAgent. diff --git a/api/datadoghq/v2alpha1/zz_generated.openapi.go b/api/datadoghq/v2alpha1/zz_generated.openapi.go index e35c857471..a5aa89253d 100644 --- a/api/datadoghq/v2alpha1/zz_generated.openapi.go +++ b/api/datadoghq/v2alpha1/zz_generated.openapi.go @@ -1142,13 +1142,6 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_ExperimentStatus(ref common. Format: "", }, }, - "generation": { - SchemaProps: spec.SchemaProps{ - Description: "Generation is the DDA metadata.generation recorded when the experiment started. Used to detect manual spec changes while the experiment is running: if the current DDA generation differs from this value, the operator aborts the experiment.\n\nThis value must be recorded after the DDA is patched for a startExperiment signal.", - Type: []string{"integer"}, - Format: "int64", - }, - }, }, }, }, diff --git a/cmd/main.go b/cmd/main.go index 18e18db300..1713d36d87 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -345,7 +345,7 @@ func run(opts *options) error { } if opts.remoteUpdatesEnabled { - if rcErr := setupFleetDaemon(setupLog, mgr, rcUpdater.Client()); rcErr != nil { + if rcErr := setupFleetDaemon(setupLog, mgr, rcUpdater.Client(), opts.createControllerRevisions && opts.datadogAgentInternalEnabled); rcErr != nil { setupErrorf(setupLog, rcErr, "Unable to setup Fleet daemon") } } @@ -670,7 +670,7 @@ func setupAndStartHelmMetadataForwarder(logger logr.Logger, mgr manager.Manager, return mgr.Add(hmf) } -func setupFleetDaemon(logger logr.Logger, mgr manager.Manager, rcClient remoteconfig.RCClient) error { - daemon := fleet.NewDaemon(logger.WithName("fleet"), rcClient) +func setupFleetDaemon(logger logr.Logger, mgr manager.Manager, rcClient remoteconfig.RCClient, revisionsEnabled bool) error { + daemon := fleet.NewDaemon(rcClient, mgr.GetClient(), revisionsEnabled) return mgr.Add(daemon) } diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index 22ecfd013a..09d72a666d 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -8492,15 +8492,6 @@ spec: experiment: description: Experiment tracks the state of an active or recent Fleet Automation experiment. properties: - generation: - description: |- - Generation is the DDA metadata.generation recorded when the experiment started. - Used to detect manual spec changes while the experiment is running: if the - current DDA generation differs from this value, the operator aborts the experiment. - - This value must be recorded after the DDA is patched for a startExperiment signal. - format: int64 - type: integer id: description: ID is the unique experiment ID sent by Fleet Automation. type: string diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index 84ba9c4e7d..ab6b77eb9d 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -8194,11 +8194,6 @@ "additionalProperties": false, "description": "Experiment tracks the state of an active or recent Fleet Automation experiment.", "properties": { - "generation": { - "description": "Generation is the DDA metadata.generation recorded when the experiment started.\nUsed to detect manual spec changes while the experiment is running: if the\ncurrent DDA generation differs from this value, the operator aborts the experiment.\n\nThis value must be recorded after the DDA is patched for a startExperiment signal.", - "format": "int64", - "type": "integer" - }, "id": { "description": "ID is the unique experiment ID sent by Fleet Automation.", "type": "string" diff --git a/go.mod b/go.mod index f3aa5a7c08..273a6bcb28 100644 --- a/go.mod +++ b/go.mod @@ -42,7 +42,7 @@ require ( github.com/DataDog/datadog-agent/pkg/config/model v0.59.0-rc.5 github.com/DataDog/datadog-agent/pkg/config/remote v0.59.0-rc.5 github.com/DataDog/datadog-agent/pkg/proto v0.63.0-rc.1 - github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.59.0-rc.5 + github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.77.3 github.com/DataDog/datadog-operator/api v0.0.0-20250130131115-7f198adcc856 github.com/aws/aws-sdk-go-v2 v1.39.6 github.com/aws/aws-sdk-go-v2/config v1.29.17 @@ -90,7 +90,7 @@ require ( github.com/DataDog/datadog-go/v5 v5.6.0 // indirect github.com/DataDog/go-libddwaf/v3 v3.3.0 // indirect github.com/DataDog/go-sqllexer v0.0.15 // indirect - github.com/DataDog/go-tuf v1.1.0-0.5.2 // indirect + github.com/DataDog/go-tuf v1.1.1-0.5.2 // indirect github.com/DataDog/gostackparse v0.7.0 // indirect github.com/DataDog/sketches-go v1.4.5 // indirect github.com/DataDog/viper v1.13.5 // indirect @@ -226,7 +226,7 @@ require ( github.com/rubenv/sql-migrate v1.8.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 // indirect - github.com/secure-systems-lab/go-securesystemslib v0.7.0 // indirect + github.com/secure-systems-lab/go-securesystemslib v0.9.0 // indirect github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect github.com/shirou/gopsutil/v3 v3.24.1 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect diff --git a/go.sum b/go.sum index b91a610ad0..aef2b256af 100644 --- a/go.sum +++ b/go.sum @@ -81,8 +81,8 @@ github.com/DataDog/datadog-agent/pkg/obfuscate v0.59.0-rc.5 h1:Z+vgXHXmSnVRlCpcP github.com/DataDog/datadog-agent/pkg/obfuscate v0.59.0-rc.5/go.mod h1:ATVw8kr3U1Eqz3qBz9kS6WFDKji9XyoAsHKSlj3hPTM= github.com/DataDog/datadog-agent/pkg/proto v0.63.0-rc.1 h1:5Xr7XQoINzwO0PswNKClUW4b/ZfsTefVHxTAjfMY4Qk= github.com/DataDog/datadog-agent/pkg/proto v0.63.0-rc.1/go.mod h1:QOAaPRsuM4WNXtP3Rbw+gWamge9lGnDH8ZKB6HdzoAs= -github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.59.0-rc.5 h1:kCZxrxVVfbYAA8oMB4s7yb8XbO5WMkBmygcgKm3xh7o= -github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.59.0-rc.5/go.mod h1:c4th0IFaP0Q1ofRa0GcPB9hJWN+cmUoEfOI1Ub0O50A= +github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.77.3 h1:cQvV1Qiq0EBdbGxfU4ECDJ8Nl3xYXcNUe6QzXw73pA4= +github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.77.3/go.mod h1:TpW5ZwsQTrlRBPjtZH5/OFwpmOqxV/v2i9BiF4Xfcac= github.com/DataDog/datadog-agent/pkg/util/backoff v0.59.0-rc.5 h1:mAXwj00h3GqpiY7yECDLwLvCOAXswx3mIiU9N6Ki1GU= github.com/DataDog/datadog-agent/pkg/util/backoff v0.59.0-rc.5/go.mod h1:2RMfdYkKyeh8hXs6WgaamkkEyK35Xo55C4rFG4dO1k8= github.com/DataDog/datadog-agent/pkg/util/cache v0.59.0-rc.5 h1:+20QEL2pI+IwOqqesjhzzWkv6GcOe3TrIgUQtg6ZdTE= @@ -110,8 +110,8 @@ github.com/DataDog/go-libddwaf/v3 v3.3.0 h1:jS72fuQpFgJZEdEJDmHJCPAgNTEMZoz1EUvi github.com/DataDog/go-libddwaf/v3 v3.3.0/go.mod h1:Bz/0JkpGf689mzbUjKJeheJINqsyyhM8p9PDuHdK2Ec= github.com/DataDog/go-sqllexer v0.0.15 h1:rUUu52dP8EQhJLnUw0MIAxZp0BQx2fOTuMztr3vtHUU= github.com/DataDog/go-sqllexer v0.0.15/go.mod h1:KwkYhpFEVIq+BfobkTC1vfqm4gTi65skV/DpDBXtexc= -github.com/DataDog/go-tuf v1.1.0-0.5.2 h1:4CagiIekonLSfL8GMHRHcHudo1fQnxELS9g4tiAupQ4= -github.com/DataDog/go-tuf v1.1.0-0.5.2/go.mod h1:zBcq6f654iVqmkk8n2Cx81E1JnNTMOAx1UEO/wZR+P0= +github.com/DataDog/go-tuf v1.1.1-0.5.2 h1:YWvghV4ZvrQsPcUw8IOUMSDpqc3W5ruOIC+KJxPknv0= +github.com/DataDog/go-tuf v1.1.1-0.5.2/go.mod h1:zBcq6f654iVqmkk8n2Cx81E1JnNTMOAx1UEO/wZR+P0= github.com/DataDog/gostackparse v0.7.0 h1:i7dLkXHvYzHV308hnkvVGDL3BR4FWl7IsXNPz/IGQh4= github.com/DataDog/gostackparse v0.7.0/go.mod h1:lTfqcJKqS9KnXQGnyQMCugq3u1FP6UZMfWR0aitKFMM= github.com/DataDog/sketches-go v1.4.5 h1:ki7VfeNz7IcNafq7yI/j5U/YCkO3LJiMDtXz9OMQbyE= @@ -1097,8 +1097,8 @@ github.com/santhosh-tekuri/jsonschema/v6 v6.0.2/go.mod h1:JXeL+ps8p7/KNMjDQk3TCw github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= -github.com/secure-systems-lab/go-securesystemslib v0.7.0 h1:OwvJ5jQf9LnIAS83waAjPbcMsODrTQUpJ02eNLUoxBg= -github.com/secure-systems-lab/go-securesystemslib v0.7.0/go.mod h1:/2gYnlnHVQ6xeGtfIqFy7Do03K4cdCY0A/GlJLDKLHI= +github.com/secure-systems-lab/go-securesystemslib v0.9.0 h1:rf1HIbL64nUpEIZnjLZ3mcNEL9NBPB0iuVjyxvq3LZc= +github.com/secure-systems-lab/go-securesystemslib v0.9.0/go.mod h1:DVHKMcZ+V4/woA/peqr+L0joiRXbPpQ042GgJckkFgw= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/shirou/gopsutil/v3 v3.24.1 h1:R3t6ondCEvmARp3wxODhXMTLC/klMa87h2PHUw5m7QI= diff --git a/internal/controller/datadogagent/controller_experiment_integration_test.go b/internal/controller/datadogagent/controller_experiment_integration_test.go index 399685ee25..e30ad5b499 100644 --- a/internal/controller/datadogagent/controller_experiment_integration_test.go +++ b/internal/controller/datadogagent/controller_experiment_integration_test.go @@ -79,9 +79,8 @@ func Test_Experiment_StoppedRollback(t *testing.T) { // RC writes phase=stopped. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseStopped, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) @@ -123,9 +122,8 @@ func Test_Experiment_TimeoutRollback(t *testing.T) { // RC writes phase=running. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) @@ -163,39 +161,31 @@ func Test_Experiment_AbortOnManualChange(t *testing.T) { assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) - // The fake client does not auto-increment Generation on Update, so revisions - // end up with a zero CreationTimestamp (fake-client limitation). Patch them to - // a recent time so the timeout path in handleRollback is not accidentally - // triggered before the abort check runs. + // Patch revision timestamps to a recent time so the timeout path in + // handleRollback is not accidentally triggered before the abort check runs. for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { rev.CreationTimestamp = metav1.Now() assert.NoError(t, r.client.Update(context.TODO(), &rev)) } assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - // The fake client never increments Generation, so dda.Generation is always 0. - // Set experiment.Generation to a non-zero sentinel so the abortExperiment - // generation-mismatch check (instance.Generation != experiment.Generation) fires. - const experimentGen = int64(2) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: experimentGen, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) - // User manually changes the spec — in a real cluster this bumps Generation past - // experimentGen. In the fake client Generation stays at 0, which differs from - // the experimentGen sentinel above, so the mismatch is already in place. + // User manually changes the spec — the new spec won't match any known revision, + // so abortExperiment detects it as a manual change. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = ptr.To("datadoghq.com") + dda.Spec.Global.Site = ptr.To("manual-change.example.com") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) // Spec should be the user's manual change, not rolled back. - assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.Equal(t, "manual-change.example.com", *dda.Spec.Global.Site) assert.NotNil(t, dda.Status.Experiment) assert.Equal(t, v2alpha1.ExperimentPhaseAborted, dda.Status.Experiment.Phase) } @@ -232,9 +222,8 @@ func Test_Experiment_TimeoutPhase_IsStable(t *testing.T) { assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) time.Sleep(2 * timeout) @@ -269,9 +258,8 @@ func Test_Experiment_RollbackPhase_IsStable(t *testing.T) { assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseStopped, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 2) @@ -286,12 +274,12 @@ func Test_Experiment_RollbackPhase_IsStable(t *testing.T) { assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) } -// Test_Experiment_RunningAfterTimeout_StaleGeneration verifies that if RC -// writes phase=running after a timeout rollback has completed, the operator -// fires timeout again idempotently: the pre-experiment revision is old enough -// to exceed the timeout threshold, rollback is a no-op (spec already correct), -// and phase=timeout is written again. -func Test_Experiment_RunningAfterTimeout_StaleGeneration(t *testing.T) { +// Test_Experiment_RunningAfterTimeout verifies that if RC writes phase=running +// after a timeout rollback has completed, the operator fires timeout again +// idempotently: the pre-experiment revision is old enough to exceed the timeout +// threshold, rollback is a no-op (spec already correct), and phase=timeout is +// written again. +func Test_Experiment_RunningAfterTimeout(t *testing.T) { const ns, name = "default", "test-dda" const uid = types.UID("uid-1") const timeout = 50 * time.Millisecond @@ -309,22 +297,19 @@ func Test_Experiment_RunningAfterTimeout_StaleGeneration(t *testing.T) { assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) time.Sleep(2 * timeout) reconcileN(t, r, ns, name, 2) assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) - // RC writes phase=running again with the old (pre-rollback) generation — stale. + // RC writes phase=running again after the rollback already completed. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - const staleGen = int64(99) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: staleGen, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) @@ -355,9 +340,8 @@ func Test_Experiment_StoppedAfterRollback(t *testing.T) { assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseStopped, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 2) @@ -397,9 +381,8 @@ func Test_Experiment_AbortDoesNotRollback(t *testing.T) { // Manually force phase=aborted (as if abort already happened). assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) dda.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseAborted, - Generation: dda.Generation, - ID: "exp-1", + Phase: v2alpha1.ExperimentPhaseAborted, + ID: "exp-1", } assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) @@ -413,3 +396,662 @@ func Test_Experiment_AbortDoesNotRollback(t *testing.T) { // Also verify the revision timestamp is not used as a proxy for time.Now() comparison. _ = metav1.Now() } + +// Test_Experiment_Abort_AnnotatesOnlyExperimentRevision verifies that when an +// experiment is aborted due to a manual spec change: +// - The experiment revision (highest at the time of abort) is annotated. +// - The new revision created by manageRevision for the manual-change spec +// is NOT annotated. +// - Subsequent reconciles do not re-annotate or spread the annotation. +func Test_Experiment_Abort_AnnotatesOnlyExperimentRevision(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: baseline. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // Record the experiment revision name (highest Revision number). + var experimentRevName string + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if experimentRevName == "" || rev.Revision > 0 { + experimentRevName = rev.Name + } + } + maxRev := int64(0) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Revision > maxRev { + maxRev = rev.Revision + experimentRevName = rev.Name + } + } + + // Patch timestamps so timeout doesn't fire before abort. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + + // Set phase=running. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // User manually changes the spec — triggers abort. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("manual-change.example.com") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + + // First reconcile: abort fires (manageExperiment), then manageRevision + // creates rev3 for the manual-change spec. + reconcileN(t, r, ns, name, 1) + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, mustGetExperimentPhase(t, r, ns, name)) + + // Verify: only the experiment revision is annotated, not the new one. + revs := listOwnedRevisions(t, r.client, ns, uid) + for _, rev := range revs { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentRollback], + "experiment revision %s should be annotated", rev.Name) + } else { + assert.NotEqual(t, "true", rev.Annotations[annotationExperimentRollback], + "non-experiment revision %s should NOT be annotated", rev.Name) + } + } + + // Run additional reconciles — annotation state must be stable. + reconcileN(t, r, ns, name, 3) + + revs = listOwnedRevisions(t, r.client, ns, uid) + annotatedNames := []string{} + for _, rev := range revs { + if rev.Annotations[annotationExperimentRollback] == "true" { + annotatedNames = append(annotatedNames, rev.Name) + } + } + assert.Equal(t, []string{experimentRevName}, annotatedNames, + "after multiple reconciles, only the original experiment revision should remain annotated") +} + +// Test_Experiment_StoppedRollback_AnnotatesOnlyExperimentRevision verifies +// that when an experiment is rolled back via phase=stopped: +// - The experiment revision is annotated with experiment-rollback. +// - The rollback target (baseline) revision is NOT annotated. +// - Subsequent reconciles do not spread or remove the annotation. +func Test_Experiment_StoppedRollback_AnnotatesOnlyExperimentRevision(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: baseline. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Record baseline revision name. + baselineRevName := listOwnedRevisions(t, r.client, ns, uid)[0].Name + + // Rev2: experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // Identify experiment revision. + var experimentRevName string + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Name != baselineRevName { + experimentRevName = rev.Name + } + } + assert.NotEmpty(t, experimentRevName, "should have an experiment revision") + + // Add a pre-existing annotation to the experiment revision to verify the + // merge patch preserves it when annotateRevision adds its own annotation. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Name == experimentRevName { + rev := rev + if rev.Annotations == nil { + rev.Annotations = map[string]string{} + } + rev.Annotations["custom-annotation"] = "should-survive" + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + break + } + } + + // RC writes phase=stopped → triggers rollback. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 2) + + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) + + // Verify: experiment revision annotated, baseline NOT annotated, + // and pre-existing annotations are preserved by the merge patch. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentRollback], + "experiment revision %s should be annotated after rollback", rev.Name) + assert.Equal(t, "should-survive", rev.Annotations["custom-annotation"], + "pre-existing annotation on experiment revision %s should be preserved", rev.Name) + } else { + assert.NotEqual(t, "true", rev.Annotations[annotationExperimentRollback], + "baseline revision %s should NOT be annotated", rev.Name) + } + } + + // Run additional reconciles — annotation state must be stable. + reconcileN(t, r, ns, name, 3) + + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentRollback], + "experiment revision %s should still be annotated after extra reconciles", rev.Name) + } else { + assert.NotEqual(t, "true", rev.Annotations[annotationExperimentRollback], + "baseline revision %s should still NOT be annotated after extra reconciles", rev.Name) + } + } +} + +// Test_Experiment_PromoteThenNewExperiment_NoImmediateTimeout verifies that +// after an experiment is promoted, a subsequent new experiment does not +// immediately timeout due to a stale revision timestamp. +// +// Regression test: the promoted experiment's revision was not annotated, so +// handleRollback fell back to its stale timestamp and fired an immediate +// timeout on the first reconcile of the new experiment. +func Test_Experiment_PromoteThenNewExperiment_NoImmediateTimeout(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + const longTimeout = 5 * time.Second + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, longTimeout) + + // Rev1: baseline. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: first experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // RC writes phase=promoted (experiment succeeded, keep the new spec). + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhasePromoted, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Reconcile processes the promoted phase: manageExperiment annotates the + // revision, then ensureRevision sees the annotation and recreates it with + // a fresh timestamp (consuming the annotation in the process). + reconcileN(t, r, ns, name, 1) + assert.Equal(t, v2alpha1.ExperimentPhasePromoted, mustGetExperimentPhase(t, r, ns, name)) + + // New experiment: daemon patches the spec first, which triggers a reconcile + // while still phase=promoted (abortExperiment is a no-op for non-running + // phases). This reconcile creates a revision for the new spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.jp") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // Daemon then updates status to running with a new experiment ID. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-2", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Patch all revision timestamps to now so fresh revisions have fresh timestamps. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + + // Reconcile: the new experiment's revision exists with a fresh timestamp, + // so neither timeout nor abort should fire. + reconcileN(t, r, ns, name, 1) + + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, mustGetExperimentPhase(t, r, ns, name), + "new experiment should still be running — no false timeout after promotion") +} + +// Test_Experiment_Promoted_DoesNotRecreateRevision verifies that a promoted +// experiment's revision is annotated with experiment-promoted (not +// experiment-rollback), and ensureRevision does NOT delete+recreate it. +func Test_Experiment_Promoted_DoesNotRecreateRevision(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: baseline. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // Record the experiment revision name (highest Revision number). + var experimentRevName string + maxRev := int64(0) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Revision > maxRev { + maxRev = rev.Revision + experimentRevName = rev.Name + } + } + + // RC writes phase=promoted. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhasePromoted, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Reconcile processes promoted phase. + reconcileN(t, r, ns, name, 1) + + // The experiment revision should have the promoted annotation, NOT rollback. + revs := listOwnedRevisions(t, r.client, ns, uid) + for _, rev := range revs { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentPromoted], + "experiment revision should have promoted annotation") + assert.NotEqual(t, "true", rev.Annotations[annotationExperimentRollback], + "experiment revision should NOT have rollback annotation") + } + } + + // Run additional reconciles — the revision should NOT be recreated. + // If it were recreated, the name would stay the same but the promoted + // annotation would be cleared. Verify it persists. + reconcileN(t, r, ns, name, 3) + + revs = listOwnedRevisions(t, r.client, ns, uid) + for _, rev := range revs { + if rev.Name == experimentRevName { + assert.Equal(t, "true", rev.Annotations[annotationExperimentPromoted], + "promoted annotation should persist — revision must not be recreated") + } + } +} + +// Test_Experiment_StateTransitions verifies that after any terminal state, +// a new experiment can start and reach any terminal state correctly — with +// no false timeouts from stale revision timestamps. +// +// Matrix: 4 previous states × 3 new outcomes × 2 (fresh / stale old revision) = 24 subtests. +// +// Each sub-test follows the same 5-phase flow: +// +// Phase 1: Set up baseline + first experiment, reach terminal state. +// Phase 2: (stale variant only) Age all existing revision timestamps past timeout. +// Phase 3: Start a new experiment (mimics daemon: patch spec, reconcile, set running). +// Phase 4: Assert no false timeout — the new experiment must stay running. +// Phase 5: Drive the new experiment to its target outcome (stop/promote/timeout). +func Test_Experiment_StateTransitions(t *testing.T) { + type terminalSetup struct { + name string + reach func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) + } + + type newOutcome struct { + name string + action func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) + expect v2alpha1.ExperimentPhase + } + + // --------------------------------------------------------------- + // Terminal states: how to get the first experiment into each one. + // --------------------------------------------------------------- + terminalStates := []terminalSetup{ + { + // promoted: RC signals success, spec stays as-is. + name: "promoted", + reach: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhasePromoted, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + // Reconcile annotates the experiment revision with experiment-promoted. + reconcileN(t, r, ns, name, 1) + }, + }, + { + // rollback: RC signals stop, operator restores previous spec. + name: "rollback", + reach: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + // Two reconciles: first restores spec (status conflicts), + // second persists phase=rollback. + reconcileN(t, r, ns, name, 2) + }, + }, + { + // timeout: experiment runs past the deadline, operator rolls back. + name: "timeout", + reach: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + r.options.ExperimentTimeout = 50 * time.Millisecond + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + time.Sleep(100 * time.Millisecond) + // Two reconciles: first rolls back spec (status conflicts), + // second persists phase=timeout. + reconcileN(t, r, ns, name, 2) + }, + }, + { + // aborted: user manually changes spec while experiment is running. + name: "aborted", + reach: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + // Fresh timestamps so timeout doesn't race abort. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + // Manual spec change — doesn't match any revision → abort. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("manual-change.example.com") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + }, + }, + } + + // --------------------------------------------------------------- + // Outcomes: how to drive the new experiment to its terminal state. + // --------------------------------------------------------------- + newOutcomes := []newOutcome{ + { + // stop: RC signals stop → operator rolls back → phase=rollback. + name: "stop", + action: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhaseStopped + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 2) + }, + expect: v2alpha1.ExperimentPhaseRollback, + }, + { + // promote: RC signals success → phase=promoted. + name: "promote", + action: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhasePromoted + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + }, + expect: v2alpha1.ExperimentPhasePromoted, + }, + { + // timeout: age the new experiment's revision past the deadline + // so the reconciler triggers a real timeout. + name: "timeout", + action: func(t *testing.T, r *Reconciler, ns, name string, uid types.UID, nsName types.NamespacedName, dda *v2alpha1.DatadogAgent) { + t.Helper() + r.options.ExperimentTimeout = 50 * time.Millisecond + // Only age unannotated revisions (the new experiment's revision). + // Annotated revisions belong to the old experiment and must not + // be touched — they're already handled by the fallback skip. + staleTime := metav1.NewTime(time.Now().Add(-time.Minute)) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Annotations[annotationExperimentRollback] != "true" && + rev.Annotations[annotationExperimentPromoted] != "true" { + rev.CreationTimestamp = staleTime + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + } + reconcileN(t, r, ns, name, 2) + }, + expect: v2alpha1.ExperimentPhaseTimeout, + }, + } + + // --------------------------------------------------------------- + // Test loop: 4 previous × 3 outcomes × 2 (fresh/stale) = 24 subtests. + // --------------------------------------------------------------- + for _, prev := range terminalStates { + for _, next := range newOutcomes { + for _, staleOldRevision := range []bool{false, true} { + suffix := "" + if staleOldRevision { + suffix = "/stale_old_revision" + } + testName := prev.name + "_then_" + next.name + suffix + t.Run(testName, func(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 5*time.Second) + + // -- Phase 1: set up first experiment and reach terminal state -- + + // Baseline spec (rev1). + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // First experiment spec (rev2). + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // Drive exp-1 to its terminal state (promoted/rollback/timeout/aborted). + prev.reach(t, r, ns, name, uid, nsName, dda) + + // -- Phase 2: (stale variant) age old revisions past timeout -- + + if staleOldRevision { + staleTime := metav1.NewTime(time.Now().Add(-10 * time.Minute)) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = staleTime + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + } + + // -- Phase 3: start new experiment (mimics daemon) -- + + // Daemon step 1: patch spec to the new experiment config. + r.options.ExperimentTimeout = 5 * time.Second + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.jp") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + // This reconcile runs while the old terminal phase is still set, + // so handleRollback is a no-op. manageRevision creates a revision + // for the new spec. + reconcileN(t, r, ns, name, 1) + + // Daemon step 2: set phase=running with new experiment ID. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-2", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Give the new experiment's revision a fresh timestamp. + // (Fake client doesn't set CreationTimestamp on create.) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + + // -- Phase 4: assert no false timeout -- + + reconcileN(t, r, ns, name, 1) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, mustGetExperimentPhase(t, r, ns, name), + "new experiment should be running — no false timeout") + + // -- Phase 5: drive new experiment to target outcome -- + + next.action(t, r, ns, name, uid, nsName, dda) + assert.Equal(t, next.expect, mustGetExperimentPhase(t, r, ns, name)) + }) + } + } + } +} + +// Test_Experiment_ReapplySameSpec_NoImmediateTimeout verifies the full +// annotation-based revision recreate flow end-to-end: +// +// 1. Baseline spec → experiment spec → timeout rollback. +// 2. Rollback annotates the experiment revision with experiment-rollback=true. +// 3. Re-apply the same experiment spec. +// 4. ensureRevision creates a new revision for the experiment spec (the content +// hash may differ from the original due to defaulting, but if it matches, +// the annotated revision is deleted+recreated with a fresh timestamp). +// 5. A subsequent reconcile with phase=running does NOT immediately timeout. +// +// NOTE: The fake client doesn't set CreationTimestamp on Create (unlike the +// real API server), so we manually patch timestamps to simulate fresh +// revisions after the re-apply reconcile. +func Test_Experiment_ReapplySameSpec_NoImmediateTimeout(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + // Use a short timeout for the initial experiment so it times out quickly, + // then switch to a long timeout for the re-applied experiment so we can + // assert it does NOT timeout within a single reconcile. + const shortTimeout = 50 * time.Millisecond + const longTimeout = 5 * time.Second + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, shortTimeout) + + // Step 1: Create baseline (rev1). + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Step 2: Apply experiment spec (rev2). + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // Step 3: Set phase=running and let it timeout. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + time.Sleep(2 * shortTimeout) + reconcileN(t, r, ns, name, 2) + + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site, "spec should be rolled back") + + // Verify the experiment revision is annotated as rolled back. + revs := listOwnedRevisions(t, r.client, ns, uid) + var annotatedCount int + for _, rev := range revs { + if rev.Annotations[annotationExperimentRollback] == "true" { + annotatedCount++ + } + } + assert.Equal(t, 1, annotatedCount, "exactly one revision should have the rollback annotation") + + // Switch to a long timeout so the re-applied experiment doesn't timeout + // within the reconcile's own execution time. + r.options.ExperimentTimeout = longTimeout + + // Step 4: Clear experiment status (simulating fleet daemon acknowledging the + // rollback) and re-apply the same experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = nil + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + + // This reconcile triggers ensureRevision which either: + // - matches the annotated revision (same content hash) → delete+recreate, or + // - creates a new revision (different hash due to defaulting differences). + // Either way, the current revision for this spec has no rollback annotation. + reconcileN(t, r, ns, name, 1) + + // Step 5: Set phase=running again. Patch all revision timestamps to now so + // the timeout check works correctly with the fake client (which doesn't set + // CreationTimestamp on Create like the real API server). + revs = listOwnedRevisions(t, r.client, ns, uid) + assert.Len(t, revs, 2) + for i := range revs { + revs[i].CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &revs[i])) + } + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-2", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Reconcile — should NOT timeout because the revision is fresh. + reconcileN(t, r, ns, name, 1) + + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, mustGetExperimentPhase(t, r, ns, name), + "experiment should still be running — no immediate timeout after reapply") +} diff --git a/internal/controller/datadogagent/experiment.go b/internal/controller/datadogagent/experiment.go index aa4a59c46c..4e7e581e17 100644 --- a/internal/controller/datadogagent/experiment.go +++ b/internal/controller/datadogagent/experiment.go @@ -17,6 +17,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" ) @@ -24,6 +25,19 @@ import ( // ExperimentDefaultTimeout is the duration after which a running experiment is automatically rolled back. const ExperimentDefaultTimeout = 15 * time.Minute +// annotationExperimentRollback marks a ControllerRevision whose experiment was +// rejected (rollback, timeout, or abort). The annotation tells handleRollback +// to skip the timeout check (the CreationTimestamp is stale from the previous +// experiment) and tells ensureRevision to delete+recreate the revision so it +// gets a fresh timestamp when the same spec is re-applied. +const annotationExperimentRollback = "operator.datadoghq.com/experiment-rollback" + +// annotationExperimentPromoted marks a ControllerRevision whose experiment was +// promoted. The annotation tells handleRollback to skip the timeout check so +// the stale CreationTimestamp doesn't cause a false timeout when a subsequent +// experiment starts before its own revision is created. +const annotationExperimentPromoted = "operator.datadoghq.com/experiment-promoted" + // manageExperiment handles all experiment state transitions for a reconcile cycle. // Must be called before manageRevision. func (r *Reconciler) manageExperiment( @@ -43,19 +57,28 @@ func (r *Reconciler) manageExperiment( if err := r.handleRollback(ctx, instance, newStatus, now, revList); err != nil { return err } - abortExperiment(ctx, instance.Generation, experiment, newStatus) + // Mark the highest revision when promoted so its stale timestamp doesn't + // cause a false timeout if a new experiment starts before manageRevision + // creates a fresh revision. + if experiment.Phase == v2alpha1.ExperimentPhasePromoted { + if rev := highestRevision(revList); rev != nil { + r.annotateRevision(ctx, rev, annotationExperimentPromoted) + } + } + r.abortExperiment(ctx, instance, experiment, newStatus, revList) return nil } // abortExperiment marks the experiment as aborted in newStatus if a manual spec -// change is detected (DDA generation differs from the recorded experiment generation). +// change is detected (current spec doesn't match any known ControllerRevision). // It is a no-op if handleRollback has already set a terminal phase (e.g. timeout), // preventing spurious abort logs and phase overwrites. -func abortExperiment( +func (r *Reconciler) abortExperiment( ctx context.Context, - instanceGeneration int64, + instance *v2alpha1.DatadogAgent, experiment *v2alpha1.ExperimentStatus, newStatus *v2alpha1.DatadogAgentStatus, + revisions []appsv1.ControllerRevision, ) { if experiment.Phase != v2alpha1.ExperimentPhaseRunning { return @@ -64,11 +87,31 @@ func abortExperiment( // handleRollback already determined a terminal phase (e.g. timeout); don't overwrite or log. return } - if experiment.Generation == 0 || instanceGeneration == experiment.Generation { + // On the first reconcile after experiment start, the new revision hasn't + // been created yet (manageExperiment runs before manageRevision). With + // only one revision (the pre-experiment baseline), the current spec won't + // match it — but that's expected, not a manual change. Skip the check + // when fewer than 2 revisions exist. + if len(revisions) < 2 { + return + } + if findMostRecentMatchingRevision(revisions, instance) != nil { + // Spec matches a known revision — no manual change detected. + // Edge case: if the user manually reverts to the pre-experiment spec, it + // matches the baseline revision, so abort does not fire. The experiment + // still terminates via timeout (the baseline revision's old timestamp + // exceeds the timeout threshold), and the rollback is a no-op because + // the spec already matches the target. The phase will read "timeout" + // rather than "aborted", but the end state is correct. return } ctrl.LoggerFrom(ctx).Info("Aborting experiment due to manual spec change") newStatus.Experiment.Phase = v2alpha1.ExperimentPhaseAborted + // Mark the experiment revision (highest-numbered) so its stale timestamp + // doesn't cause an immediate timeout if the same spec is re-applied. + if rev := highestRevision(revisions); rev != nil { + r.annotateRevision(ctx, rev, annotationExperimentRollback) + } } // handleRollback checks if the experiment needs rollback (explicit stop or timeout). @@ -93,6 +136,19 @@ func (r *Reconciler) handleRollback( return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseRollback) case phase == v2alpha1.ExperimentPhaseRunning: rev := findMostRecentMatchingRevision(revisions, instance) + if rev == nil && len(revisions) >= 2 { + // Spec was manually changed — no revision matches the current spec. + // Fall back to the highest-numbered revision (the experiment revision) + // so we can still detect timeout even after a manual spec change. + rev = highestRevision(revisions) + // Only skip for annotations in the fallback path: the spec doesn't + // match any revision (e.g. new experiment before manageRevision runs). + // Annotations on fallback revisions indicate a prior experiment's + // terminal state — their stale timestamps must not cause false timeouts. + if rev != nil && (rev.Annotations[annotationExperimentRollback] == "true" || rev.Annotations[annotationExperimentPromoted] == "true") { + break + } + } if rev != nil { elapsed := now.Sub(rev.CreationTimestamp.Time) if elapsed >= getExperimentTimeout(r.options.ExperimentTimeout) { @@ -106,7 +162,10 @@ func (r *Reconciler) handleRollback( } // restorePreviousSpec restores the DDA spec from the previous ControllerRevision -// and, on success, sets the terminal experiment phase. +// and, on success, sets the terminal experiment phase. It also marks the +// experiment revision (the highest-numbered, non-rollback-target revision) with +// the rollback annotation so its stale CreationTimestamp doesn't cause an +// immediate timeout if the same spec is re-applied later. func (r *Reconciler) restorePreviousSpec( ctx context.Context, instanceMeta metav1.ObjectMeta, @@ -114,10 +173,20 @@ func (r *Reconciler) restorePreviousSpec( revisions []appsv1.ControllerRevision, terminalPhase v2alpha1.ExperimentPhase, ) error { - if err := r.rollback(ctx, instanceMeta, findRollbackTarget(revisions)); err != nil { + rollbackTarget := findRollbackTarget(revisions) + if err := r.rollback(ctx, instanceMeta, rollbackTarget); err != nil { return err } newStatus.Experiment.Phase = terminalPhase + // Mark the experiment revision (highest-numbered) so its stale timestamp + // doesn't cause an immediate timeout if the same spec is re-applied. + // Only annotate the highest revision rather than all non-rollback-target + // revisions: if GC failed on a prior reconcile there may be 3+ revisions, + // and annotating old baselines would cause needless delete+recreate in + // ensureRevision if those specs are ever re-applied. + if rev := highestRevision(revisions); rev != nil && rev.Name != rollbackTarget { + r.annotateRevision(ctx, rev, annotationExperimentRollback) + } return nil } @@ -128,7 +197,8 @@ func (r *Reconciler) rollback( rollbackTarget string, ) error { if rollbackTarget == "" { - return fmt.Errorf("no previous revision to roll back to") + ctrl.LoggerFrom(ctx).Info("No previous revision to roll back to, skipping spec restore") + return nil } cr := &appsv1.ControllerRevision{} @@ -159,6 +229,9 @@ func (r *Reconciler) rollback( // Merge snapshot annotations (Datadog-only keys) on top of current annotations // so that non-Datadog annotations (user metadata, tooling labels, etc.) are preserved. merged := maps.Clone(current.Annotations) + if merged == nil { + merged = make(map[string]string, len(snapshot.Annotations)) + } maps.Copy(merged, snapshot.Annotations) toUpdate := &v2alpha1.DatadogAgent{ @@ -217,6 +290,37 @@ func findMostRecentMatchingRevision(revisions []appsv1.ControllerRevision, insta return result } +// highestRevision returns the revision with the largest Revision number. +func highestRevision(revisions []appsv1.ControllerRevision) *appsv1.ControllerRevision { + var result *appsv1.ControllerRevision + for i := range revisions { + if result == nil || revisions[i].Revision > result.Revision { + result = &revisions[i] + } + } + return result +} + +// annotateRevision sets the given annotation on a ControllerRevision. +// This is best-effort: if the patch fails, the stale timestamp remains +// but is no worse than the existing behavior. +func (r *Reconciler) annotateRevision(ctx context.Context, rev *appsv1.ControllerRevision, annotation string) { + if rev.Annotations[annotation] == "true" { + return // already annotated + } + logger := ctrl.LoggerFrom(ctx).WithValues( + "object.kind", "ControllerRevision", + "object.namespace", rev.Namespace, + "object.name", rev.Name, + ) + patch := []byte(`{"metadata":{"annotations":{"` + annotation + `":"true"}}}`) + if err := r.client.Patch(ctx, rev, client.RawPatch(types.MergePatchType, patch)); err != nil { + logger.Error(err, "Failed to annotate experiment revision", "annotation", annotation) + return + } + logger.Info("Annotated experiment revision", "annotation", annotation) +} + func getExperimentTimeout(timeout time.Duration) time.Duration { if timeout == 0 { return ExperimentDefaultTimeout diff --git a/internal/controller/datadogagent/experiment_test.go b/internal/controller/datadogagent/experiment_test.go index be255f20b8..541d8ee12f 100644 --- a/internal/controller/datadogagent/experiment_test.go +++ b/internal/controller/datadogagent/experiment_test.go @@ -21,24 +21,78 @@ import ( func TestManageExperiment_AbortsOnManualChange(t *testing.T) { r, _ := newRevisionTestReconciler(t) - instance := newRevisionTestOwner("test-dda", "default") - instance.Generation = 3 - instance.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - instance.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: 2, + // Create two revisions: pre-experiment (specA) and experiment (specB). + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + + // Simulate a manual spec change: specC doesn't match any revision. + instanceC := newRevisionTestOwner("test-dda", "default") + manualSite := "manual-change.example.com" + instanceC.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &manualSite}} + instanceC.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + } + + // Set recent timestamps so the timeout path in handleRollback does not fire. + revList := mustListRevisions(t, r, instanceC) + for i := range revList { + revList[i].CreationTimestamp = metav1.Now() } status := &v2alpha1.DatadogAgentStatus{ - Experiment: instance.Status.Experiment.DeepCopy(), + Experiment: instanceC.Status.Experiment.DeepCopy(), } - err := r.manageExperiment(context.Background(), instance, status, metav1.Now(), mustListRevisions(t, r, instance)) + err := r.manageExperiment(context.Background(), instanceC, status, metav1.Now(), revList) require.NoError(t, err) require.NotNil(t, status.Experiment) assert.Equal(t, v2alpha1.ExperimentPhaseAborted, status.Experiment.Phase) } +// TestManageExperiment_ManualRevertToBaselineTerminatesViaTimeout verifies that +// when the user manually reverts the spec to the pre-experiment value during a +// running experiment, the experiment terminates via timeout rather than abort. +// The revision-based abort check sees the spec matching the baseline revision +// and treats it as a known state. The timeout path fires because the baseline +// revision's old timestamp exceeds the threshold. The rollback is a no-op +// (spec already matches target), and the phase is set to "timeout". +func TestManageExperiment_ManualRevertToBaselineTerminatesViaTimeout(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Rev1: pre-experiment spec (specA). + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + // Rev2: experiment spec (specB). + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + require.NoError(t, c.Create(context.Background(), instanceA)) + + // User manually reverts to specA. The spec matches rev1 (the baseline), + // so abortExperiment won't fire. Instead, handleRollback detects timeout + // from rev1's old CreationTimestamp. + instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + } + + revList := mustListRevisions(t, r, instanceA) + // Ensure the baseline revision's timestamp is old enough to trigger timeout. + for i := range revList { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Minute)) + } + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceA.Status.Experiment.DeepCopy()} + require.NoError(t, r.manageExperiment(context.Background(), instanceA, newStatus, metav1.Now(), revList)) + // Abort does not fire — spec matches a known revision. Timeout fires instead + // because the matching revision's timestamp exceeds the threshold. + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, newStatus.Experiment.Phase) +} + func TestRollback_RestoresSpec(t *testing.T) { r, c := newRevisionTestReconciler(t) @@ -69,8 +123,7 @@ func TestRollback_NoPreviousRevision(t *testing.T) { instance := newRevisionTestOwner("test-dda", "default") err := r.rollback(context.Background(), instance.ObjectMeta, "") - require.Error(t, err) - assert.Contains(t, err.Error(), "no previous revision") + require.NoError(t, err) } func TestHandleRollback_StoppedPhase(t *testing.T) { @@ -172,14 +225,14 @@ func TestManageExperiment_TimeoutWinsOverSpuriousAbort(t *testing.T) { require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) require.NoError(t, c.Create(context.Background(), instanceB)) - // Simulate a post-409 reconcile: phase=running, but generation was bumped by the - // rollback spec update (instanceB.Generation != experiment.Generation), AND timeout - // has elapsed. abortExperiment would fire for the generation mismatch, but - // handleRollback must win and persist phase=timeout. - instanceB.Generation = 99 + // Simulate a post-409 reconcile: phase=running, the spec was manually changed + // so it doesn't match any revision, AND timeout has elapsed. abortExperiment + // would fire for the revision mismatch, but handleRollback must win and + // persist phase=timeout. + manualSite := "manual-change.example.com" + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &manualSite}} instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: 1, + Phase: v2alpha1.ExperimentPhaseRunning, } revList := mustListRevisions(t, r, instanceB) @@ -236,11 +289,8 @@ func TestHandleRollback_PostRollbackSetsTimeout(t *testing.T) { // The DDA in the cluster already has the rolled-back spec (instanceA's spec), // as if reconcile-1 restored it but its status write 409'd. - // Generation is set to a realistic non-zero value: RC would have recorded - // instanceA.Generation when the experiment started. instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{ - Phase: v2alpha1.ExperimentPhaseRunning, - Generation: instanceA.Generation, + Phase: v2alpha1.ExperimentPhaseRunning, } require.NoError(t, c.Create(context.Background(), instanceA)) @@ -260,16 +310,13 @@ func TestHandleRollback_PostRollbackSetsTimeout(t *testing.T) { // TestReapplySameSpecAfterRollback_NoImmediateTimeout is the end-to-end // regression test for the stale-revision bug. // -// Without the fix: gcOldRevisions kept the experiment revision (rev2/B) after -// rollback. When RC later re-applied spec B as a new experiment and set -// phase=Running, findMostRecentMatchingRevision found the stale rev2 — its -// CreationTimestamp predated the current experiment — so elapsed >= timeout -// fired immediately and the operator rolled back again, making the re-apply -// appear to have no effect. +// Without the fix: the stale experiment revision's old CreationTimestamp caused +// an immediate timeout when the same spec was re-applied as a new experiment. // -// With the fix: gcOldRevisions deletes rev2 once phase=Rollback is persisted. -// Re-applying spec B creates a fresh revision with a current timestamp, and -// the timeout clock starts correctly. +// With the fix: restorePreviousSpec annotates the experiment revision with the +// rollback annotation. handleRollback skips the timeout check for annotated +// revisions. ensureRevision deletes+recreates the annotated revision with a +// fresh timestamp when the spec is re-applied. func TestReapplySameSpecAfterRollback_NoImmediateTimeout(t *testing.T) { r, c := newRevisionTestReconciler(t) @@ -292,37 +339,160 @@ func TestReapplySameSpecAfterRollback_NoImmediateTimeout(t *testing.T) { } // Rollback: RC sets phase=Stopped; operator restores spec A. + // restorePreviousSpec annotates the experiment revision (B) as rolled back. instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped} rollbackStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} require.NoError(t, r.handleRollback(context.Background(), instanceB, rollbackStatus, metav1.Now(), revList)) require.Equal(t, v2alpha1.ExperimentPhaseRollback, rollbackStatus.Experiment.Phase) - // Next reconcile: Rollback phase is now persisted in instance.Status. - // gcOldRevisions must delete the stale rev2 (B). - instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRollback} - rollbackNewStatus := &v2alpha1.DatadogAgentStatus{Experiment: &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRollback}} - require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), rollbackNewStatus)) - + // Verify the experiment revision was annotated. remaining := mustListRevisions(t, r, instanceA) - require.Len(t, remaining, 1, "stale experiment revision (spec B) must be deleted once Rollback phase is persisted") + require.Len(t, remaining, 2, "both revisions should be kept (no aggressive GC)") + var annotatedCount int + for _, rev := range remaining { + if rev.Annotations[annotationExperimentRollback] == "true" { + annotatedCount++ + } + } + assert.Equal(t, 1, annotatedCount, "exactly one revision should have the rollback annotation") - // RC re-applies spec B as a new experiment (sets spec=B, phase=Running). - // In the real reconcile loop manageExperiment (handleRollback) runs before - // manageRevision, so no revision for spec B exists at the time of the check. - // findMostRecentMatchingRevision returns nil → timeout check is skipped. + // RC re-applies spec B as a new experiment. + // In the real flow, the daemon patches the spec first, then a reconcile runs + // (with no experiment phase set) where ensureRevision recreates the annotated + // revision with a fresh timestamp. Only then does the daemon set phase=Running + // and the next reconcile calls handleRollback. instanceB2 := newRevisionTestOwner("test-dda", "default") instanceB2.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - instanceB2.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} - revListBeforeNewRevision := mustListRevisions(t, r, instanceB2) - require.Len(t, revListBeforeNewRevision, 1, "only rev1 (A) should exist before the new experiment's revision is created") + // Step 1: ensureRevision recreates the annotated revision (fresh, no annotation). + _, err := r.ensureRevision(context.Background(), instanceB2, mustListRevisions(t, r, instanceB2), false) + require.NoError(t, err) + + finalRevs := mustListRevisions(t, r, instanceB2) + for _, rev := range finalRevs { + assert.Empty(t, rev.Annotations[annotationExperimentRollback], + "rollback annotation should be cleared after recreate") + } + + // Fake client doesn't set CreationTimestamp on Create, so patch all + // revision timestamps to now to simulate fresh revisions. + for i := range finalRevs { + finalRevs[i].CreationTimestamp = metav1.Now() + require.NoError(t, c.Update(context.Background(), &finalRevs[i])) + } + // Step 2: daemon sets phase=Running, next reconcile calls handleRollback. + instanceB2.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} + revListForNewExp := mustListRevisions(t, r, instanceB2) newStatus2 := &v2alpha1.DatadogAgentStatus{Experiment: instanceB2.Status.Experiment.DeepCopy()} - require.NoError(t, r.handleRollback(context.Background(), instanceB2, newStatus2, metav1.Now(), revListBeforeNewRevision)) + require.NoError(t, r.handleRollback(context.Background(), instanceB2, newStatus2, metav1.Now(), revListForNewExp)) assert.Equal(t, v2alpha1.ExperimentPhaseRunning, newStatus2.Experiment.Phase, "re-applying the same spec after rollback must not immediately time out") } +// TestRestorePreviousSpec_ThreeRevisions_AnnotatesOnlyHighest verifies that +// when 3+ revisions exist (e.g. GC failed on a prior reconcile), only the +// highest-numbered revision (the experiment) is annotated — not older baselines. +func TestRestorePreviousSpec_ThreeRevisions_AnnotatesOnlyHighest(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Build 3 revisions using ensureRevision directly (bypasses GC). + instanceA := newRevisionTestOwner("test-dda", "default") + rev1Name, err := r.ensureRevision(context.Background(), instanceA, nil, false) + require.NoError(t, err) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + rev2Name, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + experimentSite := "datadoghq.eu" + instanceC := newRevisionTestOwner("test-dda", "default") + instanceC.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &experimentSite}} + rev3Name, err := r.ensureRevision(context.Background(), instanceC, mustListRevisions(t, r, instanceC), false) + require.NoError(t, err) + + revList := mustListRevisions(t, r, instanceA) + require.Len(t, revList, 3, "need 3 revisions to test this scenario") + + // rollback fetches the current DDA; create it with the experiment spec. + require.NoError(t, c.Create(context.Background(), instanceC)) + + // Trigger rollback via phase=stopped. + instanceC.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped} + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceC.Status.Experiment.DeepCopy()} + require.NoError(t, r.restorePreviousSpec(context.Background(), instanceC.ObjectMeta, newStatus, revList, v2alpha1.ExperimentPhaseRollback)) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase) + + // Verify: only rev3 (experiment, highest) is annotated. + // Rev1 (old baseline) and rev2 (rollback target) must NOT be annotated. + for _, rev := range mustListRevisions(t, r, instanceA) { + hasAnnotation := rev.Annotations[annotationExperimentRollback] == "true" + switch rev.Name { + case rev3Name: + assert.True(t, hasAnnotation, "rev3 (experiment, highest) should be annotated") + case rev2Name: + assert.False(t, hasAnnotation, "rev2 (rollback target) should NOT be annotated") + case rev1Name: + assert.False(t, hasAnnotation, "rev1 (old baseline) should NOT be annotated") + } + } +} + +// TestAbortExperiment_ThreeRevisions_AnnotatesOnlyHighest verifies that when +// 3+ revisions exist and abort fires, only the highest-numbered revision (the +// experiment) is annotated — not older baselines. +func TestAbortExperiment_ThreeRevisions_AnnotatesOnlyHighest(t *testing.T) { + r, _ := newRevisionTestReconciler(t) + + // Build 3 revisions using ensureRevision directly (bypasses GC). + instanceA := newRevisionTestOwner("test-dda", "default") + rev1Name, err := r.ensureRevision(context.Background(), instanceA, nil, false) + require.NoError(t, err) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + rev2Name, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + experimentSite := "datadoghq.eu" + instanceC := newRevisionTestOwner("test-dda", "default") + instanceC.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &experimentSite}} + rev3Name, err := r.ensureRevision(context.Background(), instanceC, mustListRevisions(t, r, instanceC), false) + require.NoError(t, err) + + revList := mustListRevisions(t, r, instanceA) + require.Len(t, revList, 3) + + // Set recent timestamps so timeout doesn't fire first. + for i := range revList { + revList[i].CreationTimestamp = metav1.Now() + } + + // Simulate manual spec change (specD) — doesn't match any revision. + manualSite := "manual-change.example.com" + instanceD := newRevisionTestOwner("test-dda", "default") + instanceD.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: &manualSite}} + instanceD.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceD.Status.Experiment.DeepCopy()} + r.abortExperiment(context.Background(), instanceD, instanceD.Status.Experiment, newStatus, revList) + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, newStatus.Experiment.Phase) + + // Verify: only rev3 (experiment, highest) is annotated. + for _, rev := range mustListRevisions(t, r, instanceA) { + hasAnnotation := rev.Annotations[annotationExperimentRollback] == "true" + switch rev.Name { + case rev3Name: + assert.True(t, hasAnnotation, "rev3 (experiment, highest) should be annotated") + case rev2Name: + assert.False(t, hasAnnotation, "rev2 should NOT be annotated") + case rev1Name: + assert.False(t, hasAnnotation, "rev1 (old baseline) should NOT be annotated") + } + } +} + func TestHandleRollback_Timeout(t *testing.T) { r, c := newRevisionTestReconciler(t) diff --git a/internal/controller/datadogagent/revision.go b/internal/controller/datadogagent/revision.go index 1ec5abe75c..fbd02bd9ad 100644 --- a/internal/controller/datadogagent/revision.go +++ b/internal/controller/datadogagent/revision.go @@ -15,6 +15,7 @@ import ( appsv1 "k8s.io/api/apps/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -51,7 +52,7 @@ func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.Data if err != nil { return err } - if err := r.gcOldRevisions(ctx, instance, revName, revList); err != nil { + if err := r.gcOldRevisions(ctx, revName, revList); err != nil { ctrl.LoggerFrom(ctx).Error(err, "Failed to garbage collect old ControllerRevisions, will retry on next reconcile") } return nil @@ -107,6 +108,11 @@ func (r *Reconciler) ensureRevision( return "", fmt.Errorf("failed to get GVK for owner: %w", err) } + data := runtime.RawExtension{Raw: specBytes} + labels := map[string]string{ + apicommon.DatadogAgentNameLabelKey: instance.GetName(), + } + // Find any existing revision with identical data, and track the max Revision. var matchingRev *appsv1.ControllerRevision maxRevision := int64(0) @@ -121,6 +127,16 @@ func (r *Reconciler) ensureRevision( } if matchingRev != nil { + objLogger := logger.WithValues( + "object.kind", "ControllerRevision", + "object.namespace", matchingRev.Namespace, + "object.name", matchingRev.Name, + ) + + if matchingRev.Annotations[annotationExperimentRollback] == "true" && !skipBump { + return r.recreateRevision(ctx, matchingRev, instance, gvks[0], labels, data, maxRevision) + } + // Identical content already snapshotted. Bump Revision to max+1 if it // has been superseded (e.g. after a revert) so ordering stays correct. // Skip the bump during experiment rollback: bumping the pre-experiment @@ -128,11 +144,6 @@ func (r *Reconciler) ensureRevision( // to select the experiment revision as the rollback target on the next // stopped signal, reversing the rollback. if matchingRev.Revision < maxRevision && !skipBump { - objLogger := logger.WithValues( - "object.kind", "ControllerRevision", - "object.namespace", matchingRev.Namespace, - "object.name", matchingRev.Name, - ) objLogger.Info("Bumping ControllerRevision to latest") patch := fmt.Appendf(nil, `{"revision":%d}`, maxRevision+1) if err := r.client.Patch(ctx, matchingRev, client.RawPatch(types.MergePatchType, patch)); err != nil && !apierrors.IsConflict(err) { @@ -143,10 +154,6 @@ func (r *Reconciler) ensureRevision( } nextRevision := maxRevision + 1 - data := runtime.RawExtension{Raw: specBytes} - labels := map[string]string{ - apicommon.DatadogAgentNameLabelKey: instance.GetName(), - } rev := controllerrevisions.NewControllerRevision(instance, gvks[0], labels, data, nextRevision, nil) // Check for a name conflict before creating. @@ -179,6 +186,46 @@ func (r *Reconciler) ensureRevision( return rev.Name, nil } +// recreateRevision deletes a rolled-back ControllerRevision and creates a +// fresh one with the same content but a new CreationTimestamp. This prevents +// an immediate timeout when the same experiment spec is re-applied, since +// CreationTimestamp is immutable in Kubernetes. +// +// Failure recovery: +// - Delete fails: error returned, next reconcile retries. +// - Delete succeeds, Create fails (or operator crashes): the revision is +// gone, so the next reconcile's ensureRevision takes the normal "no +// matching revision" path and creates a fresh one. +func (r *Reconciler) recreateRevision( + ctx context.Context, + old *appsv1.ControllerRevision, + instance *v2alpha1.DatadogAgent, + gvk schema.GroupVersionKind, + labels map[string]string, + data runtime.RawExtension, + maxRevision int64, +) (string, error) { + logger := ctrl.LoggerFrom(ctx).WithValues( + "object.kind", "ControllerRevision", + "object.namespace", old.Namespace, + "object.name", old.Name, + ) + logger.Info("Recreating rolled-back ControllerRevision with fresh timestamp") + + if err := r.client.Delete(ctx, old); err != nil && !apierrors.IsNotFound(err) { + return "", fmt.Errorf("failed to delete rolled-back ControllerRevision %s: %w", old.Name, err) + } + + fresh := controllerrevisions.NewControllerRevision(instance, gvk, labels, data, maxRevision+1, nil) + if err := r.client.Create(ctx, fresh); err != nil { + if apierrors.IsAlreadyExists(err) { + return fresh.Name, nil + } + return "", fmt.Errorf("failed to recreate ControllerRevision %s: %w", fresh.Name, err) + } + return fresh.Name, nil +} + // datadogAnnotations returns a copy of annotations filtered to only those // with `.datadoghq.com/` in the key, which are used for preview features. func datadogAnnotations(all map[string]string) map[string]string { @@ -195,41 +242,27 @@ func datadogAnnotations(all map[string]string) map[string]string { } // gcOldRevisions deletes all but the two most recent ControllerRevisions: -// the current and previous. -// -// Exception: after a rejected experiment (rollback, timeout, or abort) the -// experiment revision carries a stale CreationTimestamp. If that revision were -// kept and the same spec were re-applied as a new experiment, -// findMostRecentMatchingRevision would find it, compute an elapsed time based -// on the old timestamp, and immediately time out. To prevent this, all -// non-current revisions are deleted once the rejected phase is persisted in -// instance.Status. The check uses the persisted status (not the in-flight -// newStatus) so that the revision survives any status-write conflicts during -// the rollback itself — the deletion is intentionally deferred to the next -// reconcile cycle after the terminal phase is confirmed. +// the current and previous. Stale experiment revisions (marked with the +// rollback annotation) are kept here — they are handled by ensureRevision +// which recreates them with a fresh timestamp when the same spec is re-applied. func (r *Reconciler) gcOldRevisions( ctx context.Context, - instance *v2alpha1.DatadogAgent, current string, revList []appsv1.ControllerRevision, ) error { logger := ctrl.LoggerFrom(ctx) // Identify the most recent non-current revision to keep as previous. - // Skip this when the persisted experiment phase is a rejected terminal - // phase — in that case we want to delete the stale experiment revision. previous := "" - if !isRejectedExperimentPhase(instance) { - previousRevision := int64(-1) - for i := range revList { - rev := &revList[i] - if rev.Name == current { - continue - } - if rev.Revision > previousRevision { - previousRevision = rev.Revision - previous = rev.Name - } + previousRevision := int64(-1) + for i := range revList { + rev := &revList[i] + if rev.Name == current { + continue + } + if rev.Revision > previousRevision { + previousRevision = rev.Revision + previous = rev.Name } } @@ -251,18 +284,3 @@ func (r *Reconciler) gcOldRevisions( return nil } - -// isRejectedExperimentPhase returns true when the persisted experiment phase -// indicates the experiment spec was rejected — rolled back, timed out, or -// aborted. These are the cases where keeping the experiment revision would -// cause an immediate timeout if the same spec were re-applied. -func isRejectedExperimentPhase(instance *v2alpha1.DatadogAgent) bool { - if instance.Status.Experiment == nil { - return false - } - switch instance.Status.Experiment.Phase { - case v2alpha1.ExperimentPhaseRollback, v2alpha1.ExperimentPhaseTimeout, v2alpha1.ExperimentPhaseAborted: - return true - } - return false -} diff --git a/internal/controller/datadogagent/revision_test.go b/internal/controller/datadogagent/revision_test.go index 2689f6db70..2c6140edf9 100644 --- a/internal/controller/datadogagent/revision_test.go +++ b/internal/controller/datadogagent/revision_test.go @@ -174,7 +174,7 @@ func TestGCOldRevisions_KeepsCurrentAndPrevious(t *testing.T) { names[i] = name } - err := r.gcOldRevisions(context.Background(), instances[2], names[2], mustListRevisions(t, r, instances[2])) + err := r.gcOldRevisions(context.Background(), names[2], mustListRevisions(t, r, instances[2])) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -223,7 +223,7 @@ func TestGCOldRevisions_KeepsTwoRevisions(t *testing.T) { name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) - err = r.gcOldRevisions(context.Background(), instanceB, name2, mustListRevisions(t, r, instanceB)) + err = r.gcOldRevisions(context.Background(), name2, mustListRevisions(t, r, instanceB)) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -268,7 +268,7 @@ func TestGCOldRevisions_NoPreviousWhenOnlyCurrent(t *testing.T) { revName, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) - err = r.gcOldRevisions(context.Background(), instance, revName, mustListRevisions(t, r, instance)) + err = r.gcOldRevisions(context.Background(), revName, mustListRevisions(t, r, instance)) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -291,7 +291,7 @@ func TestGCOldRevisions_DeletesMultipleOld(t *testing.T) { } current := newRevisionTestOwner("test-dda", "default") - err := r.gcOldRevisions(context.Background(), current, names[4], mustListRevisions(t, r, current)) + err := r.gcOldRevisions(context.Background(), names[4], mustListRevisions(t, r, current)) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -408,81 +408,89 @@ func TestManageRevision_PreviousDeletedContinuesNormally(t *testing.T) { assert.Equal(t, int64(2), revList[0].Revision) } -// TestGCOldRevisions_DeletesPreviousAfterRejectedExperiment verifies that when -// the persisted experiment phase is a rejected terminal phase (Rollback, Timeout, -// or Aborted), gcOldRevisions deletes the stale experiment revision instead of -// keeping it as "previous". This prevents an immediate timeout if the same spec -// is re-applied as a new experiment. -func TestGCOldRevisions_DeletesPreviousAfterRejectedExperiment(t *testing.T) { - rejectedPhases := []v2alpha1.ExperimentPhase{ - v2alpha1.ExperimentPhaseRollback, - v2alpha1.ExperimentPhaseTimeout, - v2alpha1.ExperimentPhaseAborted, - } +// TestEnsureRevision_RecreatesRolledBackRevision verifies that when a revision +// has the experiment-rollback annotation, ensureRevision deletes and recreates +// it to get a fresh CreationTimestamp. This prevents an immediate timeout when +// the same experiment spec is re-applied after a previous experiment was rejected. +func TestEnsureRevision_RecreatesRolledBackRevision(t *testing.T) { + r, c := newRevisionTestReconciler(t) - for _, phase := range rejectedPhases { - t.Run(string(phase), func(t *testing.T) { - r, c := newRevisionTestReconciler(t) + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - instanceA := newRevisionTestOwner("test-dda", "default") - instanceB := newRevisionTestOwner("test-dda", "default") - instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) - nameA, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) - require.NoError(t, err) - _, err = r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) - require.NoError(t, err) + // Annotate revB as rolled back (simulating what restorePreviousSpec does). + revB := fetchRevisionByName(t, c, "default", nameB) + revB.Annotations = map[string]string{annotationExperimentRollback: "true"} + require.NoError(t, c.Update(context.Background(), revB)) + + // Re-apply the same experiment spec (instanceB). ensureRevision should + // delete+recreate the annotated revision with a fresh timestamp. + // (In real Kubernetes the CreationTimestamp is set server-side on create; + // the fake client doesn't refresh it, so we verify the annotation and + // revision number instead.) + nameB2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + assert.Equal(t, nameB, nameB2, "should reuse same name (same data hash)") - // Simulate the persisted status having a rejected terminal phase. - // current is instanceA (spec restored after rollback). - instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase} + revB2 := fetchRevisionByName(t, c, "default", nameB2) + assert.Empty(t, revB2.Annotations[annotationExperimentRollback], "rollback annotation should be cleared") + assert.Equal(t, int64(3), revB2.Revision, "revision number should be max+1") +} - err = r.gcOldRevisions(context.Background(), instanceA, nameA, mustListRevisions(t, r, instanceA)) - require.NoError(t, err) +// TestEnsureRevision_SkipsRecreateWhenSkipBump verifies that rolled-back +// revisions are NOT recreated when skipBump is true (during experiment rollback). +func TestEnsureRevision_SkipsRecreateWhenSkipBump(t *testing.T) { + r, c := newRevisionTestReconciler(t) - revList := &appsv1.ControllerRevisionList{} - require.NoError(t, c.List(context.Background(), revList)) - assert.Len(t, revList.Items, 1, "experiment revision should be deleted after rejected phase") - assert.Equal(t, nameA, revList.Items[0].Name, "only the current (pre-experiment) revision should remain") - }) - } -} + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} -// TestGCOldRevisions_KeepsPreviousForNonRejectedPhases verifies that for -// non-rejected experiment phases (Running, Promoted, or nil), the normal -// "keep current + previous" behavior is preserved. -func TestGCOldRevisions_KeepsPreviousForNonRejectedPhases(t *testing.T) { - nonRejectedPhases := []v2alpha1.ExperimentPhase{ - v2alpha1.ExperimentPhaseRunning, - v2alpha1.ExperimentPhasePromoted, - "", // nil experiment - } + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) - for _, phase := range nonRejectedPhases { - t.Run(string(phase)+"_or_nil", func(t *testing.T) { - r, c := newRevisionTestReconciler(t) + // Annotate revB as rolled back. + revB := fetchRevisionByName(t, c, "default", nameB) + revB.Annotations = map[string]string{annotationExperimentRollback: "true"} + require.NoError(t, c.Update(context.Background(), revB)) - instanceA := newRevisionTestOwner("test-dda", "default") - instanceB := newRevisionTestOwner("test-dda", "default") - instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + // With skipBump=true, the annotated revision should be returned as-is. + nameB2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), true) + require.NoError(t, err) + assert.Equal(t, nameB, nameB2) - _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) - require.NoError(t, err) - nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) - require.NoError(t, err) + revB2 := fetchRevisionByName(t, c, "default", nameB2) + assert.Equal(t, "true", revB2.Annotations[annotationExperimentRollback], "annotation should still be present") +} - if phase != "" { - instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase} - } +// TestGCOldRevisions_AlwaysKeepsPrevious verifies that GC always keeps the +// current and previous revisions regardless of experiment phase. +func TestGCOldRevisions_AlwaysKeepsPrevious(t *testing.T) { + r, c := newRevisionTestReconciler(t) - err = r.gcOldRevisions(context.Background(), instanceB, nameB, mustListRevisions(t, r, instanceB)) - require.NoError(t, err) + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - revList := &appsv1.ControllerRevisionList{} - require.NoError(t, c.List(context.Background(), revList)) - assert.Len(t, revList.Items, 2, "current and previous should both be kept for non-rejected phase") - }) - } + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + err = r.gcOldRevisions(context.Background(), nameB, mustListRevisions(t, r, instanceB)) + require.NoError(t, err) + + revList := &appsv1.ControllerRevisionList{} + require.NoError(t, c.List(context.Background(), revList)) + assert.Len(t, revList.Items, 2, "current and previous should both be kept") } // TestListRevisions_ExcludesForeignOwner verifies that a ControllerRevision diff --git a/pkg/fleet/daemon.go b/pkg/fleet/daemon.go index 706e4aeb15..d3a1e20c51 100644 --- a/pkg/fleet/daemon.go +++ b/pkg/fleet/daemon.go @@ -7,16 +7,29 @@ package fleet import ( "context" + "errors" "fmt" "sync" + pbgo "github.com/DataDog/datadog-agent/pkg/proto/pbgo/core" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" - "github.com/go-logr/logr" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/manager" + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" "github.com/DataDog/datadog-operator/pkg/remoteconfig" ) +// stateDoesntMatchError is returned by verifyExpectedState when the task's expected +// state doesn't match the operator's current installer state. +type stateDoesntMatchError struct { + msg string +} + +func (e *stateDoesntMatchError) Error() string { return e.msg } + const ( methodStartDatadogAgentExperiment = "operator/start_datadogagent_experiment" methodStopDatadogAgentExperiment = "operator/stop_datadogagent_experiment" @@ -29,31 +42,54 @@ var _ manager.LeaderElectionRunnable = &Daemon{} // Daemon subscribes to fleet-specific RC products (installer configs and tasks) // and runs after leader election as a controller-runtime Runnable. type Daemon struct { - logger logr.Logger - rcClient remoteconfig.RCClient - mu sync.RWMutex - configs map[string]installerConfig // keyed by config ID; replaced on each RC update + rcClient remoteconfig.RCClient + client client.Client + revisionsEnabled bool + mu sync.RWMutex + configs map[string]installerConfig // keyed by config ID; replaced on each RC update + experimentTarget types.NamespacedName // DDA targeted by the current experiment; set on startExperiment } -// NewDaemon creates a new Fleet Daemon. -func NewDaemon(logger logr.Logger, rcClient remoteconfig.RCClient) *Daemon { +// NewDaemon creates a new Fleet Daemon. When revisionsEnabled is false, experiment +// signals are rejected because the reconciler cannot process them without the +// ControllerRevision machinery. +func NewDaemon(rcClient remoteconfig.RCClient, k8sClient client.Client, revisionsEnabled bool) *Daemon { return &Daemon{ - logger: logger, - rcClient: rcClient, - configs: make(map[string]installerConfig), + rcClient: rcClient, + client: k8sClient, + revisionsEnabled: revisionsEnabled, + configs: make(map[string]installerConfig), } } // Start implements manager.Runnable. It subscribes to fleet RC products and // blocks until ctx is cancelled. func (d *Daemon) Start(ctx context.Context) error { - d.logger.Info("Starting Fleet daemon") + logger := ctrl.LoggerFrom(ctx).WithName("fleet-daemon").WithValues("kind", "DatadogAgent") + ctx = ctrl.LoggerInto(ctx, logger) + logger.Info("Starting Fleet daemon") - d.rcClient.Subscribe(state.ProductUpdaterAgent, handleInstallerConfigUpdate(d.handleConfigs)) - d.rcClient.Subscribe(state.ProductUpdaterTask, handleUpdaterTaskUpdate(d.handleRemoteAPIRequest)) + d.rcClient.Subscribe(state.ProductInstallerConfig, handleInstallerConfigUpdate(ctx, func(configs map[string]installerConfig) error { + return d.handleConfigs(ctx, configs) + })) + d.rcClient.Subscribe(state.ProductUpdaterTask, handleUpdaterTaskUpdate(ctx, func(req remoteAPIRequest) error { + d.setTaskState(req.Package, req.ID, pbgo.TaskState_RUNNING, nil) + err := d.handleRemoteAPIRequest(ctx, req) + if err != nil { + var stateErr *stateDoesntMatchError + if errors.As(err, &stateErr) { + d.setTaskState(req.Package, req.ID, pbgo.TaskState_INVALID_STATE, err) + } else { + d.setTaskState(req.Package, req.ID, pbgo.TaskState_ERROR, err) + } + } else { + d.setTaskState(req.Package, req.ID, pbgo.TaskState_DONE, nil) + } + return err + })) <-ctx.Done() - d.logger.Info("Stopping Fleet daemon") + logger.Info("Stopping Fleet daemon") return nil } @@ -65,15 +101,17 @@ func (d *Daemon) NeedLeaderElection() bool { // handleConfigs replaces the stored installer configs with the latest RC update. // Configs are indexed by their ID so they can be retrieved by task handlers. -func (d *Daemon) handleConfigs(configs map[string]installerConfig) error { +func (d *Daemon) handleConfigs(ctx context.Context, configs map[string]installerConfig) error { d.mu.Lock() defer d.mu.Unlock() + logger := ctrl.LoggerFrom(ctx) newConfigs := make(map[string]installerConfig, len(configs)) for _, cfg := range configs { - d.logger.Info("Received installer config", "id", cfg.ID, "file_operations", len(cfg.FileOperations)) + logger.V(2).Info("Received installer config", "id", cfg.ID, "operations", len(cfg.Operations)) newConfigs[cfg.ID] = cfg } d.configs = newConfigs + logger.V(2).Info("Updated installer configs", "configs", d.configs) return nil } @@ -88,40 +126,370 @@ func (d *Daemon) getConfig(id string) (installerConfig, error) { return cfg, nil } +// verifyExpectedState checks that the config versions in the task's expected_state +// match the operator's current installer state for the given package. +// Returns *stateDoesntMatchError when they don't match. +func (d *Daemon) verifyExpectedState(req remoteAPIRequest) error { + stable, experiment := d.getPackageConfigVersions(req.Package) + if req.ExpectedState.StableConfig != stable || req.ExpectedState.ExperimentConfig != experiment { + return &stateDoesntMatchError{ + msg: fmt.Sprintf( + "state mismatch for package %s: expected stable_config=%q experiment_config=%q, got stable_config=%q experiment_config=%q", + req.Package, + req.ExpectedState.StableConfig, req.ExpectedState.ExperimentConfig, + stable, experiment, + ), + } + } + return nil +} + // handleRemoteAPIRequest dispatches the incoming task to the appropriate handler. -func (d *Daemon) handleRemoteAPIRequest(req remoteAPIRequest) error { - d.logger.Info("Received remote API request", "id", req.ID, "package", req.Package, "method", req.Method) +func (d *Daemon) handleRemoteAPIRequest(ctx context.Context, req remoteAPIRequest) error { + logger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "package", req.Package, "method", req.Method) + logger.Info("Received remote API request") + + if !d.revisionsEnabled { + return fmt.Errorf("experiment signals require the CreateControllerRevisions and DatadogAgentInternal feature gates") + } + + if err := d.verifyExpectedState(req); err != nil { + logger.Error(err, "Expected state mismatch") + return err + } + switch req.Method { case methodStartDatadogAgentExperiment: - return d.startDatadogAgentExperiment(req) + return d.startDatadogAgentExperiment(ctx, req) case methodStopDatadogAgentExperiment: - return d.stopDatadogAgentExperiment(req) + return d.stopDatadogAgentExperiment(ctx, req) case methodPromoteDatadogAgentExperiment: - return d.promoteDatadogAgentExperiment(req) + return d.promoteDatadogAgentExperiment(ctx, req) default: return fmt.Errorf("unknown method: %s", req.Method) } } -func (d *Daemon) startDatadogAgentExperiment(req remoteAPIRequest) error { - cfg, err := d.getConfig(req.ExpectedState.ExperimentConfig) +// resolveOperation looks up the installer config for the request, validates its single +// DatadogAgent operation, and fills in the canonical GVK. Returns the operation ready for use. +func (d *Daemon) resolveOperation(req remoteAPIRequest, signal string) (fleetManagementOperation, error) { + // get params version from req + id := req.Params.Version + if id == "" { + return fleetManagementOperation{}, fmt.Errorf("%s: version is required", signal) + } + // match to d.configs[params.version] to get config + cfg, err := d.getConfig(id) + if err != nil { + return fleetManagementOperation{}, fmt.Errorf("%s: %w", signal, err) + } + + if len(cfg.Operations) != 1 { + return fleetManagementOperation{}, fmt.Errorf("%s: config %s must have exactly 1 operation, got %d", signal, cfg.ID, len(cfg.Operations)) + } + op := cfg.Operations[0] + + if err := validateOperation(op); err != nil { + return fleetManagementOperation{}, fmt.Errorf("%s: invalid operation: %w", signal, err) + } + + op.GroupVersionKind = v2alpha1.GroupVersion.WithKind("DatadogAgent") + + return op, nil +} + +// startDatadogAgentExperiment starts a DatadogAgent experiment. +// The first step updates the DDA spec with the experiment configuration. +// The second step updates the DDA status to running, recording the experiment ID. +func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { + logger := ctrl.LoggerFrom(ctx).WithValues("id", req.ID) + logger.V(1).Info("Starting DatadogAgent experiment", "config", req.Params.Version) + op, err := d.resolveOperation(req, "start DatadogAgent experiment") if err != nil { + logger.Error(err, "Failed to resolve operation") + return err + } + + // Store the target DDA for promote/stop signals (which don't carry a config). + d.experimentTarget = op.NamespacedName + + logger = logger.WithValues("namespace", op.NamespacedName.Namespace, "name", op.NamespacedName.Name) + ctx = ctrl.LoggerInto(ctx, logger) + + // Check the operation + if op.Operation != OperationUpdate { + return fmt.Errorf("start DatadogAgent experiment: invalid operation: %s", op.Operation) + } + + // Fetch current DDA to check signal preconditions. + dda := &v2alpha1.DatadogAgent{} + if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + return fmt.Errorf("start DatadogAgent experiment: failed to get DatadogAgent %s: %w", op.NamespacedName, err) + } + + if err := canStart(getExperimentPhase(dda)); err != nil { + // If an experiment is already running, treat repeat start signals as idempotent. + // The backend retries with new task UUIDs until it sees an ack, and in-memory + // state (ExperimentConfigVersion) doesn't survive operator restarts, so we + // cannot reliably match on config version. The backend is responsible for not + // sending a start for a different experiment while one is already running. + if getExperimentPhase(dda) == v2alpha1.ExperimentPhaseRunning { + logger.Info("Experiment already running, acknowledging start signal as idempotent", "experimentID", dda.Status.Experiment.ID) + // Restore the experiment config version — it may have been lost on restart. + stable, _ := d.getPackageConfigVersions(req.Package) + d.setPackageConfigVersions(req.Package, stable, req.Params.Version) + return nil + } return fmt.Errorf("start DatadogAgent experiment: %w", err) } - d.logger.Info("Starting DatadogAgent experiment", "id", req.ID, "package", req.Package, "config_id", cfg.ID) + + // Apply the spec patch. + if err := retryWithBackoff(ctx, func() error { + return d.client.Patch(ctx, dda, client.RawPatch(types.MergePatchType, op.Config)) + }); err != nil { + return fmt.Errorf("start DatadogAgent experiment: failed to patch spec: %w", err) + } + + // Update status: phase=running, record experiment ID. + // Re-fetch inside the retry to get the latest ResourceVersion on conflict. + if err := retryWithBackoff(ctx, func() error { + if err := d.client.Get(ctx, op.NamespacedName, dda); err != nil { + return err + } + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: req.ID, + } + return d.client.Status().Update(ctx, dda) + }); err != nil { + return fmt.Errorf("start DatadogAgent experiment: failed to update status: %w", err) + } + + // Report the experiment config version to the backend. + stable, _ := d.getPackageConfigVersions(req.Package) + d.setPackageConfigVersions(req.Package, stable, req.Params.Version) + + logger.Info("Started DatadogAgent experiment") return nil } -func (d *Daemon) stopDatadogAgentExperiment(req remoteAPIRequest) error { - d.logger.Info("Stopping DatadogAgent experiment", "id", req.ID, "package", req.Package) +func (d *Daemon) stopDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { + nsn := d.experimentTarget + if nsn.Name == "" { + return fmt.Errorf("stop DatadogAgent experiment: no experiment target set") + } + + ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", nsn.Namespace, "name", nsn.Name)) + logger := ctrl.LoggerFrom(ctx) + logger.V(1).Info("Stopping DatadogAgent experiment") + + dda := &v2alpha1.DatadogAgent{} + if err := d.client.Get(ctx, nsn, dda); err != nil { + return fmt.Errorf("stop DatadogAgent experiment: failed to get DatadogAgent %s: %w", nsn, err) + } + + if isNoOp, err := canStop(ctx, getExperimentPhase(dda)); err != nil { + return fmt.Errorf("stop DatadogAgent experiment: %w", err) + } else if isNoOp { + return nil + } + + // Update status: set phase=stopped; leave ID and generation for the reconciler. + if err := retryWithBackoff(ctx, func() error { + if err := d.client.Get(ctx, nsn, dda); err != nil { + return err + } + if dda.Status.Experiment == nil { + return fmt.Errorf("experiment status was cleared unexpectedly") + } + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhaseStopped + return d.client.Status().Update(ctx, dda) + }); err != nil { + return fmt.Errorf("stop DatadogAgent experiment: failed to update status: %w", err) + } + + // Clear the experiment config version. + stable, _ := d.getPackageConfigVersions(req.Package) + d.setPackageConfigVersions(req.Package, stable, "") + + logger.Info("Stopped DatadogAgent experiment") return nil } -func (d *Daemon) promoteDatadogAgentExperiment(req remoteAPIRequest) error { - cfg, err := d.getConfig(req.ExpectedState.ExperimentConfig) - if err != nil { +func (d *Daemon) promoteDatadogAgentExperiment(ctx context.Context, req remoteAPIRequest) error { + nsn := d.experimentTarget + if nsn.Name == "" { + return fmt.Errorf("promote DatadogAgent experiment: no experiment target set") + } + + ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("id", req.ID, "namespace", nsn.Namespace, "name", nsn.Name)) + logger := ctrl.LoggerFrom(ctx) + logger.V(1).Info("Promoting DatadogAgent experiment") + + // Verify there is an active experiment to promote. + _, experiment := d.getPackageConfigVersions(req.Package) + if experiment == "" { + return fmt.Errorf("promote DatadogAgent experiment: no experiment config version set") + } + + dda := &v2alpha1.DatadogAgent{} + if err := d.client.Get(ctx, nsn, dda); err != nil { + return fmt.Errorf("promote DatadogAgent experiment: failed to get DatadogAgent %s: %w", nsn, err) + } + + if isNoOp, err := canPromote(ctx, getExperimentPhase(dda)); err != nil { return fmt.Errorf("promote DatadogAgent experiment: %w", err) + } else if isNoOp { + return nil } - d.logger.Info("Promoting DatadogAgent experiment", "id", req.ID, "package", req.Package, "config_id", cfg.ID) + + // Update status: set phase=promoted; leave ID and generation intact. + if err := retryWithBackoff(ctx, func() error { + if err := d.client.Get(ctx, nsn, dda); err != nil { + return err + } + if dda.Status.Experiment == nil { + return fmt.Errorf("experiment status was cleared unexpectedly") + } + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhasePromoted + return d.client.Status().Update(ctx, dda) + }); err != nil { + return fmt.Errorf("promote DatadogAgent experiment: failed to update status: %w", err) + } + + // Promote: stable = old experiment, experiment = "" + d.setPackageConfigVersions(req.Package, experiment, "") + + logger.Info("Promoted DatadogAgent experiment") return nil } + +// setTaskState updates the Task field of the package entry in the RC installer state. +// If the package is not yet present in the state, a new entry is added. +// This is a no-op when rcClient is nil (e.g. in unit tests that construct Daemon directly). +func (d *Daemon) setTaskState(pkgName, taskID string, taskState pbgo.TaskState, taskErr error) { + if d.rcClient == nil { + return + } + task := &pbgo.PackageStateTask{ + Id: taskID, + State: taskState, + } + if taskErr != nil { + task.Error = &pbgo.TaskError{Message: taskErr.Error()} + } + + current := d.rcClient.GetInstallerState() + updated := make([]*pbgo.PackageState, 0, len(current)+1) + found := false + for _, pkg := range current { + if pkg.GetPackage() == pkgName { + updated = append(updated, &pbgo.PackageState{ + Package: pkg.GetPackage(), + StableVersion: pkg.GetStableVersion(), + ExperimentVersion: pkg.GetExperimentVersion(), + Task: task, + StableConfigVersion: pkg.GetStableConfigVersion(), + ExperimentConfigVersion: pkg.GetExperimentConfigVersion(), + }) + found = true + } else { + updated = append(updated, pkg) + } + } + if !found { + updated = append(updated, &pbgo.PackageState{ + Package: pkgName, + Task: task, + }) + } + d.rcClient.SetInstallerState(updated) + d.logInstallerState("setTaskState") +} + +// getPackageConfigVersions returns the current stable and experiment config versions +// for the given package from the RC installer state. +func (d *Daemon) getPackageConfigVersions(pkgName string) (stable, experiment string) { + if d.rcClient == nil { + return "", "" + } + for _, pkg := range d.rcClient.GetInstallerState() { + if pkg.GetPackage() == pkgName { + return pkg.GetStableConfigVersion(), pkg.GetExperimentConfigVersion() + } + } + return "", "" +} + +// setPackageConfigVersions updates the config version fields of the +// package entry in the RC installer state. Only the config variants +// (StableConfigVersion/ExperimentConfigVersion) are set; the package variants +// (StableVersion/ExperimentVersion) are preserved since this is a config +// experiment, not a package upgrade. +func (d *Daemon) setPackageConfigVersions(pkgName, stable, experiment string) { + if d.rcClient == nil { + return + } + current := d.rcClient.GetInstallerState() + updated := make([]*pbgo.PackageState, 0, len(current)+1) + found := false + for _, pkg := range current { + if pkg.GetPackage() == pkgName { + updated = append(updated, &pbgo.PackageState{ + Package: pkg.GetPackage(), + StableVersion: pkg.GetStableVersion(), + ExperimentVersion: pkg.GetExperimentVersion(), + Task: pkg.GetTask(), + StableConfigVersion: stable, + ExperimentConfigVersion: experiment, + }) + found = true + } else { + updated = append(updated, pkg) + } + } + if !found { + updated = append(updated, &pbgo.PackageState{ + Package: pkgName, + StableConfigVersion: stable, + ExperimentConfigVersion: experiment, + }) + } + d.rcClient.SetInstallerState(updated) + d.logInstallerState("setPackageConfigVersions") +} + +// logInstallerState logs the full installer state for debugging. +func (d *Daemon) logInstallerState(caller string) { + if d.rcClient == nil { + return + } + logger := ctrl.Log.WithName("fleet-daemon") + for _, pkg := range d.rcClient.GetInstallerState() { + var taskID string + var taskState pbgo.TaskState + if pkg.GetTask() != nil { + taskID = pkg.GetTask().GetId() + taskState = pkg.GetTask().GetState() + } + logger.V(1).Info("Installer state", + "caller", caller, + "package", pkg.GetPackage(), + "stableVersion", pkg.GetStableVersion(), + "experimentVersion", pkg.GetExperimentVersion(), + "stableConfigVersion", pkg.GetStableConfigVersion(), + "experimentConfigVersion", pkg.GetExperimentConfigVersion(), + "taskID", taskID, + "taskState", taskState, + ) + } +} + +// getExperimentPhase returns the current experiment phase from a DDA's status, +// or empty string if no experiment is active. +func getExperimentPhase(dda *v2alpha1.DatadogAgent) v2alpha1.ExperimentPhase { + if dda.Status.Experiment == nil { + return "" + } + return dda.Status.Experiment.Phase +} diff --git a/pkg/fleet/daemon_test.go b/pkg/fleet/daemon_test.go new file mode 100644 index 0000000000..6708253f5e --- /dev/null +++ b/pkg/fleet/daemon_test.go @@ -0,0 +1,760 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package fleet + +import ( + "context" + "encoding/json" + "errors" + "testing" + + pbgo "github.com/DataDog/datadog-agent/pkg/proto/pbgo/core" + "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" +) + +// --- Test helpers --- + +func testFleetScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = v2alpha1.AddToScheme(s) + return s +} + +func testDaemon(dda *v2alpha1.DatadogAgent, configs map[string]installerConfig) (*Daemon, client.Client) { + s := testFleetScheme() + b := fake.NewClientBuilder().WithScheme(s).WithStatusSubresource(&v2alpha1.DatadogAgent{}) + if dda != nil { + b = b.WithObjects(dda) + } + c := b.Build() + return &Daemon{ + client: c, + revisionsEnabled: true, + configs: configs, + experimentTarget: testDDANSN, + }, c +} + +var testDDAGVK = schema.GroupVersionKind{ + Group: "datadoghq.com", + Version: "v2alpha1", + Kind: "DatadogAgent", +} + +var testDDANSN = types.NamespacedName{Namespace: "datadog", Name: "datadog-agent"} + +func testDDAObject(phase v2alpha1.ExperimentPhase) *v2alpha1.DatadogAgent { + dda := &v2alpha1.DatadogAgent{ + ObjectMeta: metav1.ObjectMeta{ + Name: testDDANSN.Name, + Namespace: testDDANSN.Namespace, + }, + } + if phase != "" { + dda.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase, ID: "old-exp"} + } + return dda +} + +func testInstallerConfigWithDDA() map[string]installerConfig { + return map[string]installerConfig{ + "test-config": { + ID: "test-config", + Operations: []fleetManagementOperation{ + { + Operation: OperationUpdate, + GroupVersionKind: testDDAGVK, + NamespacedName: testDDANSN, + Config: json.RawMessage(`{}`), + }, + }, + }, + } +} + +func testStartRequest() remoteAPIRequest { + return remoteAPIRequest{ + ID: "exp-abc", + Package: "datadog-operator", + Method: methodStartDatadogAgentExperiment, + Params: experimentParams{Version: "test-config"}, + } +} + +// --- startDatadogAgentExperiment tests --- + +func TestStartDatadogAgentExperiment_ConfigNotFound(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + req := testStartRequest() + req.Params.Version = "nonexistent" + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), req)) +} + +func TestStartDatadogAgentExperiment_NoDDAOperation(t *testing.T) { + configs := map[string]installerConfig{ + "no-dda-config": { + ID: "no-dda-config", + Operations: []fleetManagementOperation{ + { + Operation: OperationUpdate, + GroupVersionKind: schema.GroupVersionKind{Group: "other.io", Version: "v1", Kind: "Other"}, + NamespacedName: testDDANSN, + Config: json.RawMessage(`{}`), + }, + }, + }, + } + d, _ := testDaemon(testDDAObject(""), configs) + req := testStartRequest() + req.Params.Version = "no-dda-config" + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), req)) +} + +func TestStartDatadogAgentExperiment_DDANotFound(t *testing.T) { + d, _ := testDaemon(nil, testInstallerConfigWithDDA()) // no DDA pre-created + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), testStartRequest())) +} + +func TestStartDatadogAgentExperiment_Running_Idempotent(t *testing.T) { + dda := testDDAObject(v2alpha1.ExperimentPhaseRunning) + d, c := testDaemon(dda, testInstallerConfigWithDDA()) + // Backend retries with a new task UUID; should be treated as idempotent. + req := testStartRequest() + req.ID = "retry-task-uuid" + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + // DDA should be unchanged — no re-patch, no status update. + got := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, got)) + require.NotNil(t, got.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, got.Status.Experiment.Phase) + assert.Equal(t, "old-exp", got.Status.Experiment.ID) +} + +func TestStartDatadogAgentExperiment_Stopped(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseStopped), testInstallerConfigWithDDA()) + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), testStartRequest())) +} + +func TestStartDatadogAgentExperiment_Success_NilPhase(t *testing.T) { + d, c := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) + assert.Equal(t, req.ID, dda.Status.Experiment.ID) +} + +func TestStartDatadogAgentExperiment_Success_FromRollback(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRollback), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) + assert.Equal(t, req.ID, dda.Status.Experiment.ID) +} + +func TestStartDatadogAgentExperiment_Success_FromTimeout(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseTimeout), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) +} + +func TestStartDatadogAgentExperiment_Success_FromAborted(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseAborted), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) +} + +func TestStartDatadogAgentExperiment_Success_OverwritesPreviousExperiment(t *testing.T) { + // Start a new experiment when a previous one exists (e.g. after rollback). + // The old experiment's ID and generation must be fully replaced. + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRollback), testInstallerConfigWithDDA()) + req := testStartRequest() + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) + assert.Equal(t, req.ID, dda.Status.Experiment.ID) + assert.NotEqual(t, "old-exp", dda.Status.Experiment.ID) +} + +// --- resolveOperation: params.version matching tests --- + +func TestStartDatadogAgentExperiment_VersionMatchesInstallerConfig(t *testing.T) { + // Simulates the real payload pairing: + // - installer_config has id "cyg9-1ztz-cdnn" with an APM-enabling patch + // - updater_task has params.version "cyg9-1ztz-cdnn" linking them together + configs := map[string]installerConfig{ + "aaaa-bbbb-cccc": { + ID: "aaaa-bbbb-cccc", + Operations: []fleetManagementOperation{ + { + Operation: OperationUpdate, + GroupVersionKind: testDDAGVK, + NamespacedName: testDDANSN, + Config: json.RawMessage(`{"spec":{"features":{"apm":{"enabled":true}}}}`), + }, + }, + }, + } + d, c := testDaemon(testDDAObject(""), configs) + req := remoteAPIRequest{ + ID: "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + Method: methodStartDatadogAgentExperiment, + Params: experimentParams{Version: "aaaa-bbbb-cccc"}, + ExpectedState: expectedState{ + Stable: "0.0.1", + StableConfig: "0.0.1", + ClientID: "aAbBcCdDeEfFgGhHiIjJk", + }, + } + require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req)) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, dda.Status.Experiment.Phase) + assert.Equal(t, req.ID, dda.Status.Experiment.ID) +} + +func TestStartDatadogAgentExperiment_EmptyVersion(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + req := testStartRequest() + req.Params.Version = "" + assert.Error(t, d.startDatadogAgentExperiment(context.Background(), req)) +} + +func TestStartDatadogAgentExperiment_VersionMismatch(t *testing.T) { + // params.version doesn't match any installer config ID + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + req := testStartRequest() + req.Params.Version = "xxxx-yyyy-zzzz" // no config with this ID + err := d.startDatadogAgentExperiment(context.Background(), req) + assert.Error(t, err) + assert.Contains(t, err.Error(), "not found") +} + +// --- stopDatadogAgentExperiment tests --- + +func testStopRequest() remoteAPIRequest { + return remoteAPIRequest{ + ID: "exp-abc", + Package: "datadog-operator", + Method: methodStopDatadogAgentExperiment, + Params: experimentParams{Version: "test-config"}, + } +} + +func TestStopDatadogAgentExperiment_DDANotFound(t *testing.T) { + d, _ := testDaemon(nil, testInstallerConfigWithDDA()) + assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) +} + +func TestStopDatadogAgentExperiment_NilPhase(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) +} + +func TestStopDatadogAgentExperiment_Aborted(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseAborted), testInstallerConfigWithDDA()) + assert.Error(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) +} + +func TestStopDatadogAgentExperiment_NoOp_Rollback(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRollback), testInstallerConfigWithDDA()) + require.NoError(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) + + // Status must not change — already rolled back. + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, dda.Status.Experiment.Phase) +} + +func TestStopDatadogAgentExperiment_NoOp_Timeout(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseTimeout), testInstallerConfigWithDDA()) + require.NoError(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, dda.Status.Experiment.Phase) +} + +func TestStopDatadogAgentExperiment_NoOp_Promoted(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhasePromoted), testInstallerConfigWithDDA()) + require.NoError(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhasePromoted, dda.Status.Experiment.Phase) +} + +func TestStopDatadogAgentExperiment_Success_Running(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + require.NoError(t, d.stopDatadogAgentExperiment(context.Background(), testStopRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseStopped, dda.Status.Experiment.Phase) + // ID must be preserved. + assert.Equal(t, "old-exp", dda.Status.Experiment.ID) +} + +// --- promoteDatadogAgentExperiment tests --- + +func testPromoteRequest() remoteAPIRequest { + return remoteAPIRequest{ + ID: "exp-abc", + Package: "datadog-operator", + Method: methodPromoteDatadogAgentExperiment, + Params: experimentParams{Version: "test-config"}, + } +} + +func TestPromoteDatadogAgentExperiment_DDANotFound(t *testing.T) { + d, _ := testDaemon(nil, testInstallerConfigWithDDA()) + assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) +} + +func TestPromoteDatadogAgentExperiment_NilPhase(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) +} + +func TestPromoteDatadogAgentExperiment_Aborted(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseAborted), testInstallerConfigWithDDA()) + assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) +} + +func TestPromoteDatadogAgentExperiment_NoExperimentVersion(t *testing.T) { + d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1"}, + }} + assert.Error(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) +} + +func TestPromoteDatadogAgentExperiment_NoOp_Rollback(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRollback), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1", ExperimentConfigVersion: "exp-1"}, + }} + require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, dda.Status.Experiment.Phase) +} + +func TestPromoteDatadogAgentExperiment_NoOp_Timeout(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseTimeout), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1", ExperimentConfigVersion: "exp-1"}, + }} + require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, dda.Status.Experiment.Phase) +} + +func TestPromoteDatadogAgentExperiment_NoOp_Promoted(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhasePromoted), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1", ExperimentConfigVersion: "exp-1"}, + }} + require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + assert.Equal(t, v2alpha1.ExperimentPhasePromoted, dda.Status.Experiment.Phase) +} + +func TestPromoteDatadogAgentExperiment_Success_Running(t *testing.T) { + d, c := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA()) + rc := &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "stable-1", ExperimentConfigVersion: "exp-1"}, + }} + d.rcClient = rc + require.NoError(t, d.promoteDatadogAgentExperiment(context.Background(), testPromoteRequest())) + + dda := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), testDDANSN, dda)) + require.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhasePromoted, dda.Status.Experiment.Phase) + // ID must be preserved. + assert.Equal(t, "old-exp", dda.Status.Experiment.ID) + // Stable should now be the old experiment, experiment should be cleared. + require.Len(t, rc.state, 1) + // StableVersion/ExperimentVersion are not set by config experiments. + assert.Equal(t, "", rc.state[0].StableVersion) + assert.Equal(t, "", rc.state[0].ExperimentVersion) + assert.Equal(t, "exp-1", rc.state[0].StableConfigVersion) + assert.Equal(t, "", rc.state[0].ExperimentConfigVersion) +} + +// --- verifyExpectedState tests --- + +func TestVerifyExpectedState_Match(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "1.0.0", ExperimentConfigVersion: "2.0.0"}, + }) + d.rcClient = rc + req := remoteAPIRequest{ + Package: "datadog-operator", + ExpectedState: expectedState{StableConfig: "1.0.0", ExperimentConfig: "2.0.0"}, + } + assert.NoError(t, d.verifyExpectedState(req)) +} + +func TestVerifyExpectedState_StableMismatch(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "1.0.0", ExperimentConfigVersion: ""}, + }) + d.rcClient = rc + req := remoteAPIRequest{ + Package: "datadog-operator", + ExpectedState: expectedState{StableConfig: "wrong", ExperimentConfig: ""}, + } + err := d.verifyExpectedState(req) + require.Error(t, err) + var stateErr *stateDoesntMatchError + assert.True(t, errors.As(err, &stateErr)) +} + +func TestVerifyExpectedState_ExperimentMismatch(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "1.0.0", ExperimentConfigVersion: "2.0.0"}, + }) + d.rcClient = rc + req := remoteAPIRequest{ + Package: "datadog-operator", + ExpectedState: expectedState{StableConfig: "1.0.0", ExperimentConfig: "wrong"}, + } + err := d.verifyExpectedState(req) + require.Error(t, err) + var stateErr *stateDoesntMatchError + assert.True(t, errors.As(err, &stateErr)) +} + +func TestVerifyExpectedState_NilClient(t *testing.T) { + // When rcClient is nil, getPackageConfigVersions returns ("", ""). + // A request with empty expected state should pass. + d := &Daemon{} + req := remoteAPIRequest{ + Package: "datadog-operator", + ExpectedState: expectedState{StableConfig: "", ExperimentConfig: ""}, + } + assert.NoError(t, d.verifyExpectedState(req)) +} + +func TestHandleRemoteAPIRequest_InvalidState(t *testing.T) { + d, _ := testDaemon(testDDAObject(""), testInstallerConfigWithDDA()) + d.rcClient = &mockRCClient{state: []*pbgo.PackageState{ + {Package: "datadog-operator", StableConfigVersion: "1.0.0", ExperimentConfigVersion: ""}, + }} + req := testStartRequest() + req.ExpectedState = expectedState{StableConfig: "stale", ExperimentConfig: ""} + err := d.handleRemoteAPIRequest(context.Background(), req) + require.Error(t, err) + var stateErr *stateDoesntMatchError + assert.True(t, errors.As(err, &stateErr)) +} + +// --- validateOperation tests --- + +func TestValidateOperation_Valid(t *testing.T) { + op := fleetManagementOperation{ + Operation: OperationUpdate, + GroupVersionKind: testDDAGVK, + NamespacedName: testDDANSN, + Config: json.RawMessage(`{}`), + } + assert.NoError(t, validateOperation(op)) +} + +func TestValidateOperation_EmptyName(t *testing.T) { + op := fleetManagementOperation{ + GroupVersionKind: testDDAGVK, + NamespacedName: types.NamespacedName{Namespace: "datadog", Name: ""}, + } + assert.Error(t, validateOperation(op)) +} + +func TestValidateOperation_EmptyNamespace(t *testing.T) { + op := fleetManagementOperation{ + GroupVersionKind: testDDAGVK, + NamespacedName: types.NamespacedName{Namespace: "", Name: "datadog-agent"}, + } + assert.Error(t, validateOperation(op)) +} + +func TestValidateOperation_EmptyGroup_Allowed(t *testing.T) { + // Group is auto-detected from the cluster; empty group is valid input. + op := fleetManagementOperation{ + GroupVersionKind: schema.GroupVersionKind{Group: "", Version: "", Kind: "DatadogAgent"}, + NamespacedName: testDDANSN, + } + assert.NoError(t, validateOperation(op)) +} + +func TestValidateOperation_WrongKind(t *testing.T) { + op := fleetManagementOperation{ + GroupVersionKind: schema.GroupVersionKind{Group: "datadoghq.com", Version: "v2alpha1", Kind: "DatadogMonitor"}, + NamespacedName: testDDANSN, + } + assert.Error(t, validateOperation(op)) +} + +// --- canStart tests --- + +func TestCanStart_NilPhase(t *testing.T) { + assert.NoError(t, canStart("")) +} + +func TestCanStart_Running(t *testing.T) { + assert.Error(t, canStart(v2alpha1.ExperimentPhaseRunning)) +} + +func TestCanStart_Stopped(t *testing.T) { + assert.Error(t, canStart(v2alpha1.ExperimentPhaseStopped)) +} + +func TestCanStart_Rollback(t *testing.T) { + assert.NoError(t, canStart(v2alpha1.ExperimentPhaseRollback)) +} + +func TestCanStart_Timeout(t *testing.T) { + assert.NoError(t, canStart(v2alpha1.ExperimentPhaseTimeout)) +} + +func TestCanStart_Promoted(t *testing.T) { + assert.NoError(t, canStart(v2alpha1.ExperimentPhasePromoted)) +} + +func TestCanStart_Aborted(t *testing.T) { + assert.NoError(t, canStart(v2alpha1.ExperimentPhaseAborted)) +} + +// --- canStop tests --- + +func TestCanStop_NilPhase(t *testing.T) { + isNoOp, err := canStop(context.Background(), "") + assert.False(t, isNoOp) + assert.Error(t, err) +} + +func TestCanStop_Running(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseRunning) + assert.False(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Stopped(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseStopped) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Rollback(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseRollback) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Timeout(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseTimeout) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Promoted(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhasePromoted) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanStop_Aborted(t *testing.T) { + isNoOp, err := canStop(context.Background(), v2alpha1.ExperimentPhaseAborted) + assert.False(t, isNoOp) + assert.Error(t, err) +} + +// --- canPromote tests --- + +func TestCanPromote_NilPhase(t *testing.T) { + isNoOp, err := canPromote(context.Background(), "") + assert.False(t, isNoOp) + assert.Error(t, err) +} + +func TestCanPromote_Running(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseRunning) + assert.False(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Stopped(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseStopped) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Rollback(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseRollback) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Timeout(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseTimeout) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Promoted(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhasePromoted) + assert.True(t, isNoOp) + assert.NoError(t, err) +} + +func TestCanPromote_Aborted(t *testing.T) { + isNoOp, err := canPromote(context.Background(), v2alpha1.ExperimentPhaseAborted) + assert.False(t, isNoOp) + assert.Error(t, err) +} + +// --- mockRCClient --- + +// mockRCClient is a minimal RCClient used to test package state updates. +type mockRCClient struct { + state []*pbgo.PackageState +} + +func (m *mockRCClient) Subscribe(_ string, _ func(map[string]state.RawConfig, func(string, state.ApplyStatus))) { +} + +func (m *mockRCClient) GetInstallerState() []*pbgo.PackageState { + return m.state +} + +func (m *mockRCClient) SetInstallerState(packages []*pbgo.PackageState) { + m.state = packages +} + +func testDaemonWithRC(initialState []*pbgo.PackageState) (*Daemon, *mockRCClient) { + rc := &mockRCClient{state: initialState} + d := &Daemon{ + rcClient: rc, + configs: make(map[string]installerConfig), + } + return d, rc +} + +// --- setTaskState tests --- + +func TestSetTaskState_Running(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableVersion: "1.0.0"}, + }) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_RUNNING, nil) + + require.Len(t, rc.state, 1) + pkg := rc.state[0] + assert.Equal(t, "datadog-operator", pkg.Package) + assert.Equal(t, "1.0.0", pkg.StableVersion) + require.NotNil(t, pkg.Task) + assert.Equal(t, "task-1", pkg.Task.Id) + assert.Equal(t, pbgo.TaskState_RUNNING, pkg.Task.State) + assert.Nil(t, pkg.Task.Error) +} + +func TestSetTaskState_Done(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableVersion: "1.0.0"}, + }) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_DONE, nil) + + require.Len(t, rc.state, 1) + require.NotNil(t, rc.state[0].Task) + assert.Equal(t, pbgo.TaskState_DONE, rc.state[0].Task.State) + assert.Nil(t, rc.state[0].Task.Error) +} + +func TestSetTaskState_Error(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "datadog-operator", StableVersion: "1.0.0"}, + }) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_ERROR, errors.New("something went wrong")) + + require.Len(t, rc.state, 1) + require.NotNil(t, rc.state[0].Task) + assert.Equal(t, pbgo.TaskState_ERROR, rc.state[0].Task.State) + require.NotNil(t, rc.state[0].Task.Error) + assert.Equal(t, "something went wrong", rc.state[0].Task.Error.Message) +} + +func TestSetTaskState_PackageNotInState(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{}) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_RUNNING, nil) + + require.Len(t, rc.state, 1) + assert.Equal(t, "datadog-operator", rc.state[0].Package) + require.NotNil(t, rc.state[0].Task) + assert.Equal(t, pbgo.TaskState_RUNNING, rc.state[0].Task.State) +} + +func TestSetTaskState_PreservesOtherPackages(t *testing.T) { + d, rc := testDaemonWithRC([]*pbgo.PackageState{ + {Package: "other-package", StableVersion: "2.0.0"}, + {Package: "datadog-operator", StableVersion: "1.0.0"}, + }) + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_DONE, nil) + + require.Len(t, rc.state, 2) + // other-package must be unchanged + assert.Equal(t, "other-package", rc.state[0].Package) + assert.Nil(t, rc.state[0].Task) + // datadog-operator has task set + assert.Equal(t, "datadog-operator", rc.state[1].Package) + require.NotNil(t, rc.state[1].Task) + assert.Equal(t, pbgo.TaskState_DONE, rc.state[1].Task.State) +} + +func TestSetTaskState_NilClient(t *testing.T) { + d := &Daemon{} + // Must not panic when rcClient is nil. + assert.NotPanics(t, func() { + d.setTaskState("datadog-operator", "task-1", pbgo.TaskState_RUNNING, nil) + }) +} diff --git a/pkg/fleet/experiment.go b/pkg/fleet/experiment.go new file mode 100644 index 0000000000..ec4bf24369 --- /dev/null +++ b/pkg/fleet/experiment.go @@ -0,0 +1,128 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package fleet + +import ( + "context" + "fmt" + "math" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" + ctrl "sigs.k8s.io/controller-runtime" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" +) + +// validateOperation checks that a fleetManagementOperation has the fields +// required to locate and act on a DatadogAgent resource. +func validateOperation(op fleetManagementOperation) error { + if op.NamespacedName.Name == "" { + return fmt.Errorf("operation namespaced name must have a non-empty name") + } + if op.NamespacedName.Namespace == "" { + return fmt.Errorf("operation namespaced name must have a non-empty namespace") + } + if op.GroupVersionKind.Kind != "DatadogAgent" { + return fmt.Errorf("operation kind must be DatadogAgent, got %q", op.GroupVersionKind.Kind) + } + return nil +} + +// canStart returns whether a start signal is allowed given the current experiment phase. +// +// Allowed from: nil/empty, rollback, timeout, promoted, aborted (start new experiment). +// Error from: running (experiment already running), stopped (rollback in progress). +func canStart(phase v2alpha1.ExperimentPhase) error { + switch phase { + case v2alpha1.ExperimentPhaseRunning: + return fmt.Errorf("experiment already running (phase=%s); stop it before starting a new one", phase) + case v2alpha1.ExperimentPhaseStopped: + return fmt.Errorf("rollback in progress (phase=%s); wait for rollback to complete before starting a new experiment", phase) + default: + // nil/empty, rollback, timeout, promoted, aborted: start is allowed + return nil + } +} + +// canStop returns whether a stop signal is allowed given the current experiment phase. +// Returns (true, nil) if the signal is a no-op and should be silently ignored, (false, err) if rejected, or (false, nil) to proceed. +// +// Allowed from: running. +// No-op from: stopped, rollback, timeout, promoted (already terminal or in-progress rollback). +// Error from: nil/empty (nothing to stop), aborted (terminal, user-driven state). +func canStop(ctx context.Context, phase v2alpha1.ExperimentPhase) (bool, error) { + switch phase { + case v2alpha1.ExperimentPhaseRunning: + return false, nil + case v2alpha1.ExperimentPhaseStopped, + v2alpha1.ExperimentPhaseRollback, + v2alpha1.ExperimentPhaseTimeout, + v2alpha1.ExperimentPhasePromoted: + ctrl.LoggerFrom(ctx).Info("Stop signal ignored: experiment is not in a stoppable state", "phase", phase) + return true, nil + case v2alpha1.ExperimentPhaseAborted: + return false, fmt.Errorf("experiment was aborted due to a manual spec change (phase=%s)", phase) + default: + // nil/empty: no active experiment to stop + return false, fmt.Errorf("no active experiment to stop (phase=%s)", phase) + } +} + +// canPromote returns whether a promote signal is allowed given the current experiment phase. +// Returns (true, nil) if the signal is a no-op and should be silently ignored, (false, err) if rejected, or (false, nil) to proceed. +// +// Allowed from: running. +// No-op from: stopped, rollback, timeout, promoted (already terminal or rolling back). +// Error from: nil/empty (nothing to promote), aborted (terminal, user-driven state). +func canPromote(ctx context.Context, phase v2alpha1.ExperimentPhase) (bool, error) { + switch phase { + case v2alpha1.ExperimentPhaseRunning: + return false, nil + case v2alpha1.ExperimentPhaseStopped, + v2alpha1.ExperimentPhaseRollback, + v2alpha1.ExperimentPhaseTimeout, + v2alpha1.ExperimentPhasePromoted: + ctrl.LoggerFrom(ctx).Info("Promote signal ignored: experiment is not in a promotable state", "phase", phase) + return true, nil + case v2alpha1.ExperimentPhaseAborted: + return false, fmt.Errorf("experiment was aborted due to a manual spec change (phase=%s)", phase) + default: + // nil/empty: no active experiment to promote + return false, fmt.Errorf("no active experiment to promote (phase=%s)", phase) + } +} + +// experimentBackoff is the retry backoff for k8s operations during experiment signals. +// Retries start at 1s, doubling each attempt up to 10s, for up to 3 minutes total. +var experimentBackoff = wait.Backoff{ + Duration: 1 * time.Second, + Factor: 2.0, + Jitter: 0.1, + Cap: 10 * time.Second, + Steps: math.MaxInt32, +} + +// isRetryable returns true for errors that are worth retrying (i.e. not permanent). +func isRetryable(err error) bool { + return !apierrors.IsNotFound(err) && + !apierrors.IsForbidden(err) && + !apierrors.IsInvalid(err) && + !apierrors.IsMethodNotSupported(err) +} + +// retryWithBackoff retries fn on transient errors with exponential backoff. +// The total retry window is bounded by a 3-minute context timeout. +// Permanent errors (not-found, forbidden, invalid, method-not-supported) are not retried. +func retryWithBackoff(ctx context.Context, fn func() error) error { + ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) + defer cancel() + return retry.OnError(experimentBackoff, func(err error) bool { + return ctx.Err() == nil && isRetryable(err) + }, fn) +} diff --git a/pkg/fleet/remote_config.go b/pkg/fleet/remote_config.go index 2acee81fd8..47555ae076 100644 --- a/pkg/fleet/remote_config.go +++ b/pkg/fleet/remote_config.go @@ -6,34 +6,49 @@ package fleet import ( + "context" "encoding/json" "fmt" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" ) // installerConfig is the fleet installer configuration received via RC. type installerConfig struct { - ID string `json:"id"` - FileOperations []installerConfigFileOperation `json:"file_operations"` + ID string `json:"id"` + Operations []fleetManagementOperation `json:"operations"` } -// installerConfigFileOperation is a single file operation in an installerConfig. -type installerConfigFileOperation struct { - FileOperationType string `json:"file_op"` - FilePath string `json:"file_path"` - Patch json.RawMessage `json:"patch"` +// Operation is the type of fleet management operation to perform on a Kubernetes resource. +type Operation string + +const ( + OperationCreate Operation = "create" + OperationUpdate Operation = "update" + OperationDelete Operation = "delete" +) + +// fleetManagementOperation is a single fleet operation for config management of a Kubernetes resource. +// Config is a JSON merge patch (no strategic merge patch). +type fleetManagementOperation struct { + Operation Operation `json:"operation"` + GroupVersionKind schema.GroupVersionKind `json:"group_version_kind"` + NamespacedName types.NamespacedName `json:"namespaced_name"` + Config json.RawMessage `json:"config"` } // remoteAPIRequest is a task sent to the fleet daemon via RC. type remoteAPIRequest struct { - ID string `json:"id"` - Package string `json:"package_name"` - TraceID string `json:"trace_id"` - ParentSpanID string `json:"parent_span_id"` - ExpectedState expectedState `json:"expected_state"` - Method string `json:"method"` - Params json.RawMessage `json:"params"` + ID string `json:"id"` + Package string `json:"package_name"` + TraceID string `json:"trace_id"` + ParentSpanID string `json:"parent_span_id"` + ExpectedState expectedState `json:"expected_state"` + Method string `json:"method"` + Params experimentParams `json:"params"` } // expectedState describes the package state expected before executing the request. @@ -46,12 +61,20 @@ type expectedState struct { ClientID string `json:"client_id"` } +// experimentParams holds the parsed params for experiment methods. +type experimentParams struct { + Version string `json:"version"` +} + // handleInstallerConfigUpdate returns an RC subscription callback that parses // UPDATER_AGENT payloads and forwards them as a map[configID]installerConfig to h. -func handleInstallerConfigUpdate(h func(map[string]installerConfig) error) func(map[string]state.RawConfig, func(string, state.ApplyStatus)) { +func handleInstallerConfigUpdate(ctx context.Context, h func(map[string]installerConfig) error) func(map[string]state.RawConfig, func(string, state.ApplyStatus)) { return func(updates map[string]state.RawConfig, applyStatus func(string, state.ApplyStatus)) { + logger := ctrl.LoggerFrom(ctx) configs := make(map[string]installerConfig, len(updates)) for cfgPath, raw := range updates { + logger.V(1).Info("Received INSTALLER_CONFIG payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) + var cfg installerConfig if err := json.Unmarshal(raw.Config, &cfg); err != nil { applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateError, Error: fmt.Sprintf("failed to unmarshal installer config: %v", err)}) @@ -76,23 +99,32 @@ func handleInstallerConfigUpdate(h func(map[string]installerConfig) error) func( // handleUpdaterTaskUpdate returns an RC subscription callback that parses a single // UPDATER_TASK payload and forwards it as a remoteAPIRequest to h. // Requests that have already been executed (tracked by seen IDs) are ignored. -func handleUpdaterTaskUpdate(h func(remoteAPIRequest) error) func(map[string]state.RawConfig, func(string, state.ApplyStatus)) { +func handleUpdaterTaskUpdate(ctx context.Context, h func(remoteAPIRequest) error) func(map[string]state.RawConfig, func(string, state.ApplyStatus)) { seen := make(map[string]struct{}) return func(updates map[string]state.RawConfig, applyStatus func(string, state.ApplyStatus)) { + logger := ctrl.LoggerFrom(ctx) for cfgPath, raw := range updates { + logger.V(1).Info("Received UPDATER_TASK payload", "cfgPath", cfgPath, "rawPayload", string(raw.Config)) + var req remoteAPIRequest if err := json.Unmarshal(raw.Config, &req); err != nil { + logger.Error(err, "Failed to unmarshal remote API request") applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateError, Error: fmt.Sprintf("failed to unmarshal remote API request: %v", err)}) continue } if _, ok := seen[req.ID]; ok { // Already executed; acknowledge without re-running. + logger.Info("Remote API request already executed", "id", req.ID) applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateAcknowledged}) continue } + // Signal received and parsed; notify the backend before applying. + applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateUnacknowledged}) + if err := h(req); err != nil { + logger.Error(err, "Failed to handle remote API request") applyStatus(cfgPath, state.ApplyStatus{State: state.ApplyStateError, Error: err.Error()}) continue } diff --git a/pkg/fleet/remote_config_test.go b/pkg/fleet/remote_config_test.go index 09d3ebc670..883e39edb6 100644 --- a/pkg/fleet/remote_config_test.go +++ b/pkg/fleet/remote_config_test.go @@ -6,23 +6,34 @@ package fleet import ( + "context" "encoding/json" "testing" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" ) // Test data var testInstallerConfig = installerConfig{ ID: "test", - FileOperations: []installerConfigFileOperation{ + Operations: []fleetManagementOperation{ { - FileOperationType: "write", - FilePath: "/etc/datadog-agent/config.yaml", - Patch: json.RawMessage(`{"key":"value"}`), + Operation: OperationUpdate, + GroupVersionKind: schema.GroupVersionKind{ + Group: "datadoghq.com", + Version: "v2alpha1", + Kind: "DatadogAgent", + }, + NamespacedName: types.NamespacedName{ + Namespace: "datadog", + Name: "datadog-agent", + }, + Config: json.RawMessage(`{"spec":{"features":{"apm":{"enabled":true}}}}`), }, }, } @@ -30,7 +41,7 @@ var testInstallerConfig = installerConfig{ var testRemoteAPIRequest = remoteAPIRequest{ ID: "test", Method: "some_method", - Params: json.RawMessage(`{}`), + Params: experimentParams{}, } // callbackMock records calls made by the RC handler callbacks. @@ -64,7 +75,7 @@ func marshalRawConfig(t *testing.T, v any) state.RawConfig { func TestInstallerConfigUpdate(t *testing.T) { cb := &callbackMock{} - handler := handleInstallerConfigUpdate(cb.handleConfigs) + handler := handleInstallerConfigUpdate(context.Background(), cb.handleConfigs) raw := marshalRawConfig(t, testInstallerConfig) updates := map[string]state.RawConfig{"path/to/config": raw} @@ -81,7 +92,7 @@ func TestInstallerConfigUpdate(t *testing.T) { func TestInstallerConfigUpdateBadConfig(t *testing.T) { cb := &callbackMock{} - handler := handleInstallerConfigUpdate(cb.handleConfigs) + handler := handleInstallerConfigUpdate(context.Background(), cb.handleConfigs) updates := map[string]state.RawConfig{ "path/to/config": {Config: []byte("not json")}, @@ -101,7 +112,7 @@ func TestInstallerConfigUpdateBadConfig(t *testing.T) { func TestInstallerConfigUpdateError(t *testing.T) { cb := &callbackMock{} - handler := handleInstallerConfigUpdate(cb.handleConfigs) + handler := handleInstallerConfigUpdate(context.Background(), cb.handleConfigs) raw := marshalRawConfig(t, testInstallerConfig) updates := map[string]state.RawConfig{"path/to/config": raw} @@ -124,23 +135,66 @@ func TestInstallerConfigUpdateError(t *testing.T) { func TestRemoteAPIRequest(t *testing.T) { cb := &callbackMock{} - handler := handleUpdaterTaskUpdate(cb.handleRemoteAPIRequest) + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) raw := marshalRawConfig(t, testRemoteAPIRequest) updates := map[string]state.RawConfig{"path/to/task": raw} cb.On("handleRemoteAPIRequest", testRemoteAPIRequest).Return(nil) + cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}).Return() cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateAcknowledged}).Return() handler(updates, cb.applyStateCallback) cb.AssertCalled(t, "handleRemoteAPIRequest", testRemoteAPIRequest) + cb.AssertCalled(t, "applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}) cb.AssertCalled(t, "applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateAcknowledged}) } +func TestRemoteAPIRequestParsesParamsVersion(t *testing.T) { + // Verifies that a real UPDATER_TASK payload has params.version correctly parsed. + cb := &callbackMock{} + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) + + rawPayload := []byte(`{ + "id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + "package_name": "datadog-operator", + "trace_id": "12345678901234567890", + "parent_span_id": "11111111111111111111", + "expected_state": { + "stable": "0.0.1", + "experiment": "", + "stable_config": "0.0.1", + "client_id": "aAbBcCdDeEfFgGhHiIjJk" + }, + "method": "operator/start_datadogagent_experiment", + "params": { + "version": "aaaa-bbbb-cccc" + } + }`) + + updates := map[string]state.RawConfig{ + "datadog/2/UPDATER_TASK/a1b2c3d4/1234567890abcdef": {Config: rawPayload}, + } + + cb.On("handleRemoteAPIRequest", mock.MatchedBy(func(req remoteAPIRequest) bool { + return req.ID == "a1b2c3d4-e5f6-7890-abcd-ef1234567890" && + req.Method == "operator/start_datadogagent_experiment" && + req.Params.Version == "aaaa-bbbb-cccc" && + req.ExpectedState.StableConfig == "0.0.1" + })).Return(nil) + cb.On("applyStateCallback", mock.Anything, mock.Anything).Return() + + handler(updates, cb.applyStateCallback) + + cb.AssertCalled(t, "handleRemoteAPIRequest", mock.MatchedBy(func(req remoteAPIRequest) bool { + return req.Params.Version == "aaaa-bbbb-cccc" + })) +} + func TestRemoteAPIRequestBadConfig(t *testing.T) { cb := &callbackMock{} - handler := handleUpdaterTaskUpdate(cb.handleRemoteAPIRequest) + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) updates := map[string]state.RawConfig{ "path/to/task": {Config: []byte("not json")}, @@ -160,12 +214,13 @@ func TestRemoteAPIRequestBadConfig(t *testing.T) { func TestRemoteAPIRequestError(t *testing.T) { cb := &callbackMock{} - handler := handleUpdaterTaskUpdate(cb.handleRemoteAPIRequest) + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) raw := marshalRawConfig(t, testRemoteAPIRequest) updates := map[string]state.RawConfig{"path/to/task": raw} cb.On("handleRemoteAPIRequest", testRemoteAPIRequest).Return(assert.AnError) + cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}).Return() cb.On("applyStateCallback", "path/to/task", mock.MatchedBy(func(s state.ApplyStatus) bool { return s.State == state.ApplyStateError && s.Error != "" })).Return() @@ -173,6 +228,7 @@ func TestRemoteAPIRequestError(t *testing.T) { handler(updates, cb.applyStateCallback) cb.AssertCalled(t, "handleRemoteAPIRequest", testRemoteAPIRequest) + cb.AssertCalled(t, "applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}) cb.AssertCalled(t, "applyStateCallback", "path/to/task", mock.MatchedBy(func(s state.ApplyStatus) bool { return s.State == state.ApplyStateError })) @@ -180,22 +236,23 @@ func TestRemoteAPIRequestError(t *testing.T) { func TestRemoteAPIRequestIgnoresAlreadyExecutedRequests(t *testing.T) { cb := &callbackMock{} - handler := handleUpdaterTaskUpdate(cb.handleRemoteAPIRequest) + handler := handleUpdaterTaskUpdate(context.Background(), cb.handleRemoteAPIRequest) raw := marshalRawConfig(t, testRemoteAPIRequest) updates := map[string]state.RawConfig{"path/to/task": raw} cb.On("handleRemoteAPIRequest", testRemoteAPIRequest).Return(nil) + cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateUnacknowledged}).Return() cb.On("applyStateCallback", "path/to/task", state.ApplyStatus{State: state.ApplyStateAcknowledged}).Return() - // First call — should invoke the handler. + // First call — should invoke the handler; sends Unacknowledged then Acknowledged. handler(updates, cb.applyStateCallback) cb.AssertNumberOfCalls(t, "handleRemoteAPIRequest", 1) - // Second call with same request ID — handler must NOT be called again. + // Second call with same request ID — handler must NOT be called again; sends only Acknowledged. handler(updates, cb.applyStateCallback) cb.AssertNumberOfCalls(t, "handleRemoteAPIRequest", 1) - // applyStateCallback is called both times (acknowledge on repeat). - cb.AssertNumberOfCalls(t, "applyStateCallback", 2) + // First call: Unacknowledged + Acknowledged. Second call: Acknowledged only. + cb.AssertNumberOfCalls(t, "applyStateCallback", 3) } diff --git a/pkg/remoteconfig/updater.go b/pkg/remoteconfig/updater.go index bca9c5e92d..3374dee90a 100644 --- a/pkg/remoteconfig/updater.go +++ b/pkg/remoteconfig/updater.go @@ -62,6 +62,8 @@ type RcServiceConfiguration struct { // RCClient is the interface for subscribing to RC product updates. type RCClient interface { Subscribe(product string, fn func(update map[string]state.RawConfig, applyStateCallback func(string, state.ApplyStatus))) + GetInstallerState() []*pbgo.PackageState + SetInstallerState(packages []*pbgo.PackageState) } // Client returns the underlying RC client. @@ -186,9 +188,13 @@ func (r *RemoteConfigUpdater) Start(apiKey string, site string, clusterName stri } r.rcService = rcService + updaterTags := []string{"updater_type:datadog-operator"} + if r.serviceConf.clusterName != "" { + updaterTags = append(updaterTags, "cluster_name:"+r.serviceConf.clusterName) + } rcClient, err := client.NewClient( rcService, - client.WithUpdater(), + client.WithUpdater(updaterTags...), client.WithProducts(state.ProductAgentConfig, state.ProductOrchestratorK8sCRDs), client.WithDirectorRootOverride(r.serviceConf.cfg.GetString("site"), r.serviceConf.cfg.GetString("remote_configuration.director_root")), client.WithPollInterval(pollInterval),