Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions internal/controller/datadogagent/controller_v2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1753,7 +1753,7 @@ func Test_DDAI_ReconcileV3(t *testing.T) {
wantFunc: func(t *testing.T, c client.Client) {
expectedDDAI := getBaseDDAI(dda)
expectedDDAI.Annotations = map[string]string{
constants.MD5DDAIDeploymentAnnotationKey: "ccac39a3a007bad81d7baf8febc6445f",
constants.MD5DDAIDeploymentAnnotationKey: "d472c741d84f93e553772d29ba5ea914",
}

verifyDDAI(t, c, []v1alpha1.DatadogAgentInternal{expectedDDAI})
Expand Down Expand Up @@ -1785,7 +1785,7 @@ func Test_DDAI_ReconcileV3(t *testing.T) {
baseDDAI := getBaseDDAI(dda)
expectedDDAI := baseDDAI.DeepCopy()
expectedDDAI.Annotations = map[string]string{
constants.MD5DDAIDeploymentAnnotationKey: "f2aa21d0ecced63c091ca2df3d31e451",
constants.MD5DDAIDeploymentAnnotationKey: "9d6165c057934517d4c0623699a4257a",
}
expectedDDAI.Spec.Features.ClusterChecks.UseClusterChecksRunners = ptr.To(true)
expectedDDAI.Spec.Global.Credentials = &v2alpha1.DatadogCredentials{
Expand Down Expand Up @@ -1861,7 +1861,7 @@ func Test_DDAI_ReconcileV3(t *testing.T) {
profileDDAI := getBaseDDAI(dda)
profileDDAI.Name = "foo-profile"
profileDDAI.Annotations = map[string]string{
constants.MD5DDAIDeploymentAnnotationKey: "73e0cc1e445001e326507ac23654104e",
constants.MD5DDAIDeploymentAnnotationKey: "d868cbefaa12de8dfc8e94552dcf8528",
}
profileDDAI.Labels[constants.ProfileLabelKey] = "foo-profile"
profileDDAI.Spec.Override = map[v2alpha1.ComponentName]*v2alpha1.DatadogAgentComponentOverride{
Expand Down Expand Up @@ -2095,7 +2095,7 @@ func getBaseDDAI(dda *v2alpha1.DatadogAgent) v1alpha1.DatadogAgentInternal {
func getDefaultDDAI(dda *v2alpha1.DatadogAgent) v1alpha1.DatadogAgentInternal {
expectedDDAI := getBaseDDAI(dda)
expectedDDAI.Annotations = map[string]string{
constants.MD5DDAIDeploymentAnnotationKey: "f98c0497c66e2747f6d116970ab8f0b1",
constants.MD5DDAIDeploymentAnnotationKey: "1a68caa8fd645f09d034b2b97a61f543",
}
expectedDDAI.Spec.Override = map[v2alpha1.ComponentName]*v2alpha1.DatadogAgentComponentOverride{
v2alpha1.NodeAgentComponentName: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ const (

defaultGPUMonitoringEnabled bool = false

defaultServiceDiscoveryEnabled bool = false

defaultAPMEnabled bool = true
defaultAPMHostPortEnabled bool = false
defaultAPMHostPort int32 = 8126
Expand Down Expand Up @@ -304,7 +302,9 @@ func defaultFeaturesConfig(ddaSpec *v2alpha1.DatadogAgentSpec) {
if ddaSpec.Features.ServiceDiscovery == nil {
ddaSpec.Features.ServiceDiscovery = &v2alpha1.ServiceDiscoveryFeatureConfig{}
}
apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.ServiceDiscovery.Enabled, defaultServiceDiscoveryEnabled)
// Enabled is intentionally not defaulted to false — Enabled=nil (not configured by user)
// must remain distinguishable from Enabled=false (explicit opt-out) so that the feature
// can be auto-activated by the operator in the future without exposing a CRD field.

// GPU monitoring feature
if ddaSpec.Features.GPU == nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,9 +213,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -362,9 +360,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -461,9 +457,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -597,9 +591,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -758,9 +750,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -914,9 +904,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -1070,9 +1058,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -1235,9 +1221,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -1391,9 +1375,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -1550,9 +1532,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -1752,9 +1732,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -1877,9 +1855,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -2034,9 +2010,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -2214,9 +2188,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down Expand Up @@ -2373,9 +2345,7 @@ func Test_defaultFeatures(t *testing.T) {
EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{
Enabled: ptr.To(defaultEBPFCheckEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{
Enabled: ptr.To(defaultServiceDiscoveryEnabled),
},
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{},
GPU: &v2alpha1.GPUFeatureConfig{
Enabled: ptr.To(defaultGPUMonitoringEnabled),
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
package servicediscovery

import (
"fmt"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
Expand All @@ -30,6 +32,11 @@ func buildFeature(*feature.Options) feature.Feature {
}

type serviceDiscoveryFeature struct {
userExplicitlyEnabled bool
// features holds a pointer to the live DDA features struct so that ManageNodeAgent
// can re-evaluate hasOtherSystemProbeFeatures after Remote Config state has been
// merged by other features' Configure calls (e.g. USM merges RC state into the spec).
features *v2alpha1.DatadogFeatures
}

// ID returns the ID of the Feature
Expand All @@ -39,17 +46,58 @@ func (f *serviceDiscoveryFeature) ID() feature.IDType {

// Configure is used to configure the feature from a v2alpha1.DatadogAgent instance.
func (f *serviceDiscoveryFeature) Configure(_ metav1.Object, ddaSpec *v2alpha1.DatadogAgentSpec, _ *v2alpha1.RemoteConfigConfiguration) (reqComp feature.RequiredComponents) {
if ddaSpec.Features != nil && ddaSpec.Features.ServiceDiscovery != nil && apiutils.BoolValue(ddaSpec.Features.ServiceDiscovery.Enabled) {
reqComp.Agent = feature.RequiredComponent{
IsRequired: ptr.To(true),
Containers: []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName},
}
if ddaSpec.Features == nil || ddaSpec.Features.ServiceDiscovery == nil {
return reqComp
}

sd := ddaSpec.Features.ServiceDiscovery

// Enabled=nil (not configured) or Enabled=false (explicit opt-out) → disabled.
if !apiutils.BoolValue(sd.Enabled) {
return reqComp
}

reqComp.Agent = feature.RequiredComponent{
IsRequired: ptr.To(true),
Containers: []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName},
}

f.features = ddaSpec.Features
// True only when Enabled=true (user opted in); nil means operator auto-enabled (future path).
f.userExplicitlyEnabled = sd.Enabled != nil && *sd.Enabled

return reqComp
}

// systemProbeLiteCommand returns the shell command for the system-probe container when
// system-probe-lite is preferred. If userOptedIn is true (user explicitly enabled discovery),
// system-probe is used as the fallback — the user has accepted the resource cost.
// Otherwise (enabled by default), the fallback is sleep infinity to avoid unexpectedly
// running system-probe on older agent images where the discovery feature may not be supported.
func systemProbeLiteCommand(socketPath string, userOptedIn bool) string {
fallback := "sleep infinity"
if userOptedIn {
fallback = "system-probe --config=/etc/datadog-agent/system-probe.yaml"
}
return fmt.Sprintf("system-probe-lite run --socket %s --log-level ${DD_LOG_LEVEL:-info} || %s", socketPath, fallback)
}

// hasOtherSystemProbeFeatures returns true if any feature besides service discovery
// requires the full system-probe binary. When true, system-probe-lite cannot be used.
func hasOtherSystemProbeFeatures(features *v2alpha1.DatadogFeatures) bool {
if features == nil {
return false
}
return (features.NPM != nil && apiutils.BoolValue(features.NPM.Enabled)) ||
(features.CWS != nil && apiutils.BoolValue(features.CWS.Enabled)) ||
(features.CSPM != nil && apiutils.BoolValue(features.CSPM.Enabled) && apiutils.BoolValue(features.CSPM.RunInSystemProbe)) ||
(features.USM != nil && apiutils.BoolValue(features.USM.Enabled)) ||
(features.OOMKill != nil && apiutils.BoolValue(features.OOMKill.Enabled)) ||
(features.TCPQueueLength != nil && apiutils.BoolValue(features.TCPQueueLength.Enabled)) ||
(features.EBPFCheck != nil && apiutils.BoolValue(features.EBPFCheck.Enabled)) ||
(features.GPU != nil && apiutils.BoolValue(features.GPU.Enabled) && apiutils.BoolValue(features.GPU.PrivilegedMode))
}

// ManageDependencies allows a feature to manage its dependencies.
// Feature's dependencies should be added in the store.
func (f *serviceDiscoveryFeature) ManageDependencies(managers feature.ResourceManagers, provider string) error {
Expand Down Expand Up @@ -105,6 +153,20 @@ func (f *serviceDiscoveryFeature) ManageNodeAgent(managers feature.PodTemplateMa
managers.EnvVar().AddEnvVarToContainer(apicommon.CoreAgentContainerName, socketEnvVar)
managers.EnvVar().AddEnvVarToContainer(apicommon.SystemProbeContainerName, socketEnvVar)

// Direct PodTemplateSpec mutation: no managers API for command overrides.
// Re-evaluate here (not cached from Configure) so that RC state merged by other
// features' Configure calls (e.g. USM) is taken into account.
if !hasOtherSystemProbeFeatures(f.features) {
for i := range managers.PodTemplateSpec().Spec.Containers {
c := &managers.PodTemplateSpec().Spec.Containers[i]
if c.Name == string(apicommon.SystemProbeContainerName) {
c.Command = []string{"/bin/sh", "-c"}
c.Args = []string{systemProbeLiteCommand(common.DefaultSystemProbeSocketPath, f.userExplicitlyEnabled)}
Comment on lines +163 to +164
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we 100% sure we want to change both cmd and args ? Cmd changes the entrypoint which might have un-intended consequences, see #2864 for a more detailed explanation

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason we change both is because we need the shell logic behind it. One problem we have with the Operator and Helm chart, is that we can't know which agent version we are running, which means if you modify only the args:

  • you might try to rely on system-probe's fallback to SPL mechanism. But if you are on a system-probe version that doesn't have the support, it will crash because of an unknown CLI option.
  • on supported versions, you spawned the full system-probe to have it check a config we already have in the operator, which means a huge spike in resource consumption which can be enough to trigger pods memory limits and defeats the purpose of having SPL in the first place.
  • If only discovery is explicitly enabled, on non-supported versions, you might end up with system-probe starting up, and figuring out it doesn't need to run because it does not have discovery at all, and exits. In this case, you will have a crashloopbackoff.

By using a shell here, we can try to do best effort, run SPL directly when possible, or the full system-probe, and fallback to a sleep infinity on non-supported version, that will prevent the container from going into a CLB.

Of course, we are open to suggestions on improving this.

break
}
}
}

return nil
}

Expand Down
Loading
Loading