diff --git a/cmd/katalyst-agent/app/options/dynamic/adminqos/reclaimedresource/memoryheadroom/util_based.go b/cmd/katalyst-agent/app/options/dynamic/adminqos/reclaimedresource/memoryheadroom/util_based.go index afb87eb22f..50fb83da40 100644 --- a/cmd/katalyst-agent/app/options/dynamic/adminqos/reclaimedresource/memoryheadroom/util_based.go +++ b/cmd/katalyst-agent/app/options/dynamic/adminqos/reclaimedresource/memoryheadroom/util_based.go @@ -27,6 +27,7 @@ const ( defaultFreeBasedRatio = 0.6 defaultStaticBasedCapacity = 20 << 30 // 20GB defaultCacheBasedRatio = 0 + defaultRequestBasedRatio = 0.1 defaultMaxOversoldRate = 2.0 ) @@ -35,6 +36,7 @@ type UtilBasedOptions struct { FreeBasedRatio float64 StaticBasedCapacity float64 CacheBasedRatio float64 + RequestBasedRatio float64 MaxOversoldRate float64 } @@ -44,6 +46,7 @@ func NewUtilBasedOptions() *UtilBasedOptions { FreeBasedRatio: defaultFreeBasedRatio, StaticBasedCapacity: defaultStaticBasedCapacity, CacheBasedRatio: defaultCacheBasedRatio, + RequestBasedRatio: defaultRequestBasedRatio, MaxOversoldRate: defaultMaxOversoldRate, } } @@ -57,6 +60,8 @@ func (o *UtilBasedOptions) AddFlags(fs *pflag.FlagSet) { "the static oversold memory size by bytes") fs.Float64Var(&o.CacheBasedRatio, "memory-headroom-cache-based-ratio", o.CacheBasedRatio, "the rate of cache oversold, 0 means disable cache oversold") + fs.Float64Var(&o.RequestBasedRatio, "memory-headroom-request-based-ratio", o.RequestBasedRatio, + "the rate of reserved memory for request, 0 means disable reserved memory") fs.Float64Var(&o.MaxOversoldRate, "memory-headroom-max-oversold-rate", o.MaxOversoldRate, "the max oversold rate of memory headroom to the memory limit of reclaimed_cores cgroup") } @@ -66,6 +71,7 @@ func (o *UtilBasedOptions) ApplyTo(c *memoryheadroom.MemoryUtilBasedConfiguratio c.FreeBasedRatio = o.FreeBasedRatio c.StaticBasedCapacity = o.StaticBasedCapacity c.CacheBasedRatio = o.CacheBasedRatio + c.RequestBasedRatio = o.RequestBasedRatio c.MaxOversoldRate = o.MaxOversoldRate return nil } diff --git a/cmd/katalyst-agent/app/options/qrm/cpu_plugin.go b/cmd/katalyst-agent/app/options/qrm/cpu_plugin.go index 3671789a02..7b261ffc16 100644 --- a/cmd/katalyst-agent/app/options/qrm/cpu_plugin.go +++ b/cmd/katalyst-agent/app/options/qrm/cpu_plugin.go @@ -54,6 +54,7 @@ type CPUDynamicPolicyOptions struct { EnableCPUBurst bool EnableDefaultDedicatedCoresCPUBurst bool EnableDefaultSharedCoresCPUBurst bool + EnableCPUBurstForMainContainerOnly bool *irqtuner.IRQTunerOptions *hintoptimizer.HintOptimizerOptions } @@ -140,6 +141,8 @@ func (o *CPUOptions) AddFlags(fss *cliflag.NamedFlagSets) { o.EnableDefaultSharedCoresCPUBurst, "if set true, it will enable cpu burst for shared cores by default") fs.BoolVar(&o.EnableDefaultDedicatedCoresCPUBurst, "enable-default-dedicated-cores-cpu-burst", o.EnableDefaultDedicatedCoresCPUBurst, "if set true, it will enable cpu burst for dedicated cores by default") + fs.BoolVar(&o.EnableCPUBurstForMainContainerOnly, "enable-cpu-burst-for-main-container-only", + o.EnableCPUBurstForMainContainerOnly, "if set true, it will enable cpu burst for main container only") o.HintOptimizerOptions.AddFlags(fss) o.IRQTunerOptions.AddFlags(fss) } @@ -164,6 +167,7 @@ func (o *CPUOptions) ApplyTo(conf *qrmconfig.CPUQRMPluginConfig) error { conf.EnableCPUBurst = o.EnableCPUBurst conf.EnableDefaultDedicatedCoresCPUBurst = o.EnableDefaultDedicatedCoresCPUBurst conf.EnableDefaultSharedCoresCPUBurst = o.EnableDefaultSharedCoresCPUBurst + conf.EnableCPUBurstForMainContainerOnly 
= o.EnableCPUBurstForMainContainerOnly if err := o.HintOptimizerOptions.ApplyTo(conf.HintOptimizerConfiguration); err != nil { return err } diff --git a/cmd/katalyst-agent/app/options/qrm/mb_plugin.go b/cmd/katalyst-agent/app/options/qrm/mb_plugin.go index 4f0501bb7f..788b07e51e 100644 --- a/cmd/katalyst-agent/app/options/qrm/mb_plugin.go +++ b/cmd/katalyst-agent/app/options/qrm/mb_plugin.go @@ -52,6 +52,7 @@ type MBOptions struct { CrossDomainGroups []string ResetResctrlOnly bool LocalIsVictimAndTotalIsAllRead bool + ExtraGroupPriorities map[string]int } func NewMBOptions() *MBOptions { @@ -89,6 +90,8 @@ func (o *MBOptions) AddFlags(fss *cliflag.NamedFlagSets) { o.ResetResctrlOnly, "not to run mb plugin really, and only reset to ensure resctrl FS in default status") fs.BoolVar(&o.LocalIsVictimAndTotalIsAllRead, "mb-local-is-victim", o.LocalIsVictimAndTotalIsAllRead, "turn resctrl local as victim") + fs.StringToIntVar(&o.ExtraGroupPriorities, "mb-extra-group-priorities", + o.ExtraGroupPriorities, "extra resctrl groups with priorities") } func (o *MBOptions) ApplyTo(conf *qrm.MBQRMPluginConfig) error { @@ -103,5 +106,6 @@ func (o *MBOptions) ApplyTo(conf *qrm.MBQRMPluginConfig) error { conf.DomainGroupAwareCapacityPCT = o.DomainGroupAwareCapacityPCT conf.ResetResctrlOnly = o.ResetResctrlOnly conf.LocalIsVictimAndTotalIsAllRead = o.LocalIsVictimAndTotalIsAllRead + conf.ExtraGroupPriorities = o.ExtraGroupPriorities return nil } diff --git a/cmd/katalyst-agent/app/options/qrm/sriov_plugin.go b/cmd/katalyst-agent/app/options/qrm/sriov_plugin.go index c38ca3c557..8b09edcc2f 100644 --- a/cmd/katalyst-agent/app/options/qrm/sriov_plugin.go +++ b/cmd/katalyst-agent/app/options/qrm/sriov_plugin.go @@ -46,6 +46,7 @@ type SriovStaticPolicyOptions struct { } type SriovDynamicPolicyOptions struct { + PodRequiredAnnotations map[string]string LargeSizeVFQueueCount int LargeSizeVFCPUThreshold int LargeSizeVFFailOnExhaustion bool @@ -69,6 +70,7 @@ func NewSriovOptions() *SriovOptions { MaxBondingVFQueueCount: math.MaxInt32, }, SriovDynamicPolicyOptions: SriovDynamicPolicyOptions{ + PodRequiredAnnotations: map[string]string{}, LargeSizeVFQueueCount: 32, LargeSizeVFCPUThreshold: 24, LargeSizeVFFailOnExhaustion: true, @@ -90,6 +92,7 @@ func (o *SriovOptions) AddFlags(fss *cliflag.NamedFlagSets) { fs.StringToStringVar(&o.ExtraAnnotations, "sriov-vf-extra-annotations", o.ExtraAnnotations, "Extra annotations for sriov vf") fs.IntVar(&o.MinBondingVFQueueCount, "static-min-bonding-vf-queue-count", o.MinBondingVFQueueCount, "Min queue count of bonding VF can be allocated in static policy") fs.IntVar(&o.MaxBondingVFQueueCount, "static-max-bonding-vf-queue-count", o.MaxBondingVFQueueCount, "Max queue count of bonding VF can be allocated in static policy") + fs.StringToStringVar(&o.PodRequiredAnnotations, "dynamic-pod-vf-required-annotations", o.PodRequiredAnnotations, "Required annotations for pod to dynamic allocate VF") fs.IntVar(&o.LargeSizeVFQueueCount, "dynamic-large-size-vf-queue-count", o.LargeSizeVFQueueCount, "Queue count for VF to be identified as large size VF in dynamic policy") fs.IntVar(&o.LargeSizeVFCPUThreshold, "dynamic-large-size-vf-cpu-threshold", o.LargeSizeVFCPUThreshold, "Threshold of cpu quantity to allocate large size VF in dynamic policy") fs.BoolVar(&o.LargeSizeVFFailOnExhaustion, "dynamic-large-size-vf-fail-on-exhaustion", o.LargeSizeVFFailOnExhaustion, "Should fail or not when large size VF is exhausted in dynamic policy") @@ -109,6 +112,7 @@ func (s *SriovOptions) ApplyTo(config 
*qrmconfig.SriovQRMPluginConfig) error { } config.MinBondingVFQueueCount = s.MinBondingVFQueueCount config.MaxBondingVFQueueCount = s.MaxBondingVFQueueCount + config.PodRequiredAnnotations = s.PodRequiredAnnotations config.LargeSizeVFQueueCount = s.LargeSizeVFQueueCount config.LargeSizeVFCPUThreshold = s.LargeSizeVFCPUThreshold config.LargeSizeVFFailOnExhaustion = s.LargeSizeVFFailOnExhaustion diff --git a/cmd/katalyst-agent/app/options/reporter/reporter_base.go b/cmd/katalyst-agent/app/options/reporter/reporter_base.go index 397f1dbabf..9c1ae383db 100644 --- a/cmd/katalyst-agent/app/options/reporter/reporter_base.go +++ b/cmd/katalyst-agent/app/options/reporter/reporter_base.go @@ -35,6 +35,7 @@ const ( type GenericReporterOptions struct { CollectInterval time.Duration InnerPlugins []string + AgentReporters []string RefreshLatestCNRPeriod time.Duration DefaultCNRLabels map[string]string } @@ -43,6 +44,7 @@ type GenericReporterOptions struct { func NewGenericReporterOptions() *GenericReporterOptions { return &GenericReporterOptions{ InnerPlugins: []string{"*"}, + AgentReporters: []string{"*"}, CollectInterval: defaultCollectInterval, RefreshLatestCNRPeriod: defaultRefreshLatestCNRPeriod, DefaultCNRLabels: make(map[string]string), @@ -61,6 +63,9 @@ func (o *GenericReporterOptions) AddFlags(fss *cliflag.NamedFlagSets) { fs.StringSliceVar(&o.InnerPlugins, "reporter-plugins", o.InnerPlugins, fmt.Sprintf(""+ "A list of reporter plugins to enable. '*' enables all on-by-default reporter plugins, 'foo' enables the reporter plugin "+ "named 'foo', '-foo' disables the reporter plugin named 'foo'")) + fs.StringSliceVar(&o.AgentReporters, "agent-reporters", o.AgentReporters, fmt.Sprintf(""+ + "A list of agent reporters to enable. '*' enables all on-by-default reporters, 'foo' enables the reporter "+ + "named 'foo', '-foo' disables the reporter named 'foo'")) fs.StringToStringVar(&o.DefaultCNRLabels, "default-cnr-labels", o.DefaultCNRLabels, "the default labels of cnr created by agent, this config must be consistent with the label-selector in katalyst-controller.") } @@ -69,6 +74,7 @@ func (o *GenericReporterOptions) AddFlags(fss *cliflag.NamedFlagSets) { func (o *GenericReporterOptions) ApplyTo(c *reporterconfig.GenericReporterConfiguration) error { c.CollectInterval = o.CollectInterval c.InnerPlugins = o.InnerPlugins + c.AgentReporters = o.AgentReporters c.RefreshLatestCNRPeriod = o.RefreshLatestCNRPeriod c.DefaultCNRLabels = o.DefaultCNRLabels return nil diff --git a/cmd/katalyst-controller/app/controller/lifecycle.go b/cmd/katalyst-controller/app/controller/lifecycle.go index 63cfaf91d4..c7d9893855 100644 --- a/cmd/katalyst-controller/app/controller/lifecycle.go +++ b/cmd/katalyst-controller/app/controller/lifecycle.go @@ -41,18 +41,20 @@ func StartLifeCycleController(ctx context.Context, controlCtx *katalystbase.Gene err error ) - cnrLifecycle, err = lifecycle.NewCNRLifecycle(ctx, - conf.GenericConfiguration, - conf.GenericControllerConfiguration, - conf.ControllersConfiguration.CNRLifecycleConfig, - controlCtx.Client, - controlCtx.KubeInformerFactory.Core().V1().Nodes(), - controlCtx.InternalInformerFactory.Node().V1alpha1().CustomNodeResources(), - controlCtx.EmitterPool.GetDefaultMetricsEmitter(), - ) - if err != nil { - klog.Errorf("failed to new CNR lifecycle controller") - return false, err + if conf.LifeCycleConfig.EnableCNRLifecycle { + cnrLifecycle, err = lifecycle.NewCNRLifecycle(ctx, + conf.GenericConfiguration, + conf.GenericControllerConfiguration, + 
conf.ControllersConfiguration.CNRLifecycleConfig, + controlCtx.Client, + controlCtx.KubeInformerFactory.Core().V1().Nodes(), + controlCtx.InternalInformerFactory.Node().V1alpha1().CustomNodeResources(), + controlCtx.EmitterPool.GetDefaultMetricsEmitter(), + ) + if err != nil { + klog.Errorf("failed to new CNR lifecycle controller") + return false, err + } } if conf.LifeCycleConfig.EnableCNCLifecycle { diff --git a/cmd/katalyst-controller/app/options/lifecycle.go b/cmd/katalyst-controller/app/options/lifecycle.go index 347ad94117..9322080130 100644 --- a/cmd/katalyst-controller/app/options/lifecycle.go +++ b/cmd/katalyst-controller/app/options/lifecycle.go @@ -47,6 +47,7 @@ type HealthzOptions struct { type LifeCycleOptions struct { EnableHealthz bool EnableCNCLifecycle bool + EnableCNRLifecycle bool *HealthzOptions } @@ -56,6 +57,7 @@ func NewLifeCycleOptions() *LifeCycleOptions { return &LifeCycleOptions{ EnableHealthz: false, EnableCNCLifecycle: true, + EnableCNRLifecycle: true, HealthzOptions: &HealthzOptions{ DryRun: false, NodeSelector: "", @@ -83,6 +85,8 @@ func (o *LifeCycleOptions) AddFlags(fss *cliflag.NamedFlagSets) { "whether to enable the healthz controller") fs.BoolVar(&o.EnableCNCLifecycle, "cnc-lifecycle-enabled", o.EnableCNCLifecycle, "whether to enable the cnc lifecycle controller") + fs.BoolVar(&o.EnableCNRLifecycle, "cnr-lifecycle-enabled", o.EnableCNRLifecycle, + "whether to enable the cnr lifecycle controller") fs.BoolVar(&o.DryRun, "healthz-dry-run", o.DryRun, "a bool to enable and disable dry-run logic of healthz controller.") @@ -117,6 +121,7 @@ func (o *LifeCycleOptions) AddFlags(fss *cliflag.NamedFlagSets) { func (o *LifeCycleOptions) ApplyTo(c *controller.LifeCycleConfig) error { c.EnableHealthz = o.EnableHealthz c.EnableCNCLifecycle = o.EnableCNCLifecycle + c.EnableCNRLifecycle = o.EnableCNRLifecycle c.DryRun = o.DryRun diff --git a/go.mod b/go.mod index e96fb39627..e52152e7ca 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( github.com/google/uuid v1.3.0 github.com/h2non/gock v1.2.0 github.com/klauspost/cpuid/v2 v2.2.6 - github.com/kubewharf/katalyst-api v0.5.11-0.20260324091059-cae1d07d9882 + github.com/kubewharf/katalyst-api v0.5.11-0.20260423040236-f1a2330d266e github.com/moby/sys/mountinfo v0.6.2 github.com/montanaflynn/stats v0.7.1 github.com/opencontainers/runc v1.1.6 diff --git a/go.sum b/go.sum index 748426d8ac..ddbcde668c 100644 --- a/go.sum +++ b/go.sum @@ -574,8 +574,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubewharf/katalyst-api v0.5.11-0.20260324091059-cae1d07d9882 h1:4KYYk/mAJAOIYDW5V+43wnjnP8p3bwHXAkAcw/AbzuQ= -github.com/kubewharf/katalyst-api v0.5.11-0.20260324091059-cae1d07d9882/go.mod h1:BZMVGVl3EP0eCn5xsDgV41/gjYkoh43abIYxrB10e3k= +github.com/kubewharf/katalyst-api v0.5.11-0.20260423040236-f1a2330d266e h1:/0LP1rzsXEI09g3eqUZCJcyv/XGU8MofdFyi7NBtZA0= +github.com/kubewharf/katalyst-api v0.5.11-0.20260423040236-f1a2330d266e/go.mod h1:BZMVGVl3EP0eCn5xsDgV41/gjYkoh43abIYxrB10e3k= github.com/kubewharf/kubelet v1.24.6-kubewharf-pre.2 h1:2KLMzgntDypiFJRX4fSQJCD+a6zIgHuhcAzd/7nAGmU= github.com/kubewharf/kubelet v1.24.6-kubewharf-pre.2/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/kyoh86/exportloopref v0.1.7/go.mod 
h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8= diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuburst/manager.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuburst/manager.go index 36ddd83581..11918989f0 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuburst/manager.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuburst/manager.go @@ -26,6 +26,7 @@ import ( "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/util" + qrmutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic" "github.com/kubewharf/katalyst-core/pkg/metaserver" @@ -90,10 +91,15 @@ func (m *managerImpl) UpdateCPUBurst(conf *config.Configuration, dynamicConfig * continue } + var mainContainerName string + if conf.EnableCPUBurstForMainContainerOnly { + mainContainerName = qrmutil.GetMainContainer(pod, conf.MainContainerAnnotationKey) + } + switch cpuBurstPolicy { case consts.PodAnnotationCPUEnhancementCPUBurstPolicyClosed: // For closed policy, we just set the cpu burst value to be 0. - if err = m.updateCPUBurstByPercent(0, pod); err != nil { + if err = m.updateCPUBurstByPercent(0, pod, mainContainerName); err != nil { errList = append(errList, fmt.Errorf("error setting cpu burst for policy %s for pod %s: %v", consts.PodAnnotationCPUEnhancementCPUBurstPolicyClosed, pod.Name, err)) } @@ -107,7 +113,7 @@ func (m *managerImpl) UpdateCPUBurst(conf *config.Configuration, dynamicConfig * continue } - if err = m.updateCPUBurstByPercent(cpuBurstPercent, pod); err != nil { + if err = m.updateCPUBurstByPercent(cpuBurstPercent, pod, mainContainerName); err != nil { errList = append(errList, fmt.Errorf("error setting cpu burst for policy %s for pod %s: %v", consts.PodAnnotationCPUEnhancementCPUBurstPolicyStatic, pod.Name, err)) } @@ -123,13 +129,20 @@ func (m *managerImpl) UpdateCPUBurst(conf *config.Configuration, dynamicConfig * // updateCPUBurstByPercent updates the value of cpu burst for static policy by taking the // cpu quota from cgroup and calculating the cpu burst value by taking cpu quota * percent / 100. -func (m *managerImpl) updateCPUBurstByPercent(percent float64, pod *v1.Pod) error { +// If the main container name is not empty, we update the cpu burst only for that container, +// otherwise we update the cpu burst for all containers. 
+func (m *managerImpl) updateCPUBurstByPercent(percent float64, pod *v1.Pod, mainContainerName string) error { var errList []error podUID := string(pod.GetUID()) podName := pod.Name for _, container := range pod.Spec.Containers { containerName := container.Name + // skip updating the container if the container is not the main container + if mainContainerName != "" && containerName != mainContainerName { + continue + } + containerID, err := m.metaServer.GetContainerID(podUID, containerName) if err != nil { general.Errorf("get container id failed, pod: %s, podName: %s, container: %s(%s), err: %v", podUID, podName, containerName, containerID, err) diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuburst/manager_test.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuburst/manager_test.go index 76d1c93733..3ca2e0fbf1 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuburst/manager_test.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuburst/manager_test.go @@ -53,9 +53,6 @@ func generateTestMetaServer(pods []*v1.Pod) *metaserver.MetaServer { func TestManagerImpl_UpdateCPUBurst(t *testing.T) { t.Parallel() - - conf := config.NewConfiguration() - conf.QoSConfiguration = generic.NewQoSConfiguration() type resultState map[string]uint64 tests := []struct { @@ -65,6 +62,8 @@ func TestManagerImpl_UpdateCPUBurst(t *testing.T) { wantErr bool adminQoSConfig *adminqos.AdminQoSConfiguration wantResults resultState + enableMainOnly bool + mainAnnoKey string }{ { name: "dedicated cores pods with no cpu burst policy means there is no change in cpu burst value", @@ -517,6 +516,93 @@ func TestManagerImpl_UpdateCPUBurst(t *testing.T) { "/sys/fs/cgroup/cpu/test-pod-2/test-container-2-id": 50, }, }, + { + name: "cpu burst main container only updates only the specified main container", + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + UID: "test-pod", + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationCPUEnhancementKey: `{"cpu_burst_policy":"static", "cpu_burst_percent":"50"}`, + "test.main.container": "test-container-2", + }, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "test-container-1"}, + {Name: "test-container-2"}, + }, + }, + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + {Name: "test-container-1", ContainerID: "test-container-1-id"}, + {Name: "test-container-2", ContainerID: "test-container-2-id"}, + }, + }, + }, + }, + mocks: func(s resultState) { + mockey.Mock(common.IsContainerCgroupExist).Return(true, nil).Build() + mockey.Mock(common.GetContainerAbsCgroupPath).To(func(_, podUID, containerID string) (string, error) { + return "/sys/fs/cgroup/cpu/" + podUID + "/" + containerID, nil + }).Build() + mockey.Mock(manager.GetCPUWithAbsolutePath).Return(&common.CPUStats{CpuQuota: 200}, nil).Build() + mockey.Mock(manager.ApplyCPUWithAbsolutePath). 
+ To(func(absPath string, cpuData *common.CPUData) error { + s[absPath] = *cpuData.CpuBurst + return nil + }).Build() + }, + enableMainOnly: true, + mainAnnoKey: "test.main.container", + wantResults: resultState{ + "/sys/fs/cgroup/cpu/test-pod/test-container-2-id": 100, + }, + }, + { + name: "cpu burst main container only falls back to the first container when main container annotation is missing", + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + UID: "test-pod", + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationCPUEnhancementKey: `{"cpu_burst_policy":"static", "cpu_burst_percent":"50"}`, + }, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "test-container-1"}, + {Name: "test-container-2"}, + }, + }, + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + {Name: "test-container-1", ContainerID: "test-container-1-id"}, + {Name: "test-container-2", ContainerID: "test-container-2-id"}, + }, + }, + }, + }, + mocks: func(s resultState) { + mockey.Mock(common.IsContainerCgroupExist).Return(true, nil).Build() + mockey.Mock(common.GetContainerAbsCgroupPath).To(func(_, podUID, containerID string) (string, error) { + return "/sys/fs/cgroup/cpu/" + podUID + "/" + containerID, nil + }).Build() + mockey.Mock(manager.GetCPUWithAbsolutePath).Return(&common.CPUStats{CpuQuota: 200}, nil).Build() + mockey.Mock(manager.ApplyCPUWithAbsolutePath). + To(func(absPath string, cpuData *common.CPUData) error { + s[absPath] = *cpuData.CpuBurst + return nil + }).Build() + }, + enableMainOnly: true, + mainAnnoKey: "test.main.container", + wantResults: resultState{ + "/sys/fs/cgroup/cpu/test-pod/test-container-1-id": 100, + }, + }, { name: "get container ID fails does not return error", pods: []*v1.Pod{ @@ -640,6 +726,13 @@ func TestManagerImpl_UpdateCPUBurst(t *testing.T) { tt.mocks(results) } + conf := config.NewConfiguration() + conf.QoSConfiguration = generic.NewQoSConfiguration() + conf.EnableCPUBurstForMainContainerOnly = tt.enableMainOnly + if tt.mainAnnoKey != "" { + conf.MainContainerAnnotationKey = tt.mainAnnoKey + } + cpuBurstManager := newManager(generateTestMetaServer(tt.pods)) dynamicConfig := dynamic.NewDynamicAgentConfiguration() diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go index fa312dc903..a63bb5b825 100644 --- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go @@ -107,6 +107,7 @@ func NewBasePlugin( // Run starts the asynchronous tasks of the plugin func (p *BasePlugin) Run(stopCh <-chan struct{}) { + go p.DeviceTopologyRegistry.Run(stopCh) go func() { select { case <-p.stateInitializedCh: @@ -117,7 +118,6 @@ func (p *BasePlugin) Run(stopCh <-chan struct{}) { return } }() - go p.DeviceTopologyRegistry.Run(stopCh) } // GetState may return a nil state because the state is only initialized when InitState is called. 
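
For reference, a minimal runnable sketch of the main-container-only filtering that the cpuburst manager change above introduces. The types and the getMainContainerName helper below are hypothetical stand-ins (for *v1.Pod and qrmutil.GetMainContainer respectively); only the annotation fallback and the skip logic mirror the patch, the actual cgroup update is faked with a print.

package main

import "fmt"

// pod is a stripped-down stand-in for *v1.Pod, used only for this sketch.
type pod struct {
	annotations    map[string]string
	containerNames []string
}

// getMainContainerName mimics the lookup the patch delegates to
// qrmutil.GetMainContainer: read the main-container annotation and fall back
// to the first container when the annotation is missing. Hypothetical helper.
func getMainContainerName(p pod, annotationKey string) string {
	if name, ok := p.annotations[annotationKey]; ok && name != "" {
		return name
	}
	if len(p.containerNames) > 0 {
		return p.containerNames[0]
	}
	return ""
}

// applyBurst stands in for updateCPUBurstByPercent: when mainContainerName is
// non-empty, only that container is touched; otherwise every container is.
func applyBurst(p pod, mainContainerName string, percent float64) {
	for _, name := range p.containerNames {
		// skip updating the container if it is not the main container
		if mainContainerName != "" && name != mainContainerName {
			continue
		}
		fmt.Printf("container %s: cpu burst set to quota * %.0f%%\n", name, percent)
	}
}

func main() {
	p := pod{
		annotations:    map[string]string{"test.main.container": "test-container-2"},
		containerNames: []string{"test-container-1", "test-container-2"},
	}
	// enableMainOnly corresponds to EnableCPUBurstForMainContainerOnly.
	enableMainOnly := true
	var mainContainerName string
	if enableMainOnly {
		mainContainerName = getMainContainerName(p, "test.main.container")
	}
	applyBurst(p, mainContainerName, 50)
}
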
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go index 2404f2ca27..f924692833 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go @@ -21,6 +21,7 @@ import ( "fmt" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-api/pkg/consts" @@ -79,9 +80,244 @@ func (p *GPUDevicePlugin) UpdateAllocatableAssociatedDevices( } func (p *GPUDevicePlugin) GetAssociatedDeviceTopologyHints( - context.Context, *pluginapi.AssociatedDeviceRequest, + _ context.Context, req *pluginapi.AssociatedDeviceRequest, ) (*pluginapi.AssociatedDeviceHintsResponse, error) { - return &pluginapi.AssociatedDeviceHintsResponse{}, nil + if req == nil || req.ResourceRequest == nil { + return nil, fmt.Errorf("GetAssociatedDeviceTopologyHints got invalid request") + } + + resReq := req.ResourceRequest + qosLevel, err := qrmutil.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + general.InfoS("GetAssociatedDeviceTopologyHints called", + "podNamespace", resReq.PodNamespace, + "podName", resReq.PodName, + "containerName", resReq.ContainerName, + "deviceName", req.DeviceName, + "qosLevel", qosLevel, + ) + + // Find the target device request + var targetDeviceReq *pluginapi.DeviceRequest + for _, deviceRequest := range req.DeviceRequest { + if deviceRequest.DeviceName == req.DeviceName { + targetDeviceReq = deviceRequest + break + } + } + + if targetDeviceReq == nil { + return nil, fmt.Errorf("no target device plugin found for target device %s", req.DeviceName) + } + + var hints []*pluginapi.TopologyHint + + // 1. Check if GPU device allocation already exists. + gpuAllocationInfo := p.GetState().GetAllocationInfo(gpuconsts.GPUDeviceType, resReq.PodUid, resReq.ContainerName) + if gpuAllocationInfo != nil && gpuAllocationInfo.TopologyAwareAllocations != nil { + general.InfoS("generating hints from existing GPU allocation", + "podNamespace", resReq.PodNamespace, + "podName", resReq.PodName, + "containerName", resReq.ContainerName, + "deviceName", req.DeviceName, + ) + hints = p.generateHintsFromAllocation(gpuAllocationInfo) + } else if preAllocateResourceAllocationInfo := p.GetState().GetAllocationInfo(v1.ResourceName(defaultPreAllocateResourceName), resReq.PodUid, resReq.ContainerName); preAllocateResourceAllocationInfo != nil && preAllocateResourceAllocationInfo.TopologyAwareAllocations != nil { + // 2. Check if pre-allocate resource allocation already exists. + general.InfoS("generating hints from existing GPU pre-allocate resource allocation", + "podNamespace", resReq.PodNamespace, + "podName", resReq.PodName, + "containerName", resReq.ContainerName, + "deviceName", req.DeviceName, + ) + hints = p.generateHintsFromAllocation(preAllocateResourceAllocationInfo) + } else { + // 3. Fallback to generating hints from the available devices and the GPU topology. 
+ gpuTopology, err := p.DeviceTopologyRegistry.GetDeviceTopology(targetDeviceReq.DeviceName) + if err != nil { + general.Warningf("failed to get gpu topology: %v", err) + return nil, fmt.Errorf("failed to get gpu topology: %w", err) + } + + hints = p.generateDeviceTopologyHints(targetDeviceReq, gpuTopology, resReq, qosLevel) + } + + if len(hints) == 0 { + return nil, fmt.Errorf("GetAssociatedDeviceTopologyHints got empty hints") + } + + return p.buildAssociatedDeviceHintsResponse(req, hints), nil +} + +func (p *GPUDevicePlugin) generateHintsFromAllocation(allocationInfo *state.AllocationInfo) []*pluginapi.TopologyHint { + nodesSet := sets.NewInt() + for _, alloc := range allocationInfo.TopologyAwareAllocations { + nodesSet.Insert(alloc.NUMANodes...) + } + + nodes := make([]uint64, 0, nodesSet.Len()) + for _, node := range nodesSet.List() { + nodes = append(nodes, uint64(node)) + } + + return []*pluginapi.TopologyHint{ + { + Nodes: nodes, + Preferred: true, + }, + } +} + +func (p *GPUDevicePlugin) generateDeviceTopologyHints( + deviceReq *pluginapi.DeviceRequest, + gpuTopology *machine.DeviceTopology, + resReq *pluginapi.ResourceRequest, + qosLevel string, +) []*pluginapi.TopologyHint { + request := int(deviceReq.DeviceRequest) + available := sets.NewString(deviceReq.AvailableDevices...) + reusable := sets.NewString(deviceReq.ReusableDevices...) + + if available.Union(reusable).Len() < request { + general.Warningf("Unable to generate topology hints: requested number of devices unavailable, request: %d, available: %d", + request, available.Union(reusable).Len()) + return nil + } + + // Gather all NUMA nodes that have healthy GPUs + numaNodesSet := sets.NewInt() + for _, dev := range gpuTopology.Devices { + if dev.Health != pluginapi.Healthy { + continue + } + + numaNodesSet.Insert(dev.NumaNodes...) + } + numaNodes := numaNodesSet.List() + + // minAffinitySize tracks the minimum mask size that satisfies the Strategy + minAffinitySize := len(numaNodes) + var hints []*pluginapi.TopologyHint + + // Iterate through all combinations of NUMA Nodes and build hints from them. + machine.IterateBitMasks(numaNodes, len(numaNodes), func(mask machine.BitMask) { + // Fast Path: Check to see if all of the reusable devices are part of the bitmask. + numMatching := 0 + for d := range reusable { + dev, ok := gpuTopology.Devices[d] + if !ok || len(dev.NumaNodes) == 0 { + continue + } + if !mask.AnySet(dev.NumaNodes) { + return + } + numMatching++ + } + + // Fast Path: Check to see if enough available devices remain on the + // current NUMA node combination to satisfy the device request. + for d := range available { + dev, ok := gpuTopology.Devices[d] + if !ok || len(dev.NumaNodes) == 0 { + continue + } + if mask.AnySet(dev.NumaNodes) { + numMatching++ + } + } + + // If they don't, then move onto the next combination. + if numMatching < request { + return + } + + // Slow Path: Use Strategy Framework to verify inter-GPU affinity (NVLink/PCIe etc.) 
+ bits := mask.GetBits() + nodes := make([]uint64, len(bits)) + for i, bit := range bits { + nodes[i] = uint64(bit) + } + + // Deep copy deviceReq and inject the current NUMA hint + deviceReqCopy := *deviceReq + deviceReqCopy.Hint = &pluginapi.TopologyHint{ + Nodes: nodes, + Preferred: false, + } + + result, err := manager.AllocateGPUUsingStrategy( + resReq, + &deviceReqCopy, + gpuTopology, + p.Conf.GPUQRMPluginConfig, + p.Emitter, + p.MetaServer, + p.GetState().GetMachineState(), + qosLevel, + ) + + if err != nil || !result.Success { + general.InfoS("mask failed strategy verification", + "mask", nodes, + "error", err, + ) + return + } + + // Strategy succeeded, this mask is valid. Update minAffinitySize. + if mask.Count() < minAffinitySize { + minAffinitySize = mask.Count() + } + + // Add it to the list of hints. We set all hint preferences to 'false' on the first pass through. + hints = append(hints, &pluginapi.TopologyHint{ + Nodes: nodes, + Preferred: false, + }) + }) + + // Loop back through all hints and update the 'Preferred' field based on + // counting the number of bits sets in the affinity mask and comparing it + // to the minAffinity. Only those with an equal number of bits set will be + // considered preferred. + for i := range hints { + if len(hints[i].Nodes) == minAffinitySize { + hints[i].Preferred = true + } + } + + return hints +} + +func (p *GPUDevicePlugin) buildAssociatedDeviceHintsResponse( + req *pluginapi.AssociatedDeviceRequest, + hints []*pluginapi.TopologyHint, +) *pluginapi.AssociatedDeviceHintsResponse { + resReq := req.ResourceRequest + var deviceHints *pluginapi.ListOfTopologyHints + if hints != nil { + deviceHints = &pluginapi.ListOfTopologyHints{Hints: hints} + } + return &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: resReq.PodUid, + PodNamespace: resReq.PodNamespace, + PodName: resReq.PodName, + ContainerName: resReq.ContainerName, + ContainerType: resReq.ContainerType, + ContainerIndex: resReq.ContainerIndex, + PodRole: resReq.PodRole, + PodType: resReq.PodType, + DeviceName: req.DeviceName, + DeviceHints: deviceHints, + Labels: resReq.Labels, + Annotations: resReq.Annotations, + } } func (p *GPUDevicePlugin) AllocateAssociatedDevice( @@ -127,12 +363,12 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( } var allocatedDevices []string - memoryAllocationInfo := p.GetState().GetAllocationInfo(v1.ResourceName(defaultPreAllocateResourceName), resReq.PodUid, resReq.ContainerName) - // GPU memory should have been allocated at this stage. - // We anticipate that gpu devices have also been allocated, so we can directly use the allocated devices from the gpu memory state. - if memoryAllocationInfo == nil || memoryAllocationInfo.TopologyAwareAllocations == nil { - // When GPU memory allocation info is nil, invoke the GPU allocate strategy to perform GPU allocation - general.InfoS("GPU memory allocation info is nil, invoking GPU allocate strategy", + preAllocateResourceAllocationInfo := p.GetState().GetAllocationInfo(v1.ResourceName(defaultPreAllocateResourceName), resReq.PodUid, resReq.ContainerName) + // GPU pre-allocate resource should have been allocated at this stage. + // We anticipate that gpu devices have also been allocated, so we can directly use the allocated devices from the gpu pre-allocate resource state. 
+ if preAllocateResourceAllocationInfo == nil || preAllocateResourceAllocationInfo.TopologyAwareAllocations == nil { + // When GPU pre-allocate resource allocation info is nil, invoke the GPU allocate strategy to perform GPU allocation + general.InfoS("GPU pre-allocate resource allocation info is nil, invoking GPU allocate strategy", "podNamespace", resReq.PodNamespace, "podName", resReq.PodName, "containerName", resReq.ContainerName) @@ -165,8 +401,8 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( allocatedDevices = result.AllocatedDevices } else { - // when GPU memory allocation info exists - for gpuID := range memoryAllocationInfo.TopologyAwareAllocations { + // when GPU pre-allocate resource allocation info exists + for gpuID := range preAllocateResourceAllocationInfo.TopologyAwareAllocations { allocatedDevices = append(allocatedDevices, gpuID) } } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go index bf73b77c97..38ae8a6ccf 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go @@ -27,11 +27,16 @@ import ( "k8s.io/apimachinery/pkg/util/uuid" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + "github.com/kubewharf/katalyst-api/pkg/consts" katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm/statedirectory" "github.com/kubewharf/katalyst-core/pkg/metaserver" @@ -39,6 +44,10 @@ import ( "github.com/kubewharf/katalyst-core/pkg/util/machine" ) +const ( + testDeviceAffinityAllocation = "deviceAffinityAllocation" +) + func generateTestConfiguration(t *testing.T) *config.Configuration { conf := config.NewConfiguration() tmpDir := t.TempDir() @@ -177,15 +186,15 @@ func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { t.Parallel() tests := []struct { - name string - podUID string - containerName string - allocationInfo *state.AllocationInfo - accompanyResourceAllocationInfo *state.AllocationInfo - deviceReq *pluginapi.DeviceRequest - deviceTopology *machine.DeviceTopology - expectedErr bool - expectedResp *pluginapi.AssociatedDeviceAllocationResponse + name string + podUID string + containerName string + allocationInfo *state.AllocationInfo + preAllocateResourceAllocationInfo *state.AllocationInfo + deviceReq *pluginapi.DeviceRequest + deviceTopology *machine.DeviceTopology + expectedErr bool + expectedResp *pluginapi.AssociatedDeviceAllocationResponse }{ { name: "Allocation already exists", @@ -221,7 +230,7 @@ func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { }, { name: "gpu memory allocation exists", - accompanyResourceAllocationInfo: &state.AllocationInfo{ + 
preAllocateResourceAllocationInfo: &state.AllocationInfo{ AllocatedAllocation: state.Allocation{ Quantity: 4, NUMANodes: []int{0, 1}, @@ -248,16 +257,16 @@ func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { deviceTopology: &machine.DeviceTopology{ Devices: map[string]machine.DeviceInfo{ "test-gpu-0": { - NumaNodes: []int{0}, + NumaNodes: []int{0}, Health: pluginapi.Healthy, }, "test-gpu-1": { - NumaNodes: []int{1}, + NumaNodes: []int{1}, Health: pluginapi.Healthy, }, "test-gpu-2": { - NumaNodes: []int{0}, + NumaNodes: []int{0}, Health: pluginapi.Healthy, }, "test-gpu-3": { - NumaNodes: []int{1}, + NumaNodes: []int{1}, Health: pluginapi.Healthy, }, }, }, @@ -269,7 +278,7 @@ func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { }, { name: "device topology does not exist", - accompanyResourceAllocationInfo: &state.AllocationInfo{ + preAllocateResourceAllocationInfo: &state.AllocationInfo{ AllocatedAllocation: state.Allocation{ Quantity: 4, NUMANodes: []int{0, 1}, @@ -309,8 +318,8 @@ func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { basePlugin.GetState().SetAllocationInfo(gpuconsts.GPUDeviceType, tt.podUID, tt.containerName, tt.allocationInfo, false) } - if tt.accompanyResourceAllocationInfo != nil { - basePlugin.GetState().SetAllocationInfo(v1.ResourceName(defaultPreAllocateResourceName), tt.podUID, tt.containerName, tt.accompanyResourceAllocationInfo, false) + if tt.preAllocateResourceAllocationInfo != nil { + basePlugin.GetState().SetAllocationInfo(v1.ResourceName(defaultPreAllocateResourceName), tt.podUID, tt.containerName, tt.preAllocateResourceAllocationInfo, false) } if tt.deviceTopology != nil { @@ -355,3 +364,574 @@ func evaluateAllocatedDevicesResult(t *testing.T, expectedResp, actualResp *plug assert.ElementsMatch(t, expectedResp.AllocationResult.AllocatedDevices, actualResp.AllocationResult.AllocatedDevices) } + +func TestGPUDevicePlugin_GetAssociatedDeviceTopologyHints(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + podUID string + containerName string + qosLevel string + allocationInfo *state.AllocationInfo + preAllocateResourceAllocationInfo *state.AllocationInfo + deviceReq *pluginapi.AssociatedDeviceRequest + deviceTopology *machine.DeviceTopology + requiredDeviceAffinity bool + expectedErr bool + expectedResp *pluginapi.AssociatedDeviceHintsResponse + }{ + { + name: "Invalid Request", + expectedErr: true, + }, + { + name: "GPU Allocation exists", + podUID: "test-uid", + containerName: "test-container", + allocationInfo: &state.AllocationInfo{ + TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-0": {NUMANodes: []int{0}}, + }, + }, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + {DeviceName: "test-gpu", DeviceRequest: 1}, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: 
&pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + }, + }, + }, + }, + { + name: "GPU pre-allocate resource allocation exists", + podUID: "test-uid", + containerName: "test-container", + preAllocateResourceAllocationInfo: &state.AllocationInfo{ + TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-1": {NUMANodes: []int{1}}, + }, + }, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + {DeviceName: "test-gpu", DeviceRequest: 1}, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{1}, + Preferred: true, + }, + }, + }, + }, + }, + { + name: "device topology does not exist", + podUID: "test-uid", + containerName: "test-container", + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-0", "test-gpu-1"}, + DeviceRequest: 1, + }, + }, + }, + expectedErr: true, + }, + { + name: "requested devices more than available", + podUID: "test-uid", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": {NumaNodes: []int{0}, Health: pluginapi.Healthy}, + }, + }, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-0"}, + DeviceRequest: 2, + }, + }, + }, + expectedErr: true, + }, + { + name: "generate from available devices without allocation info", + podUID: "test-uid", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": {NumaNodes: []int{0}, Health: pluginapi.Healthy}, + "test-gpu-1": {NumaNodes: []int{1}, Health: pluginapi.Healthy}, + }, + }, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: 
consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-0", "test-gpu-1"}, + DeviceRequest: 1, + }, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + {Nodes: []uint64{0}, Preferred: true}, + {Nodes: []uint64{1}, Preferred: true}, + {Nodes: []uint64{0, 1}, Preferred: false}, + }, + }, + }, + }, + { + name: "strategy framework filters NVLink mesh", + podUID: "test-uid", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + PriorityDimensions: []string{"NVLINK"}, + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "group-1"}}, + "test-gpu-1": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "group-1"}}, + "test-gpu-2": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "group-2"}}, + "test-gpu-3": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "group-2"}}, + }, + }, + requiredDeviceAffinity: true, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-0", "test-gpu-1", "test-gpu-2", "test-gpu-3"}, + DeviceRequest: 2, + }, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + // {0} and {1} will fail Strategy verification because each only contains GPUs from different NVLink groups. + // The only valid combination is {0, 1} which has all 4 GPUs, allowing the Strategy to pick 2 GPUs from the same NVLink group. 
+ {Nodes: []uint64{0, 1}, Preferred: true}, + }, + }, + }, + }, + { + name: "strict affinity: request 4 devices, each NUMA has 4 devices in 2 groups", + podUID: "test-uid", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + PriorityDimensions: []string{"0"}, + Devices: map[string]machine.DeviceInfo{ + "test-gpu-1": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g12"}}, + "test-gpu-2": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g12"}}, + "test-gpu-3": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g34"}}, + "test-gpu-4": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g34"}}, + "test-gpu-5": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g56"}}, + "test-gpu-6": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g56"}}, + "test-gpu-7": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g78"}}, + "test-gpu-8": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g78"}}, + }, + }, + requiredDeviceAffinity: true, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-1", "test-gpu-2", "test-gpu-3", "test-gpu-4", "test-gpu-5", "test-gpu-6", "test-gpu-7", "test-gpu-8"}, + DeviceRequest: 4, + }, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + // {0} has 4 devices (two full groups), satisfies request 4 + // {1} has 4 devices (two full groups), satisfies request 4 + {Nodes: []uint64{0}, Preferred: true}, + {Nodes: []uint64{1}, Preferred: true}, + {Nodes: []uint64{0, 1}, Preferred: false}, + }, + }, + }, + }, + { + name: "strict affinity: request 5 devices from two NUMA nodes with 4 devices each", + podUID: "test-uid", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + PriorityDimensions: []string{"0"}, + Devices: map[string]machine.DeviceInfo{ + "test-gpu-1": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g12"}}, + "test-gpu-2": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g12"}}, + "test-gpu-3": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g34"}}, + "test-gpu-4": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g34"}}, + "test-gpu-5": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g56"}}, + "test-gpu-6": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g56"}}, + "test-gpu-7": {NumaNodes: 
[]int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g78"}}, + "test-gpu-8": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"0": "g78"}}, + }, + }, + requiredDeviceAffinity: true, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-1", "test-gpu-2", "test-gpu-3", "test-gpu-4", "test-gpu-5", "test-gpu-6", "test-gpu-7", "test-gpu-8"}, + DeviceRequest: 5, + }, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + // {0} only has 4 devices, fails + // {1} only has 4 devices, fails + // {0, 1} has 8 devices. We request 5 devices, which needs ceil(5/2)=3 groups. + // This can be satisfied, so {0, 1} is preferred. + {Nodes: []uint64{0, 1}, Preferred: true}, + }, + }, + }, + }, + { + name: "strict affinity: multi-level affinity, fragmented NUMA fails, unfragmented NUMA succeeds", + podUID: "test-uid", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + PriorityDimensions: []string{"NVLINK", "SOCKET"}, + Devices: map[string]machine.DeviceInfo{ + "test-gpu-1": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "g1", "SOCKET": "s1"}}, + "test-gpu-2": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "g1", "SOCKET": "s1"}}, + "test-gpu-3": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "g2", "SOCKET": "s1"}}, + "test-gpu-4": {NumaNodes: []int{0}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "g2", "SOCKET": "s1"}}, + "test-gpu-5": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "g3", "SOCKET": "s2"}}, + "test-gpu-6": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "g3", "SOCKET": "s2"}}, + "test-gpu-7": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "g4", "SOCKET": "s2"}}, + "test-gpu-8": {NumaNodes: []int{1}, Health: pluginapi.Healthy, Dimensions: map[string]string{"NVLINK": "g4", "SOCKET": "s2"}}, + }, + }, + requiredDeviceAffinity: true, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-1", "test-gpu-3", "test-gpu-5", "test-gpu-6"}, + DeviceRequest: 2, + }, + }, + }, + 
expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + // {0} has fragmented available devices (1 and 3) that belong to different NVLINK groups, fails strategy. + // {1} has unfragmented available devices (5 and 6) that belong to the same NVLINK group, passes strategy. + {Nodes: []uint64{1}, Preferred: true}, + {Nodes: []uint64{0, 1}, Preferred: false}, + }, + }, + }, + }, + { + name: "strategy allocation always succeeds, hint generation aligns with kubelet Device Manager", + podUID: "test-uid", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": {NumaNodes: []int{0}, Health: pluginapi.Healthy}, + "test-gpu-1": {NumaNodes: []int{0}, Health: pluginapi.Healthy}, + "test-gpu-2": {NumaNodes: []int{1}, Health: pluginapi.Healthy}, + "test-gpu-3": {NumaNodes: []int{1}, Health: pluginapi.Healthy}, + }, + }, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-0", "test-gpu-1", "test-gpu-2", "test-gpu-3"}, + DeviceRequest: 2, + }, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + // When allocation always succeeds (no strict affinity required), + // all combinations that have enough devices are returned. + // {0} has 2 devices, satisfies request 2. Size 1 -> Preferred: true + // {1} has 2 devices, satisfies request 2. Size 1 -> Preferred: true + // {0, 1} has 4 devices, satisfies request 2. 
Size 2 -> Preferred: false + {Nodes: []uint64{0}, Preferred: true}, + {Nodes: []uint64{1}, Preferred: true}, + {Nodes: []uint64{0, 1}, Preferred: false}, + }, + }, + }, + }, + { + name: "strategy allocation always succeeds, request 3 devices, hint generation aligns with kubelet Device Manager", + podUID: "test-uid", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": {NumaNodes: []int{0}, Health: pluginapi.Healthy}, + "test-gpu-1": {NumaNodes: []int{0}, Health: pluginapi.Healthy}, + "test-gpu-2": {NumaNodes: []int{1}, Health: pluginapi.Healthy}, + "test-gpu-3": {NumaNodes: []int{1}, Health: pluginapi.Healthy}, + }, + }, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-0", "test-gpu-1", "test-gpu-2", "test-gpu-3"}, + DeviceRequest: 3, + }, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + // {0} has 2 devices, does not satisfy request 3. + // {1} has 2 devices, does not satisfy request 3. + // {0, 1} has 4 devices, satisfies request 3. Size 2 -> Preferred: true (because it's the minimum size) + {Nodes: []uint64{0, 1}, Preferred: true}, + }, + }, + }, + }, + { + name: "generate from available devices with unhealthy GPUs filtered out", + podUID: "test-uid", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": {NumaNodes: []int{0}, Health: pluginapi.Unhealthy}, + "test-gpu-1": {NumaNodes: []int{1}, Health: pluginapi.Healthy}, + }, + }, + deviceReq: &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + ContainerName: "test-container", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + }, + DeviceName: "test-gpu", + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-0", "test-gpu-1"}, + DeviceRequest: 1, + }, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceHintsResponse{ + PodUid: "test-uid", + ContainerName: "test-container", + DeviceName: "test-gpu", + Annotations: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + Labels: map[string]string{consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores}, + DeviceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + // Only NUMA node 1 has healthy GPUs, so only 1 should be generated. 
+ {Nodes: []uint64{1}, Preferred: true}, + }, + }, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + basePlugin := makeTestBasePlugin(t) + devicePlugin := NewGPUDevicePlugin(basePlugin) + + if tt.allocationInfo != nil { + basePlugin.GetState().SetAllocationInfo(gpuconsts.GPUDeviceType, tt.podUID, tt.containerName, tt.allocationInfo, false) + } + if tt.preAllocateResourceAllocationInfo != nil { + basePlugin.GetState().SetAllocationInfo(v1.ResourceName(defaultPreAllocateResourceName), tt.podUID, tt.containerName, tt.preAllocateResourceAllocationInfo, false) + } + if tt.deviceTopology != nil { + err := basePlugin.DeviceTopologyRegistry.SetDeviceTopology("test-gpu", tt.deviceTopology) + assert.NoError(t, err) + + if len(tt.deviceTopology.PriorityDimensions) > 0 { + basePlugin.Conf.GPUQRMPluginConfig.RequiredDeviceAffinity = tt.requiredDeviceAffinity + + err = manager.GetGlobalStrategyManager().RegisterGenericAllocationStrategy( + testDeviceAffinityAllocation, + []string{canonical.StrategyNameCanonical, gpu_memory.StrategyNameGPUMemory}, + gpu_memory.StrategyNameGPUMemory, + deviceaffinity.StrategyNameDeviceAffinity, + ) + // Ignore already registered error + if err != nil && err.Error() != "strategy "+testDeviceAffinityAllocation+" already exists" { + t.Logf("register strategy err: %v", err) + } + + if basePlugin.Conf.GPUQRMPluginConfig.CustomAllocationStrategy == nil { + basePlugin.Conf.GPUQRMPluginConfig.CustomAllocationStrategy = make(map[string]string) + } + basePlugin.Conf.GPUQRMPluginConfig.CustomAllocationStrategy["test-gpu"] = testDeviceAffinityAllocation + } + } + + resp, err := devicePlugin.GetAssociatedDeviceTopologyHints(context.Background(), tt.deviceReq) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + // Order of hints doesn't matter, we can use ElementsMatch + if tt.expectedResp != nil && tt.expectedResp.DeviceHints != nil { + assert.ElementsMatch(t, tt.expectedResp.DeviceHints.Hints, resp.DeviceHints.Hints) + resp.DeviceHints.Hints = nil + tt.expectedResp.DeviceHints.Hints = nil + } + assert.Equal(t, tt.expectedResp, resp) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index 5a58f677dc..fa5dc64c4b 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -149,6 +149,8 @@ func (p *StaticPolicy) Start() (err error) { periodicalhandler.ReadyToStartHandlersByGroup(appqrm.QRMGPUPluginPeriodicalHandlerGroupName) }, 5*time.Second, p.stopCh) + p.BasePlugin.Run(p.stopCh) + return nil } @@ -662,10 +664,29 @@ func (p *StaticPolicy) UpdateAllocatableAssociatedDevices( return resp, nil } -func (*StaticPolicy) GetAssociatedDeviceTopologyHints( - _ context.Context, _ *pluginapi.AssociatedDeviceRequest, +func (p *StaticPolicy) GetAssociatedDeviceTopologyHints( + ctx context.Context, req *pluginapi.AssociatedDeviceRequest, ) (*pluginapi.AssociatedDeviceHintsResponse, error) { - return &pluginapi.AssociatedDeviceHintsResponse{}, nil + general.InfofV(4, "called") + if req == nil { + return nil, fmt.Errorf("request is nil") + } + + if err := p.ensureState(req.DeviceName); err != nil { + return nil, fmt.Errorf("ensure state failed: %v", err) + } + + customDevicePlugin := p.getCustomDevicePlugin(req.DeviceName) + if customDevicePlugin == nil { + return nil, fmt.Errorf("no custom device plugin found for device %s", req.DeviceName) + } + + resp, err := 
customDevicePlugin.GetAssociatedDeviceTopologyHints(ctx, req) + if err != nil { + return nil, fmt.Errorf("custom device plugin GetAssociatedDeviceTopologyHints failed with error: %v", err) + } + + return resp, nil } // AllocateAssociatedDevice allocates a device in this sequence: diff --git a/pkg/agent/qrm-plugins/mb/advisor/advisor_helper.go b/pkg/agent/qrm-plugins/mb/advisor/advisor_helper.go index f49182c72c..7494c454ea 100644 --- a/pkg/agent/qrm-plugins/mb/advisor/advisor_helper.go +++ b/pkg/agent/qrm-plugins/mb/advisor/advisor_helper.go @@ -17,17 +17,21 @@ limitations under the License. package advisor import ( + "errors" "fmt" "strings" "golang.org/x/exp/maps" "k8s.io/apimachinery/pkg/util/sets" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/advisor/priority" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/advisor/resource" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/monitor" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/plan" ) +const combinedGroupPrefix = "combined-" + // getMinEffectiveCapacity identifies the min dynamic capacity required by pre-defined groups, // if the specific groups have active MB traffics func getMinEffectiveCapacity(base int, groupCaps map[string]int, incomingStats monitor.GroupMBStats) int { @@ -126,13 +130,111 @@ func getGroupIncomingInfo(capacity int, incomingStats monitor.GroupMBStats) *res CapacityInMB: capacity, } - result.GroupSorted = sortGroups(maps.Keys(incomingStats)) + result.GroupSorted = priority.GetInstance().SortGroups(maps.Keys(incomingStats)) result.GroupTotalUses = getUsedTotalByGroup(incomingStats) result.FreeInMB, result.GroupLimits = getLimitsByGroupSorted(capacity, result.GroupSorted, result.GroupTotalUses) result.ResourceState = resource.GetResourceState(capacity, result.FreeInMB) return result } +// groupByWeight extracts the common logic of grouping by weight +func groupByWeight[T any](stats map[string]T) map[int][]string { + groups := make(map[int][]string, len(stats)) + for group := range stats { + weight := priority.GetInstance().GetWeight(group) + groups[weight] = append(groups[weight], group) + } + return groups +} + +// preProcessGroupInfo combines groups with same priority together +func preProcessGroupInfo(stats monitor.GroupMBStats) (monitor.GroupMBStats, domainGroupMapping, error) { + groups := groupByWeight(stats) + + result := make(monitor.GroupMBStats) + groupInfos := domainGroupMapping{} + + for weight, equivGroups := range groups { + if len(equivGroups) == 1 { + result[equivGroups[0]] = stats[equivGroups[0]] + continue + } + + newKey := getCombinedGroupKey(weight) + groupInfo := combinedGroupMapping{} + combined := make(monitor.GroupMB) + maxMap := make(map[int]int) + + // First pass: find max TotalMB for each CCD and build combined stats + for _, group := range equivGroups { + for id, stat := range stats[group] { + if stat.TotalMB > combined[id].TotalMB { + combined[id] = stat + maxMap[id] = stat.TotalMB + } + } + } + + // Second pass: validate and build CCD sets for each group (only within equivGroups) + for _, group := range equivGroups { + ccdSet := ccdSet{} + for id, mbStat := range stats[group] { + // skip shared ccd with similar incoming data + if mbStat.TotalMB > maxMap[id]/2 && mbStat.TotalMB < maxMap[id] { + return nil, nil, errors.New("invalid incoming inputs") + } + if mbStat.TotalMB == maxMap[id] { + ccdSet[id] = struct{}{} + } + } + groupInfo[group] = ccdSet + } + + result[newKey] = combined + groupInfos[newKey] = groupInfo + } + + return 
result, groupInfos, nil +} + +func preProcessGroupSumStat(sumStats map[string][]monitor.MBInfo) map[string][]monitor.MBInfo { + groups := groupByWeight(sumStats) + + result := make(map[string][]monitor.MBInfo) + + for weight, equivGroups := range groups { + if len(equivGroups) == 1 { + result[equivGroups[0]] = sumStats[equivGroups[0]] + continue + } + + newKey := getCombinedGroupKey(weight) + // sumStats holds outgoing summary of each domain for each group, in other words, + // each slot of sumStats has uniform shape: slice with length of domain number + numDomains := len(sumStats[equivGroups[0]]) + sumList := make([]monitor.MBInfo, numDomains) + + for _, group := range equivGroups { + for id, stat := range sumStats[group] { + sumList[id].LocalMB += stat.LocalMB + sumList[id].RemoteMB += stat.RemoteMB + sumList[id].TotalMB += stat.TotalMB + } + } + result[newKey] = sumList + } + + return result +} + +func getCombinedGroupKey(weight int) string { + return fmt.Sprintf("%s%d", combinedGroupPrefix, weight) +} + +func isCombinedGroup(groupName string) bool { + return strings.Contains(groupName, combinedGroupPrefix) +} + func getLimitsByGroupSorted(capacity int, groupSorting []sets.String, groupUsages map[string]int) (int, map[string]int) { result := make(map[string]int) diff --git a/pkg/agent/qrm-plugins/mb/advisor/advisor_helper_test.go b/pkg/agent/qrm-plugins/mb/advisor/advisor_helper_test.go index 948ad4dcae..7fd29bbb43 100644 --- a/pkg/agent/qrm-plugins/mb/advisor/advisor_helper_test.go +++ b/pkg/agent/qrm-plugins/mb/advisor/advisor_helper_test.go @@ -22,10 +22,16 @@ import ( "k8s.io/apimachinery/pkg/util/sets" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/advisor/priority" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/advisor/resource" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/monitor" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/plan" ) +func init() { + priority.GetInstance().AddWeight("machine", 9_000) +} + func Test_maskPlanWithNoThrottles(t *testing.T) { t.Parallel() type args struct { @@ -179,3 +185,138 @@ func Test_getDomainTotalMBs(t *testing.T) { }) } } + +func Test_preProcessGroupInfo(t *testing.T) { + t.Parallel() + tests := []struct { + name string + stats monitor.GroupMBStats + wantResult monitor.GroupMBStats + wantGroupInfos domainGroupMapping + wantErr bool + }{ + { + name: "two groups with different weights - no combination", + stats: monitor.GroupMBStats{ + "dedicated": { + 0: {LocalMB: 5_000, RemoteMB: 3_000, TotalMB: 8_000}, + }, + "share-50": { + 1: {LocalMB: 4_000, RemoteMB: 2_000, TotalMB: 6_000}, + }, + }, + wantResult: monitor.GroupMBStats{ + "dedicated": { + 0: {LocalMB: 5_000, RemoteMB: 3_000, TotalMB: 8_000}, + }, + "share-50": { + 1: {LocalMB: 4_000, RemoteMB: 2_000, TotalMB: 6_000}, + }, + }, + wantGroupInfos: domainGroupMapping{}, + wantErr: false, + }, + { + name: "two groups with same weight - combine into one", + stats: monitor.GroupMBStats{ + "dedicated": { + 0: {LocalMB: 10_000, RemoteMB: 5_000, TotalMB: 15_000}, + }, + "machine": { + 1: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + wantResult: monitor.GroupMBStats{ + "combined-9000": { + 0: {LocalMB: 10_000, RemoteMB: 5_000, TotalMB: 15_000}, + 1: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + wantGroupInfos: domainGroupMapping{ + "combined-9000": { + "dedicated": {0: struct{}{}}, + "machine": {1: struct{}{}}, + }, + }, + wantErr: false, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, 
func(t *testing.T) { + t.Parallel() + gotResult, gotGroupInfos, err := preProcessGroupInfo(tt.stats) + if (err != nil) != tt.wantErr { + t.Errorf("preProcessGroupInfo() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(gotResult, tt.wantResult) { + t.Errorf("preProcessGroupInfo() gotResult = %v, want %v", gotResult, tt.wantResult) + } + if !reflect.DeepEqual(gotGroupInfos, tt.wantGroupInfos) { + t.Errorf("preProcessGroupInfo() gotGroupInfos = %v, want %v", gotGroupInfos, tt.wantGroupInfos) + } + }) + } +} + +func Test_preProcessGroupSumStat(t *testing.T) { + t.Parallel() + tests := []struct { + name string + sumStats map[string][]monitor.MBInfo + want map[string][]monitor.MBInfo + }{ + { + name: "two groups with different weights - no combination", + sumStats: map[string][]monitor.MBInfo{ + "dedicated": { + {LocalMB: 5_000, RemoteMB: 3_000, TotalMB: 8_000}, + {LocalMB: 4_000, RemoteMB: 2_000, TotalMB: 6_000}, + }, + "share-50": { + {LocalMB: 3_000, RemoteMB: 1_000, TotalMB: 4_000}, + {LocalMB: 2_000, RemoteMB: 1_000, TotalMB: 3_000}, + }, + }, + want: map[string][]monitor.MBInfo{ + "dedicated": { + {LocalMB: 5_000, RemoteMB: 3_000, TotalMB: 8_000}, + {LocalMB: 4_000, RemoteMB: 2_000, TotalMB: 6_000}, + }, + "share-50": { + {LocalMB: 3_000, RemoteMB: 1_000, TotalMB: 4_000}, + {LocalMB: 2_000, RemoteMB: 1_000, TotalMB: 3_000}, + }, + }, + }, + { + name: "two groups with same weight - combine and sum", + sumStats: map[string][]monitor.MBInfo{ + "dedicated": { + {LocalMB: 5_000, RemoteMB: 3_000, TotalMB: 8_000}, + {LocalMB: 4_000, RemoteMB: 2_000, TotalMB: 6_000}, + }, + "machine": { + {LocalMB: 3_000, RemoteMB: 1_000, TotalMB: 4_000}, + {LocalMB: 2_000, RemoteMB: 1_000, TotalMB: 3_000}, + }, + }, + want: map[string][]monitor.MBInfo{ + "combined-9000": { + {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + {LocalMB: 6_000, RemoteMB: 3_000, TotalMB: 9_000}, + }, + }, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + if got := preProcessGroupSumStat(tt.sumStats); !reflect.DeepEqual(got, tt.want) { + t.Errorf("preProcessGroupSumStat() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/mb/advisor/domain_advisor_metrics.go b/pkg/agent/qrm-plugins/mb/advisor/advisor_metrics.go similarity index 61% rename from pkg/agent/qrm-plugins/mb/advisor/domain_advisor_metrics.go rename to pkg/agent/qrm-plugins/mb/advisor/advisor_metrics.go index 71fd5e3972..9b9cd7be8c 100644 --- a/pkg/agent/qrm-plugins/mb/advisor/domain_advisor_metrics.go +++ b/pkg/agent/qrm-plugins/mb/advisor/advisor_metrics.go @@ -37,31 +37,31 @@ const ( namePlanUpdate = "mbm_plan_update" ) -func (d *domainAdvisor) emitDomIncomingStatSummaryMetrics(domLimits map[int]*resource.MBGroupIncomingStat) { +func (a *uniqPriorityAdvisor) emitDomIncomingStatSummaryMetrics(domLimits map[int]*resource.MBGroupIncomingStat) { for domID, limit := range domLimits { tags := map[string]string{ "domain": fmt.Sprintf("%d", domID), "state": string(limit.ResourceState), } - emitKV(d.emitter, nameMBMCapacity, limit.CapacityInMB, tags) - emitKV(d.emitter, nameMBMFree, limit.FreeInMB, tags) + emitKV(a.emitter, nameMBMCapacity, limit.CapacityInMB, tags) + emitKV(a.emitter, nameMBMFree, limit.FreeInMB, tags) } } -func (d *domainAdvisor) emitStatsMtrics(domainsMon *monitor.DomainStats) { - d.emitOutgoingStats(domainsMon.Outgoings) - d.emitIncomingStats(domainsMon.Incomings) +func (a *uniqPriorityAdvisor) emitStatsMtrics(domainsMon *monitor.DomainStats) { + 
a.emitOutgoingStats(domainsMon.Outgoings) + a.emitIncomingStats(domainsMon.Incomings) } -func (d *domainAdvisor) emitIncomingStats(incomings map[int]monitor.DomainMonStat) { - d.emitStat(incomings, nameMBMIncomingStat) +func (a *uniqPriorityAdvisor) emitIncomingStats(incomings map[int]monitor.DomainMonStat) { + a.emitStat(incomings, nameMBMIncomingStat) } -func (d *domainAdvisor) emitOutgoingStats(outgoings map[int]monitor.DomainMonStat) { - d.emitStat(outgoings, nameMBMOutgoingStat) +func (a *uniqPriorityAdvisor) emitOutgoingStats(outgoings map[int]monitor.DomainMonStat) { + a.emitStat(outgoings, nameMBMOutgoingStat) } -func (d *domainAdvisor) emitStat(stats map[int]monitor.DomainMonStat, metricName string) { +func (a *uniqPriorityAdvisor) emitStat(stats map[int]monitor.DomainMonStat, metricName string) { for domId, monStat := range stats { for group, ccdMBs := range monStat { dom := fmt.Sprintf("%d", domId) @@ -71,33 +71,33 @@ func (d *domainAdvisor) emitStat(stats map[int]monitor.DomainMonStat, metricName "group": group, "ccd": fmt.Sprintf("%d", ccd), } - emitKV(d.emitter, metricName, v.TotalMB, tags) + emitKV(a.emitter, metricName, v.TotalMB, tags) } } } } -func (d *domainAdvisor) emitIncomingTargets(groupedDomIncomingTargets map[string][]int) { - emitNamedGroupTargets(d.emitter, nameMBMIncomingTarget, groupedDomIncomingTargets) +func (a *uniqPriorityAdvisor) emitIncomingTargets(groupedDomIncomingTargets map[string][]int) { + emitNamedGroupTargets(a.emitter, nameMBMIncomingTarget, groupedDomIncomingTargets) } -func (d *domainAdvisor) emitOutgoingTargets(groupedDomOutgoingTargets map[string][]int) { - emitNamedGroupTargets(d.emitter, nameMBMOutgoingTarget, groupedDomOutgoingTargets) +func (a *uniqPriorityAdvisor) emitOutgoingTargets(groupedDomOutgoingTargets map[string][]int) { + emitNamedGroupTargets(a.emitter, nameMBMOutgoingTarget, groupedDomOutgoingTargets) } -func (d *domainAdvisor) emitAdjustedOutgoingTargets(groupedDomOutgoingTargets map[string][]int) { - emitNamedGroupTargets(d.emitter, nameMBMAdjustedOutgoingTarget, groupedDomOutgoingTargets) +func (a *uniqPriorityAdvisor) emitAdjustedOutgoingTargets(groupedDomOutgoingTargets map[string][]int) { + emitNamedGroupTargets(a.emitter, nameMBMAdjustedOutgoingTarget, groupedDomOutgoingTargets) } -func (d *domainAdvisor) emitRawPlan(plan *plan.MBPlan) { - d.emitPlanWithMetricName(plan, namePlanRaw) +func (a *uniqPriorityAdvisor) emitRawPlan(plan *plan.MBPlan) { + a.emitPlanWithMetricName(plan, namePlanRaw) } -func (d *domainAdvisor) emitUpdatePlan(plan *plan.MBPlan) { - d.emitPlanWithMetricName(plan, namePlanUpdate) +func (a *uniqPriorityAdvisor) emitUpdatePlan(plan *plan.MBPlan) { + a.emitPlanWithMetricName(plan, namePlanUpdate) } -func (d *domainAdvisor) emitPlanWithMetricName(plan *plan.MBPlan, metricName string) { +func (a *uniqPriorityAdvisor) emitPlanWithMetricName(plan *plan.MBPlan, metricName string) { if plan == nil || len(plan.MBGroups) == 0 { return } @@ -108,7 +108,7 @@ func (d *domainAdvisor) emitPlanWithMetricName(plan *plan.MBPlan, metricName str "group": group, "ccd": fmt.Sprintf("%d", ccd), } - emitKV(d.emitter, metricName, v, tags) + emitKV(a.emitter, metricName, v, tags) } } } diff --git a/pkg/agent/qrm-plugins/mb/advisor/distributor/types.go b/pkg/agent/qrm-plugins/mb/advisor/distributor/types.go index 56ace05b0b..40f83dc38b 100644 --- a/pkg/agent/qrm-plugins/mb/advisor/distributor/types.go +++ b/pkg/agent/qrm-plugins/mb/advisor/distributor/types.go @@ -16,7 +16,11 @@ limitations under the License. 
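// Reviewer note: a rough, hypothetical sketch (not the project's code) of what the new
// advisor/priority package introduced further below provides. Group names resolve to weights
// ("/" 10_000, dedicated 9_000, default 5_000; extra groups such as "machine" can be registered),
// groups are sorted by weight in descending order, and groups sharing a weight are merged into
// one equivalence set so the advisor can budget them together.
package main

import (
	"fmt"
	"sort"
)

func sortAndMerge(weights map[string]int) [][]string {
	names := make([]string, 0, len(weights))
	for name := range weights {
		names = append(names, name)
	}
	// highest weight first
	sort.Slice(names, func(i, j int) bool { return weights[names[i]] > weights[names[j]] })

	var merged [][]string
	for _, name := range names {
		last := len(merged) - 1
		if last >= 0 && weights[merged[last][0]] == weights[name] {
			merged[last] = append(merged[last], name) // same weight: join the previous set
			continue
		}
		merged = append(merged, []string{name})
	}
	return merged
}

func main() {
	weights := map[string]int{"/": 10_000, "dedicated": 9_000, "machine": 9_000, "share": 5_000}
	// "dedicated" and "machine" share a weight, so they form one set: [[/] [dedicated machine] [share]]
	fmt.Println(sortAndMerge(weights))
}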
package distributor -import "math" +import ( + "math" + + "github.com/kubewharf/katalyst-core/pkg/util/general" +) type Distributor interface { // Distribute distributes total mb into ccds by their relative weights @@ -24,6 +28,7 @@ type Distributor interface { } func New(min, max int) Distributor { + general.Infof("mbm: ccd distributor is composite of logarithmic and linear, with min %v, max %v", min, max) return &logarithmicBoundedDistributor{ inner: &linearBoundedDistributor{ min: min, diff --git a/pkg/agent/qrm-plugins/mb/advisor/priority/group_priority.go b/pkg/agent/qrm-plugins/mb/advisor/priority/group_priority.go new file mode 100644 index 0000000000..a92598bbeb --- /dev/null +++ b/pkg/agent/qrm-plugins/mb/advisor/priority/group_priority.go @@ -0,0 +1,101 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package priority + +import ( + "fmt" + "sort" + "sync" + + "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/util/sets" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/advisor/resource" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +var ( + instance GroupPriority + once sync.Once +) + +func GetInstance() GroupPriority { + once.Do(func() { + instance = make(GroupPriority, len(resctrlMajorGroupWeights)) + for key, value := range resctrlMajorGroupWeights { + instance[key] = value + } + }) + return instance +} + +type GroupPriority map[string]int + +func (g GroupPriority) GetWeight(name string) int { + baseWeight, ok := g[getMajor(name)] + if !ok { + return defaultWeight + } + + return baseWeight + getSubWeight(name) +} + +func (g GroupPriority) SortGroups(groups []string) []sets.String { + sort.Slice(groups, func(i, j int) bool { + return g.GetWeight(groups[i]) > g.GetWeight(groups[j]) + }) + + return g.mergeGroupsByWeight(groups) +} + +func (g GroupPriority) mergeGroupsByWeight(groups []string) []sets.String { + var mergedGroups []sets.String + for _, group := range groups { + if len(mergedGroups) == 0 { + mergedGroups = append(mergedGroups, sets.NewString(group)) + continue + } + + lastGroup := mergedGroups[len(mergedGroups)-1] + weightLastGroup, err := g.getWeightOfEquivGroup(lastGroup) + if err != nil { + general.Warningf("[mbm] failed to get allocation weight of group %v: %v", lastGroup, err) + continue + } + + if g.GetWeight(group) == weightLastGroup { + lastGroup.Insert(group) + continue + } + + mergedGroups = append(mergedGroups, sets.NewString(group)) + } + return mergedGroups +} + +func (g GroupPriority) getWeightOfEquivGroup(equivGroups sets.String) (int, error) { + repGroup, err := resource.GetGroupRepresentative(equivGroups) + if err != nil { + return 0, errors.Wrap(err, fmt.Sprintf("failed to get representative of groups %v", equivGroups)) + } + + return g.GetWeight(repGroup), nil +} + +func (g GroupPriority) AddWeight(name string, weight int) { + g[name] = weight +} diff --git a/pkg/agent/qrm-plugins/mb/advisor/group_sort.go b/pkg/agent/qrm-plugins/mb/advisor/priority/group_sort.go similarity index 51% rename 
from pkg/agent/qrm-plugins/mb/advisor/group_sort.go rename to pkg/agent/qrm-plugins/mb/advisor/priority/group_sort.go index 2195048ade..9fed4c809c 100644 --- a/pkg/agent/qrm-plugins/mb/advisor/group_sort.go +++ b/pkg/agent/qrm-plugins/mb/advisor/priority/group_sort.go @@ -14,24 +14,18 @@ See the License for the specific language governing permissions and limitations under the License. */ -package advisor +package priority import ( - "fmt" - "sort" "strconv" "strings" - "github.com/pkg/errors" - "k8s.io/apimachinery/pkg/util/sets" - - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/advisor/resource" "github.com/kubewharf/katalyst-core/pkg/consts" - "github.com/kubewharf/katalyst-core/pkg/util/general" ) const defaultWeight = 5_000 +// resctrlMajorGroupWeights is initialized with built-in resctrl groups and their priorities var resctrlMajorGroupWeights = map[string]int{ consts.ResctrlGroupRoot: 10_000, // intrinsic root the highest consts.ResctrlGroupDedicated: 9_000, @@ -72,56 +66,3 @@ func getSubWeight(name string) int { return 0 } - -func getWeight(name string) int { - baseWeight, ok := resctrlMajorGroupWeights[getMajor(name)] - if !ok { - return defaultWeight - } - - return baseWeight + getSubWeight(name) -} - -// sortGroups sorts groups by their weights in desc order; -// groups having identical weight put in one set -func sortGroups(groups []string) []sets.String { - sort.Slice(groups, func(i, j int) bool { - return getWeight(groups[i]) > getWeight(groups[j]) - }) - - return mergeGroupsByWeight(groups) -} - -func mergeGroupsByWeight(groups []string) []sets.String { - var mergedGroups []sets.String - for _, group := range groups { - if len(mergedGroups) == 0 { - mergedGroups = append(mergedGroups, sets.NewString(group)) - continue - } - - lastGroup := mergedGroups[len(mergedGroups)-1] - weightLastGroup, err := getWeightOfEquivGroup(lastGroup) - if err != nil { - general.Warningf("[mbm] failed to get allocation weight of group %v: %v", lastGroup, err) - continue - } - - if getWeight(group) == weightLastGroup { - lastGroup.Insert(group) - continue - } - - mergedGroups = append(mergedGroups, sets.NewString(group)) - } - return mergedGroups -} - -func getWeightOfEquivGroup(equivGroups sets.String) (int, error) { - repGroup, err := resource.GetGroupRepresentative(equivGroups) - if err != nil { - return 0, errors.Wrap(err, fmt.Sprintf("failed to get representative of groups %v", equivGroups)) - } - - return getWeight(repGroup), nil -} diff --git a/pkg/agent/qrm-plugins/mb/advisor/group_sort_test.go b/pkg/agent/qrm-plugins/mb/advisor/priority/group_sort_test.go similarity index 93% rename from pkg/agent/qrm-plugins/mb/advisor/group_sort_test.go rename to pkg/agent/qrm-plugins/mb/advisor/priority/group_sort_test.go index af63f0da51..e4bfdad1fd 100644 --- a/pkg/agent/qrm-plugins/mb/advisor/group_sort_test.go +++ b/pkg/agent/qrm-plugins/mb/advisor/priority/group_sort_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package advisor +package priority import ( "reflect" @@ -63,7 +63,7 @@ func Test_getSortedGroups(t *testing.T) { tt := tt t.Run(tt.name, func(t *testing.T) { t.Parallel() - if got := sortGroups(tt.args.groups); !reflect.DeepEqual(got, tt.want) { + if got := GetInstance().SortGroups(tt.args.groups); !reflect.DeepEqual(got, tt.want) { t.Errorf("sortGroups() = %v, want %v", got, tt.want) } }) diff --git a/pkg/agent/qrm-plugins/mb/advisor/priority_advisor.go b/pkg/agent/qrm-plugins/mb/advisor/priority_advisor.go new file mode 100644 index 0000000000..1df0cd940a --- /dev/null +++ b/pkg/agent/qrm-plugins/mb/advisor/priority_advisor.go @@ -0,0 +1,124 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package advisor + +import ( + "context" + + "k8s.io/apimachinery/pkg/util/sets" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/domain" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/monitor" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/plan" + "github.com/kubewharf/katalyst-core/pkg/metrics" +) + +// priorityAdvisor is able to work with resctrl groups with priorities; +// one group's priority can be the same as another's. +// It leverages uniqPriorityAdvisor, which works with groups of distinct priorities only. +// It targets scenarios where groups of the same priority don't share a CCD.
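+// For example, when "dedicated" and "machine" both resolve to weight 9_000, their per-CCD stats are merged under the synthetic key "combined-9000" before planning, and splitPlan then hands each CCD's quota back to the original group that reported that CCD.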
+// todo: enhance to handle multiple groups of same priority sharing ccd +type priorityAdvisor struct { + uniqPriorityAdvisor *uniqPriorityAdvisor +} + +// groupInfo stores the mapping of groups and their CCDs for each domain +type groupInfo struct { // DomainGroups maps domain ID -> combined group key -> original group key -> CCD IDs + DomainGroups map[int]domainGroupMapping +} + +// domainGroupMapping maps combined group keys to their original groups +type domainGroupMapping map[string]combinedGroupMapping + +// combinedGroupMapping maps original group keys to their CCD IDs +type combinedGroupMapping map[string]ccdSet + +// ccdSet represents a set of CCD IDs +type ccdSet = sets.Int + +func (a *priorityAdvisor) GetPlan(ctx context.Context, domainsMon *monitor.DomainStats) (*plan.MBPlan, error) { + domainStats, groupInfos, err := a.combinedDomainStats(domainsMon) + if err != nil { + return nil, err + } + mbPlan, err := a.uniqPriorityAdvisor.GetPlan(ctx, domainStats) + if err != nil { + return nil, err + } + return a.splitPlan(mbPlan, groupInfos), nil +} + +func (a *priorityAdvisor) combinedDomainStats(domainsMon *monitor.DomainStats) (*monitor.DomainStats, *groupInfo, error) { + domainStats := &monitor.DomainStats{ + Incomings: make(map[int]monitor.DomainMonStat), + Outgoings: make(map[int]monitor.DomainMonStat), + OutgoingGroupSumStat: make(map[string][]monitor.MBInfo), + } + groupInfos := &groupInfo{ + DomainGroups: make(map[int]domainGroupMapping), + } + var err error + for id, domainMon := range domainsMon.Incomings { + domainStats.Incomings[id], groupInfos.DomainGroups[id], err = preProcessGroupInfo(domainMon) + if err != nil { + return nil, nil, err + } + } + for id, domainMon := range domainsMon.Outgoings { + domainStats.Outgoings[id], _, err = preProcessGroupInfo(domainMon) + if err != nil { + return nil, nil, err + } + } + domainStats.OutgoingGroupSumStat = preProcessGroupSumStat(domainsMon.OutgoingGroupSumStat) + return domainStats, groupInfos, nil +} + +func (a *priorityAdvisor) splitPlan(mbPlan *plan.MBPlan, groupInfos *groupInfo) *plan.MBPlan { + for groupKey, ccdPlan := range mbPlan.MBGroups { + if !isCombinedGroup(groupKey) { + continue + } + for _, domainGroupInfos := range groupInfos.DomainGroups { + for group, groupInfo := range domainGroupInfos[groupKey] { + for ccd := range groupInfo { + if _, exists := ccdPlan[ccd]; exists { + if mbPlan.MBGroups[group] == nil { + mbPlan.MBGroups[group] = make(plan.GroupCCDPlan) + } + mbPlan.MBGroups[group][ccd] = ccdPlan[ccd] + } + } + } + } + delete(mbPlan.MBGroups, groupKey) + } + return mbPlan +} + +func New(emitter metrics.MetricEmitter, domains domain.Domains, ccdMinMB, ccdMaxMB int, defaultDomainCapacity int, + capPercent int, XDomGroups []string, groupNeverThrottles []string, + groupCapacity map[string]int, +) Advisor { + return &priorityAdvisor{ + uniqPriorityAdvisor: newUniqPriorityAdvisor(emitter, + domains, ccdMinMB, ccdMaxMB, defaultDomainCapacity, capPercent, + XDomGroups, groupNeverThrottles, + groupCapacity, + ), + } +} diff --git a/pkg/agent/qrm-plugins/mb/advisor/priority_advisor_test.go b/pkg/agent/qrm-plugins/mb/advisor/priority_advisor_test.go new file mode 100644 index 0000000000..e4c7c79f1b --- /dev/null +++ b/pkg/agent/qrm-plugins/mb/advisor/priority_advisor_test.go @@ -0,0 +1,277 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package advisor + +import ( + "reflect" + "testing" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/advisor/priority" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/monitor" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/plan" +) + +func Test_priorityGroupDecorator_combinedDomainStats(t *testing.T) { + t.Parallel() + priority.GetInstance().AddWeight("machine", 9_000) + tests := []struct { + name string + domainsMon *monitor.DomainStats + wantStats *monitor.DomainStats + wantGroupInfo *groupInfo + wantErr bool + }{ + { + name: "single group per weight - no combination", + domainsMon: &monitor.DomainStats{ + Incomings: map[int]monitor.GroupMBStats{ + 0: { + "dedicated-60": map[int]monitor.MBInfo{ + 0: {LocalMB: 5_000, RemoteMB: 5_000, TotalMB: 10_000}, + 1: {LocalMB: 3_000, RemoteMB: 2_000, TotalMB: 5_000}, + }, + }, + 1: { + "share-50": map[int]monitor.MBInfo{ + 2: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + Outgoings: map[int]monitor.GroupMBStats{ + 0: { + "dedicated-60": map[int]monitor.MBInfo{ + 0: {LocalMB: 5_000, RemoteMB: 5_000, TotalMB: 10_000}, + 1: {LocalMB: 3_000, RemoteMB: 2_000, TotalMB: 5_000}, + }, + }, + 1: { + "share-50": map[int]monitor.MBInfo{ + 2: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + OutgoingGroupSumStat: map[string][]monitor.MBInfo{ + "dedicated-60": { + {LocalMB: 5_000, RemoteMB: 5_000, TotalMB: 10_000}, + {LocalMB: 0, RemoteMB: 0, TotalMB: 0}, + }, + "share-50": { + {LocalMB: 0, RemoteMB: 0, TotalMB: 0}, + {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + wantStats: &monitor.DomainStats{ + Incomings: map[int]monitor.GroupMBStats{ + 0: { + "dedicated-60": map[int]monitor.MBInfo{ + 0: {LocalMB: 5_000, RemoteMB: 5_000, TotalMB: 10_000}, + 1: {LocalMB: 3_000, RemoteMB: 2_000, TotalMB: 5_000}, + }, + }, + 1: { + "share-50": map[int]monitor.MBInfo{ + 2: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + Outgoings: map[int]monitor.GroupMBStats{ + 0: { + "dedicated-60": map[int]monitor.MBInfo{ + 0: {LocalMB: 5_000, RemoteMB: 5_000, TotalMB: 10_000}, + 1: {LocalMB: 3_000, RemoteMB: 2_000, TotalMB: 5_000}, + }, + }, + 1: { + "share-50": map[int]monitor.MBInfo{ + 2: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + OutgoingGroupSumStat: map[string][]monitor.MBInfo{ + "dedicated-60": { + {LocalMB: 5_000, RemoteMB: 5_000, TotalMB: 10_000}, + {LocalMB: 0, RemoteMB: 0, TotalMB: 0}, + }, + "share-50": { + {LocalMB: 0, RemoteMB: 0, TotalMB: 0}, + {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + wantGroupInfo: &groupInfo{ + DomainGroups: map[int]domainGroupMapping{ + 0: {}, + 1: {}, + }, + }, + wantErr: false, + }, + { + name: "multiple groups with same weight - combine groups", + domainsMon: &monitor.DomainStats{ + Incomings: map[int]monitor.GroupMBStats{ + 0: { + "dedicated": map[int]monitor.MBInfo{ + 0: {LocalMB: 10_000, RemoteMB: 5_000, TotalMB: 15_000}, + }, + "machine": map[int]monitor.MBInfo{ + 1: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + Outgoings: 
map[int]monitor.GroupMBStats{ + 0: { + "dedicated": map[int]monitor.MBInfo{ + 0: {LocalMB: 10_000, RemoteMB: 5_000, TotalMB: 15_000}, + }, + "machine": map[int]monitor.MBInfo{ + 1: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + OutgoingGroupSumStat: map[string][]monitor.MBInfo{ + "dedicated": { + {LocalMB: 10_000, RemoteMB: 5_000, TotalMB: 15_000}, + }, + "machine": { + {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + wantStats: &monitor.DomainStats{ + Incomings: map[int]monitor.GroupMBStats{ + 0: { + "combined-9000": map[int]monitor.MBInfo{ + 0: {LocalMB: 10_000, RemoteMB: 5_000, TotalMB: 15_000}, + 1: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + Outgoings: map[int]monitor.GroupMBStats{ + 0: { + "combined-9000": map[int]monitor.MBInfo{ + 0: {LocalMB: 10_000, RemoteMB: 5_000, TotalMB: 15_000}, + 1: {LocalMB: 8_000, RemoteMB: 4_000, TotalMB: 12_000}, + }, + }, + }, + OutgoingGroupSumStat: map[string][]monitor.MBInfo{ + "combined-9000": { + {LocalMB: 18_000, RemoteMB: 9_000, TotalMB: 27_000}, + }, + }, + }, + wantGroupInfo: &groupInfo{ + DomainGroups: map[int]domainGroupMapping{ + 0: { + "combined-9000": { + "dedicated": {0: struct{}{}}, + "machine": {1: struct{}{}}, + }, + }, + }, + }, + wantErr: false, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + d := &priorityAdvisor{} + gotStats, gotGroupInfo, err := d.combinedDomainStats(tt.domainsMon) + if (err != nil) != tt.wantErr { + t.Errorf("combinedDomainStats() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(gotStats, tt.wantStats) { + t.Errorf("combinedDomainStats() gotStats = %v, want %v", gotStats, tt.wantStats) + } + if !reflect.DeepEqual(gotGroupInfo, tt.wantGroupInfo) { + t.Errorf("combinedDomainStats() gotGroupInfo = %v, want %v", gotGroupInfo, tt.wantGroupInfo) + } + }) + } +} + +func Test_priorityGroupDecorator_splitPlan(t *testing.T) { + t.Parallel() + tests := []struct { + name string + mbPlan *plan.MBPlan + groupInfos *groupInfo + want *plan.MBPlan + }{ + { + name: "no combined groups - no change", + mbPlan: &plan.MBPlan{ + MBGroups: map[string]plan.GroupCCDPlan{ + "dedicated-60": {0: 5_000, 1: 4_000}, + "machine-60": {2: 6_000, 3: 5_500}, + }, + }, + groupInfos: &groupInfo{ + DomainGroups: map[int]domainGroupMapping{ + 0: {}, + 1: {}, + }, + }, + want: &plan.MBPlan{ + MBGroups: map[string]plan.GroupCCDPlan{ + "dedicated-60": {0: 5_000, 1: 4_000}, + "machine-60": {2: 6_000, 3: 5_500}, + }, + }, + }, + { + name: "single combined group - split to original groups", + mbPlan: &plan.MBPlan{ + MBGroups: map[string]plan.GroupCCDPlan{ + "combined-9000": {0: 5_000, 1: 4_000}, + }, + }, + groupInfos: &groupInfo{ + DomainGroups: map[int]domainGroupMapping{ + 0: { + "combined-9000": { + "dedicated-60": {0: struct{}{}}, + "machine-60": {1: struct{}{}}, + }, + }, + }, + }, + want: &plan.MBPlan{ + MBGroups: map[string]plan.GroupCCDPlan{ + "dedicated-60": {0: 5_000}, + "machine-60": {1: 4_000}, + }, + }, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + d := &priorityAdvisor{} + got := d.splitPlan(tt.mbPlan, tt.groupInfos) + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("splitPlan() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/mb/advisor/domain_advisor.go b/pkg/agent/qrm-plugins/mb/advisor/uniq_priority_advisor.go similarity index 65% rename from pkg/agent/qrm-plugins/mb/advisor/domain_advisor.go rename 
to pkg/agent/qrm-plugins/mb/advisor/uniq_priority_advisor.go index 1a19eddf00..9da489c177 100644 --- a/pkg/agent/qrm-plugins/mb/advisor/domain_advisor.go +++ b/pkg/agent/qrm-plugins/mb/advisor/uniq_priority_advisor.go @@ -36,7 +36,9 @@ import ( "github.com/kubewharf/katalyst-core/pkg/util/general" ) -type domainAdvisor struct { +// uniqPriorityAdvisor is an priorityAdvisor which can only work with resctrl groups of distinct priorities; +// due to this limitation, it should not be used directly. +type uniqPriorityAdvisor struct { emitter metrics.MetricEmitter domains domain.Domains @@ -59,114 +61,123 @@ type domainAdvisor struct { ccdDistribute distributor.Distributor } -func (d *domainAdvisor) GetPlan(ctx context.Context, domainsMon *monitor.DomainStats) (*plan.MBPlan, error) { - d.emitStatsMtrics(domainsMon) +func (a *uniqPriorityAdvisor) GetPlan(ctx context.Context, domainsMon *monitor.DomainStats) (*plan.MBPlan, error) { + a.emitStatsMtrics(domainsMon) // identify mb incoming usage etc since the capacity applies to incoming traffic - domIncomingInfo, err := d.calcIncomingDomainStats(ctx, domainsMon) + domIncomingInfo, err := a.calcIncomingDomainStats(ctx, domainsMon) if err != nil { return nil, errors.Wrap(err, "failed to get plan") } if klog.V(6).Enabled() { domTotalMBs := getDomainTotalMBs(domIncomingInfo) - general.InfofV(6, "[mbm] [advisor] domains incoming total: %v", domTotalMBs) + general.InfofV(6, "[mbm] [priorityAdvisor] domains incoming total: %v", domTotalMBs) } - d.emitDomIncomingStatSummaryMetrics(domIncomingInfo) + a.emitDomIncomingStatSummaryMetrics(domIncomingInfo) // based on mb incoming usage info, decide incoming quotas (i.e. targets) - domIncomingQuotas := d.getIncomingDomainQuotas(ctx, domIncomingInfo) + domIncomingQuotas := a.getIncomingDomainQuotas(ctx, domIncomingInfo) groupedDomIncomingTargets := domIncomingQuotas.GetGroupedDomainSetting() if klog.V(6).Enabled() { - general.InfofV(6, "[mbm] [advisor] group-domain incoming targets: %s", + general.InfofV(6, "[mbm] [priorityAdvisor] group-domain incoming targets: %s", stringify(groupedDomIncomingTargets)) } - d.emitIncomingTargets(groupedDomIncomingTargets) + a.emitIncomingTargets(groupedDomIncomingTargets) // for each group, based on incoming targets, decide what the outgoing targets are var groupedDomOutgoingTargets map[string][]int - groupedDomOutgoingTargets, err = d.deriveOutgoingTargets(ctx, domainsMon.OutgoingGroupSumStat, groupedDomIncomingTargets) + groupedDomOutgoingTargets, err = a.deriveOutgoingTargets(ctx, domainsMon.OutgoingGroupSumStat, groupedDomIncomingTargets) if err != nil { return nil, errors.Wrap(err, "failed to get plan") } if klog.V(6).Enabled() { - general.InfofV(6, "[mbm] [advisor] group-domain outgoing targets: %s", + general.InfofV(6, "[mbm] [priorityAdvisor] group-domain outgoing targets: %s", stringify(groupedDomOutgoingTargets)) } - d.emitOutgoingTargets(groupedDomOutgoingTargets) + a.emitOutgoingTargets(groupedDomOutgoingTargets) // leverage the current observed outgoing stats (and implicit previous outgoing mb) // to adjust th outgoing mb hopeful to reach the desired target groupedDomOutgoings := domainsMon.OutgoingGroupSumStat - groupedDomainOutgoingQuotas := d.adjust(ctx, groupedDomOutgoingTargets, groupedDomOutgoings, d.capPercent) + groupedDomainOutgoingQuotas := a.adjust(ctx, groupedDomOutgoingTargets, groupedDomOutgoings, a.capPercent) if klog.V(6).Enabled() { - general.InfofV(6, "[mbm] [advisor] group-domain outgoing quotas: %s", + general.InfofV(6, "[mbm] [priorityAdvisor] 
group-domain outgoing quotas adjusted: %s", stringify(groupedDomainOutgoingQuotas)) } - d.emitAdjustedOutgoingTargets(groupedDomainOutgoingQuotas) + a.emitAdjustedOutgoingTargets(groupedDomainOutgoingQuotas) // split outgoing mb to ccd level - groupedCCDOutgoingQuotas := d.distributeToCCDs(ctx, groupedDomainOutgoingQuotas, domainsMon.Outgoings) + groupedCCDOutgoingQuotas := a.distributeToCCDs(ctx, groupedDomainOutgoingQuotas, domainsMon.Outgoings) + if klog.V(6).Enabled() { + general.InfofV(6, "[mbm] [priorityAdvisor] group-ccd outgoing quotas: %v", groupedCCDOutgoingQuotas) + } rawPlan := convertToPlan(groupedCCDOutgoingQuotas) - d.emitRawPlan(rawPlan) + if klog.V(6).Enabled() { + general.InfofV(6, "[mbm] [priorityAdvisor] raw plan: %s", rawPlan) + } + a.emitRawPlan(rawPlan) // finalize plan with never-throttle groups and ccb mb checks - checkedPlan := applyPlanCCDBoundsChecks(rawPlan, d.ccdMinMB, d.ccdMaxMB) - updatePlan := maskPlanWithNoThrottles(checkedPlan, d.groupNeverThrottles, d.getNoThrottleMB()) - d.emitUpdatePlan(updatePlan) + checkedPlan := applyPlanCCDBoundsChecks(rawPlan, a.ccdMinMB, a.ccdMaxMB) + updatePlan := maskPlanWithNoThrottles(checkedPlan, a.groupNeverThrottles, a.getNoThrottleMB()) + if klog.V(6).Enabled() { + general.InfofV(6, "[mbm] [priorityAdvisor] mb plan update: %s", updatePlan) + } + a.emitUpdatePlan(updatePlan) return updatePlan, nil } -func (d *domainAdvisor) getNoThrottleMB() int { - if d.ccdMaxMB > 0 { - return d.ccdMaxMB +func (a *uniqPriorityAdvisor) getNoThrottleMB() int { + if a.ccdMaxMB > 0 { + return a.ccdMaxMB } - return d.defaultDomainCapacity + return a.defaultDomainCapacity } -func (d *domainAdvisor) adjust(_ context.Context, +func (a *uniqPriorityAdvisor) adjust(_ context.Context, groupedSettings map[string][]int, observed map[string][]monitor.MBInfo, capPercent int, ) map[string][]int { result := map[string][]int{} activeGroups := sets.String{} for group, values := range groupedSettings { currents := getGroupOutgoingTotals(group, observed) - if _, ok := d.adjusters[group]; !ok { - d.adjusters[group] = adjuster.New(capPercent) + if _, ok := a.adjusters[group]; !ok { + a.adjusters[group] = adjuster.New(capPercent) } - result[group] = d.adjusters[group].AdjustOutgoingTargets(values, currents) + result[group] = a.adjusters[group].AdjustOutgoingTargets(values, currents) activeGroups.Insert(group) } // clean up to avoid memory leak if len(groupedSettings) > 0 { - for group := range d.adjusters { + for group := range a.adjusters { if activeGroups.Has(group) { continue } - delete(d.adjusters, group) + delete(a.adjusters, group) } } return result } -func (d *domainAdvisor) getIncomingDomainQuotas(_ context.Context, domIncomingInfo map[int]*resource.MBGroupIncomingStat, +func (a *uniqPriorityAdvisor) getIncomingDomainQuotas(_ context.Context, domIncomingInfo map[int]*resource.MBGroupIncomingStat, ) resource.DomQuotas { domQuotas := map[int]resource.GroupSettings{} for dom, incomingInfo := range domIncomingInfo { - domQuotas[dom] = d.quotaStrategy.GetGroupQuotas(incomingInfo) + domQuotas[dom] = a.quotaStrategy.GetGroupQuotas(incomingInfo) } return domQuotas } -func (d *domainAdvisor) calcIncomingDomainStats(ctx context.Context, mon *monitor.DomainStats, +func (a *uniqPriorityAdvisor) calcIncomingDomainStats(ctx context.Context, mon *monitor.DomainStats, ) (map[int]*resource.MBGroupIncomingStat, error) { incomingInfoOfDomains := make(map[int]*resource.MBGroupIncomingStat) var err error for domID, incomingStats := range mon.Incomings { - 
incomingInfoOfDomains[domID], err = d.calcIncomingStat(domID, incomingStats) + incomingInfoOfDomains[domID], err = a.calcIncomingStat(domID, incomingStats) if err != nil { return nil, errors.Wrap(err, "failed to calc domain quotas") } @@ -174,8 +185,8 @@ func (d *domainAdvisor) calcIncomingDomainStats(ctx context.Context, mon *monito return incomingInfoOfDomains, nil } -func (d *domainAdvisor) calcIncomingStat(domID int, incomingStats monitor.GroupMBStats) (*resource.MBGroupIncomingStat, error) { - capacity, err := d.getEffectiveCapacity(domID, incomingStats) +func (a *uniqPriorityAdvisor) calcIncomingStat(domID int, incomingStats monitor.GroupMBStats) (*resource.MBGroupIncomingStat, error) { + capacity, err := a.getEffectiveCapacity(domID, incomingStats) if err != nil { return nil, errors.Wrap(err, "failed to calc domain capacity") } @@ -185,15 +196,15 @@ func (d *domainAdvisor) calcIncomingStat(domID int, incomingStats monitor.GroupM } // getEffectiveCapacity gets the effective memory bandwidth capacity of specified domain, with its given resource usage -func (d *domainAdvisor) getEffectiveCapacity(domID int, domIncomingStats monitor.GroupMBStats) (int, error) { - if _, ok := d.domains[domID]; !ok { +func (a *uniqPriorityAdvisor) getEffectiveCapacity(domID int, domIncomingStats monitor.GroupMBStats) (int, error) { + if _, ok := a.domains[domID]; !ok { return 0, fmt.Errorf("unknown domain %d", domID) } - return getMinEffectiveCapacity(d.defaultDomainCapacity, d.groupCapacityInMB, domIncomingStats), nil + return getMinEffectiveCapacity(a.defaultDomainCapacity, a.groupCapacityInMB, domIncomingStats), nil } -func (d *domainAdvisor) deriveOutgoingTargets(_ context.Context, +func (a *uniqPriorityAdvisor) deriveOutgoingTargets(_ context.Context, outgoingGroupSumStat map[string][]monitor.MBInfo, incomingTargets map[string][]int, ) (map[string][]int, error) { // for each group: based on incoming targets, decide what the outgoing targets are @@ -206,17 +217,17 @@ func (d *domainAdvisor) deriveOutgoingTargets(_ context.Context, continue } - if d.xDomGroups.Has(group) { + if a.xDomGroups.Has(group) { localRatio := make([]float64, len(domSums)) for i, domSum := range domSums { // limit the excessive remote traffic if applicable - remoteTarget := d.domains[i].GetAlienMBLimit() + remoteTarget := a.domains[i].GetAlienMBLimit() if remoteTarget > domSum.RemoteMB { remoteTarget = domSum.RemoteMB } localRatio[i] = float64(domSum.LocalMB) / float64(domSum.LocalMB+remoteTarget) } - outgoingTargets, err := d.flower.InvertFlow(localRatio, groupIncomingTargets) + outgoingTargets, err := a.flower.InvertFlow(localRatio, groupIncomingTargets) if err != nil { return nil, errors.Wrap(err, "failed to get sourcing outgoing out of desired incoming targets") } @@ -230,14 +241,14 @@ func (d *domainAdvisor) deriveOutgoingTargets(_ context.Context, return result, nil } -func (d *domainAdvisor) distributeToCCDs(_ context.Context, +func (a *uniqPriorityAdvisor) distributeToCCDs(_ context.Context, quotas map[string][]int, outgoingStat map[int]monitor.GroupMBStats, ) map[string]map[int]int { result := map[string]map[int]int{} for group, domQuota := range quotas { // each domain is treated independently for domID, domTotal := range domQuota { - ccdDistributions := d.domainDistributeGroup(domID, group, domTotal, outgoingStat) + ccdDistributions := a.domainDistributeGroup(domID, group, domTotal, outgoingStat) if len(ccdDistributions) == 0 { continue } @@ -253,7 +264,7 @@ func (d *domainAdvisor) distributeToCCDs(_ context.Context, 
return result } -func (d *domainAdvisor) domainDistributeGroup(domID int, group string, +func (a *uniqPriorityAdvisor) domainDistributeGroup(domID int, group string, domTotal int, outgoingStat map[int]monitor.GroupMBStats, ) map[int]int { weights := map[int]int{} @@ -261,7 +272,11 @@ func (d *domainAdvisor) domainDistributeGroup(domID int, group string, for ccd, stat := range groupStat { weights[ccd] = stat.TotalMB } - domCCDQuotas := d.ccdDistribute.Distribute(domTotal, weights) + domCCDQuotas := a.ccdDistribute.Distribute(domTotal, weights) + if klog.V(6).Enabled() { + general.InfofV(6, "[mbm] [priorityAdvisor] domain %d, group %s, total %v, weights %v, distribute to ccd quotas: %v", + domID, group, domTotal, weights, domCCDQuotas) + } result := map[int]int{} for ccd, v := range domCCDQuotas { result[ccd] = v @@ -269,15 +284,15 @@ func (d *domainAdvisor) domainDistributeGroup(domID int, group string, return result } -func New(emitter metrics.MetricEmitter, domains domain.Domains, ccdMinMB, ccdMaxMB int, defaultDomainCapacity int, +func newUniqPriorityAdvisor(emitter metrics.MetricEmitter, domains domain.Domains, ccdMinMB, ccdMaxMB int, defaultDomainCapacity int, capPercent int, XDomGroups []string, groupNeverThrottles []string, groupCapacity map[string]int, -) Advisor { +) *uniqPriorityAdvisor { // do not throttle built-in "/" anytime notThrottles := sets.NewString("/") notThrottles.Insert(groupNeverThrottles...) - return &domainAdvisor{ + return &uniqPriorityAdvisor{ emitter: emitter, domains: domains, xDomGroups: sets.NewString(XDomGroups...), diff --git a/pkg/agent/qrm-plugins/mb/advisor/domain_advisor_test.go b/pkg/agent/qrm-plugins/mb/advisor/uniq_priority_advisor_test.go similarity index 99% rename from pkg/agent/qrm-plugins/mb/advisor/domain_advisor_test.go rename to pkg/agent/qrm-plugins/mb/advisor/uniq_priority_advisor_test.go index 6362a4214c..a2e421f9c6 100644 --- a/pkg/agent/qrm-plugins/mb/advisor/domain_advisor_test.go +++ b/pkg/agent/qrm-plugins/mb/advisor/uniq_priority_advisor_test.go @@ -92,7 +92,7 @@ func Test_domainAdvisor_getEffectiveCapacity(t *testing.T) { tt := tt t.Run(tt.name, func(t *testing.T) { t.Parallel() - d := &domainAdvisor{ + d := &uniqPriorityAdvisor{ domains: tt.fields.domains, xDomGroups: tt.fields.xDomGroups, groupCapacityInMB: tt.fields.groupCapacityInMB, @@ -226,7 +226,7 @@ func Test_domainAdvisor_calcIncomingQuotas(t *testing.T) { tt := tt t.Run(tt.name, func(t *testing.T) { t.Parallel() - d := &domainAdvisor{ + d := &uniqPriorityAdvisor{ domains: tt.fields.domains, xDomGroups: tt.fields.XDomGroups, groupCapacityInMB: tt.fields.GroupCapacityInMB, @@ -515,7 +515,7 @@ func Test_domainAdvisor_GetPlan(t *testing.T) { tt := tt t.Run(tt.name, func(t *testing.T) { t.Parallel() - d := &domainAdvisor{ + d := &uniqPriorityAdvisor{ domains: tt.fields.domains, defaultDomainCapacity: tt.fields.defaultDomainCapacity, capPercent: 100, diff --git a/pkg/agent/qrm-plugins/mb/policy/plugin.go b/pkg/agent/qrm-plugins/mb/policy/plugin.go index cba372d627..15e241cadb 100644 --- a/pkg/agent/qrm-plugins/mb/policy/plugin.go +++ b/pkg/agent/qrm-plugins/mb/policy/plugin.go @@ -32,6 +32,7 @@ import ( "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/advisor" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/advisor/priority" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/allocator" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/domain" 
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/mb/monitor" @@ -119,13 +120,19 @@ func (m *MBPlugin) Start() (err error) { return nil } + // ensure the extra resctrl groups registered with their priorities + for group, weight := range m.conf.ExtraGroupPriorities { + general.Infof("mbm: registering extra group %s, priority %d", group, weight) + priority.GetInstance().AddWeight(group, weight) + } + // initializing advisor field is deferred as qos group mb capacities is known now + general.Infof("mbm: create advior able to treat multiple resctrl groups with identical priority") m.advisor = advisor.New(m.emitter, m.domains, m.conf.MinCCDMB, m.conf.MaxCCDMB, defaultMBDomainCapacity, m.conf.MBCapLimitPercent, m.conf.CrossDomainGroups, m.conf.MBQRMPluginConfig.NoThrottleGroups, - groupCapacities, - ) + groupCapacities) go func() { wait.Until(m.run, interval, m.chStop) diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/types.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/types.go index c620614c4e..4ec798a3d5 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/types.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/types.go @@ -28,6 +28,7 @@ const ( ControlKnowKeyMemoryOffloading MemoryControlKnobName = "memory_offloading" ControlKnowKeyDyingMemcgReclaim MemoryControlKnobName = "dying_memcg_reclaim" ControlKnobKeyMemoryNUMAHeadroom MemoryControlKnobName = "memory_numa_headroom" + ControlKnobKeyMemoryHigh MemoryControlKnobName = "memory_high" ) const ( diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go index 0840cf9125..7f565b68ce 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go @@ -273,6 +273,8 @@ func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyMemoryLimitInBytes, memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorMemoryLimitInBytes)) + memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyMemoryHigh, + memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorMemoryHigh)) memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyCPUSetMems, memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorCPUSetMems)) memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyDropCache, diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler.go index 021f017e44..570382804e 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler.go @@ -26,6 +26,7 @@ import ( "strconv" "time" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/samber/lo" "google.golang.org/grpc" "google.golang.org/grpc/metadata" @@ -392,6 +393,43 @@ func (p *DynamicPolicy) handleAdvisorResp(advisorResp *advisorsvc.ListAndWatchRe return nil } +func (p *DynamicPolicy) handleAdvisorMemoryHigh( + _ *config.Configuration, + _ interface{}, + _ *dynamicconfig.DynamicAgentConfiguration, + emitter metrics.MetricEmitter, + metaServer *metaserver.MetaServer, + entryName, subEntryName string, + calculationInfo *advisorsvc.CalculationInfo, podResourceEntries state.PodResourceEntries, +) error { + memoryHighStr := 
calculationInfo.CalculationResult.Values[string(memoryadvisor.ControlKnobKeyMemoryHigh)] + memoryHigh, err := strconv.ParseInt(memoryHighStr, 10, 64) + if err != nil { + return fmt.Errorf("parse %s: %s failed with error: %v", memoryadvisor.ControlKnobKeyMemoryHigh, memoryHighStr, err) + } + + if !cgroups.IsCgroup2UnifiedMode() { + general.Infof("memory.high is not supported in cgroupv1 mode") + return nil + } + + if calculationInfo.CgroupPath != "" { + if err = cgroupmgr.ApplyMemoryWithRelativePath(calculationInfo.CgroupPath, &common.MemoryData{ + HighInBytes: memoryHigh, + }); err != nil { + return fmt.Errorf("apply memory.high failed with error: %v", err) + } + + _ = emitter.StoreInt64(util.MetricNameMemoryHandleAdvisorMemoryHigh, memoryHigh, + metrics.MetricTypeNameRaw, metrics.ConvertMapToTags(map[string]string{ + "cgroupPath": calculationInfo.CgroupPath, + })...) + return nil + } + + return nil +} + func (p *DynamicPolicy) handleAdvisorMemoryLimitInBytes( _ *config.Configuration, _ interface{}, diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler_memory_high_test.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler_memory_high_test.go new file mode 100644 index 0000000000..5196f990d4 --- /dev/null +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler_memory_high_test.go @@ -0,0 +1,144 @@ +//go:build linux +// +build linux + +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package dynamicpolicy + +import ( + "sync" + "testing" + + "github.com/bytedance/mockey" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/stretchr/testify/assert" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/advisorsvc" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor" + "github.com/kubewharf/katalyst-core/pkg/metrics" + common "github.com/kubewharf/katalyst-core/pkg/util/cgroup/common" + cgroupmgr "github.com/kubewharf/katalyst-core/pkg/util/cgroup/manager" +) + +var memoryHighTestMutex sync.Mutex + +func TestDynamicPolicy_handleAdvisorMemoryHigh(t *testing.T) { + t.Parallel() + + t.Run("invalid memory high value", func(t *testing.T) { + t.Parallel() + memoryHighTestMutex.Lock() + defer memoryHighTestMutex.Unlock() + + p := &DynamicPolicy{} + calculationInfo := &advisorsvc.CalculationInfo{ + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryHigh): "invalid", + }, + }, + } + + err := p.handleAdvisorMemoryHigh(nil, nil, nil, metrics.DummyMetrics{}, nil, "", "", calculationInfo, nil) + assert.Error(t, err) + }) + + t.Run("cgroupv1 mode", func(t *testing.T) { + t.Parallel() + memoryHighTestMutex.Lock() + defer memoryHighTestMutex.Unlock() + defer mockey.UnPatchAll() + + mockey.Mock(cgroups.IsCgroup2UnifiedMode).IncludeCurrentGoRoutine().To(func() bool { + return false + }).Build() + + p := &DynamicPolicy{} + calculationInfo := &advisorsvc.CalculationInfo{ + CgroupPath: "/kubepods/besteffort", + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryHigh): "228000000000", + }, + }, + } + + err := p.handleAdvisorMemoryHigh(nil, nil, nil, metrics.DummyMetrics{}, nil, "", "", calculationInfo, nil) + assert.NoError(t, err) + }) + + t.Run("empty cgroup path", func(t *testing.T) { + t.Parallel() + memoryHighTestMutex.Lock() + defer memoryHighTestMutex.Unlock() + defer mockey.UnPatchAll() + + mockey.Mock(cgroups.IsCgroup2UnifiedMode).IncludeCurrentGoRoutine().To(func() bool { + return true + }).Build() + + p := &DynamicPolicy{} + calculationInfo := &advisorsvc.CalculationInfo{ + CgroupPath: "", + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryHigh): "228000000000", + }, + }, + } + + err := p.handleAdvisorMemoryHigh(nil, nil, nil, metrics.DummyMetrics{}, nil, "", "", calculationInfo, nil) + assert.NoError(t, err) + }) + + t.Run("success apply memory high", func(t *testing.T) { + t.Parallel() + memoryHighTestMutex.Lock() + defer memoryHighTestMutex.Unlock() + defer mockey.UnPatchAll() + + var capturedCgroupPath string + var capturedMemoryData *common.MemoryData + applyMemoryCalled := false + + mockey.Mock(cgroups.IsCgroup2UnifiedMode).IncludeCurrentGoRoutine().To(func() bool { + return true + }).Build() + mockey.Mock(cgroupmgr.ApplyMemoryWithRelativePath).IncludeCurrentGoRoutine().To(func(cgroupPath string, memoryData *common.MemoryData) error { + capturedCgroupPath = cgroupPath + capturedMemoryData = memoryData + applyMemoryCalled = true + return nil + }).Build() + + p := &DynamicPolicy{} + calculationInfo := &advisorsvc.CalculationInfo{ + CgroupPath: "/kubepods/besteffort", + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryHigh): "228000000000", + }, + }, + } + + err := p.handleAdvisorMemoryHigh(nil, nil, nil, 
metrics.DummyMetrics{}, nil, "", "", calculationInfo, nil) + assert.NoError(t, err) + assert.True(t, applyMemoryCalled) + assert.Equal(t, "/kubepods/besteffort", capturedCgroupPath) + assert.Equal(t, int64(228000000000), capturedMemoryData.HighInBytes) + }) +} diff --git a/pkg/agent/qrm-plugins/sriov/policy/dynamic.go b/pkg/agent/qrm-plugins/sriov/policy/dynamic.go index 64f6b11230..fa522e6b14 100644 --- a/pkg/agent/qrm-plugins/sriov/policy/dynamic.go +++ b/pkg/agent/qrm-plugins/sriov/policy/dynamic.go @@ -44,13 +44,28 @@ import ( cpudynamicpolicy "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy" ) +type podFilter func(req *pluginapi.ResourceRequest) bool + +func requiredAnnotationFilter(required map[string]string) podFilter { + return func(req *pluginapi.ResourceRequest) bool { + for requiredK, requiredV := range required { + if gotV, exists := req.Annotations[requiredK]; !exists || gotV != requiredV { + return false + } + } + + return true + } +} + type DynamicPolicy struct { sync.RWMutex *basePolicy - name string - started bool - emitter metrics.MetricEmitter + name string + started bool + emitter metrics.MetricEmitter + podFilters []podFilter policyConfig qrmconfig.SriovDynamicPolicyConfig } @@ -76,10 +91,16 @@ func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration return false, agent.ComponentStub{}, nil } + var podFilters []podFilter + if conf.SriovDynamicPolicyConfig.PodRequiredAnnotations != nil { + podFilters = append(podFilters, requiredAnnotationFilter(conf.SriovDynamicPolicyConfig.PodRequiredAnnotations)) + } + dynamicPolicy := &DynamicPolicy{ name: fmt.Sprintf("%s_%s", agentName, consts.SriovResourcePluginPolicyNameDynamic), emitter: wrappedEmitter, basePolicy: basePolicy, + podFilters: podFilters, policyConfig: conf.SriovDynamicPolicyConfig, } @@ -112,8 +133,6 @@ func (p *DynamicPolicy) Run(ctx context.Context) { <-ctx.Done() general.Infof("stopped") - - return } // GetAccompanyResourceTopologyHints get topology hints of accompany resources @@ -149,6 +168,11 @@ func (p *DynamicPolicy) GetAccompanyResourceTopologyHints(req *pluginapi.Resourc return nil } + if !p.shouldDynamicAllocateForPod(req) { + general.Infof("pod %s/%s filtered out, skip get accompany resource topology hints", req.PodNamespace, req.PodName) + return nil + } + request, _, err := qrmutil.GetPodAggregatedRequestResource(req) if err != nil { return fmt.Errorf("GetPodAggregatedRequestResource failed with error: %v", err) @@ -234,14 +258,6 @@ func (p *DynamicPolicy) AllocateAccompanyResource(req *pluginapi.ResourceRequest } }() - qosLevel, err := qrmutil.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req, p.podAnnotationKeptKeys, p.podLabelKeptKeys) - if err != nil { - err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, err) - general.Errorf("%s", err.Error()) - return err - } - // reuse allocation info allocated by same pod and container allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) if allocationInfo != nil { @@ -256,6 +272,20 @@ func (p *DynamicPolicy) AllocateAccompanyResource(req *pluginapi.ResourceRequest return nil } + // qrmutil.GetKatalystQoSLevelFromResourceReq would overwrite req.Annotations, so we should run before it + if !p.shouldDynamicAllocateForPod(req) { + general.Infof("pod %s/%s filtered out, skip allocate accompany resource", req.PodNamespace, req.PodName) + return nil + } + + qosLevel, err := 
qrmutil.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req, p.podAnnotationKeptKeys, p.podLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + req.PodNamespace, req.PodName, req.ContainerName, err) + general.Errorf("%s", err.Error()) + return err + } + // get request quantity of main resource: cpu request, _, err := qrmutil.GetPodAggregatedRequestResource(req) if err != nil { @@ -346,6 +376,16 @@ func (p *DynamicPolicy) ReleaseAccompanyResource(req *pluginapi.RemovePodRequest return nil } +func (p *DynamicPolicy) shouldDynamicAllocateForPod(req *pluginapi.ResourceRequest) bool { + for _, filter := range p.podFilters { + if !filter(req) { + return false + } + } + + return true +} + func (p *DynamicPolicy) addAllocationInfoToResponse(allocationInfo *state.AllocationInfo, resp *pluginapi.ResourceAllocationResponse) error { resourceAllocationInfo, err := p.generateResourceAllocationInfo(allocationInfo) if err != nil { diff --git a/pkg/agent/qrm-plugins/sriov/policy/dynamic_test.go b/pkg/agent/qrm-plugins/sriov/policy/dynamic_test.go index c05d15b130..8be962c150 100644 --- a/pkg/agent/qrm-plugins/sriov/policy/dynamic_test.go +++ b/pkg/agent/qrm-plugins/sriov/policy/dynamic_test.go @@ -302,6 +302,78 @@ func TestDynamicPolicy_GetAccompanyResourceTopologyHints(t *testing.T) { So(err, ShouldNotBeNil) So(err.Error(), ShouldEqual, "no available VFs") }) + + Convey("pod without required annotations", t, func() { + vfState, podEntries := state.GenerateDummyState(2, 2, nil) + policy := generateDynamicPolicy(t, false, true, vfState, podEntries) + policy.podFilters = []podFilter{requiredAnnotationFilter(map[string]string{"foo": "bar"})} + + req := &pluginapi.ResourceRequest{ + PodUid: "pod", + PodName: "pod", + ContainerName: "container", + ResourceName: string(v1.ResourceCPU), + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 32, + }, + } + + hints := &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + }, + } + + err := policy.GetAccompanyResourceTopologyHints(req, hints) + So(err, ShouldBeNil) + So(hints, ShouldResemble, &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + }, + }) + }) + + Convey("pod with reqired annotations but no available VFs with large size", t, func() { + vfState, podEntries := state.GenerateDummyState(2, 2, map[int]sets.Int{ + 0: sets.NewInt(0, 1), + 1: sets.NewInt(0, 1), + }) + policy := generateDynamicPolicy(t, false, true, vfState, podEntries) + policy.podFilters = []podFilter{requiredAnnotationFilter(map[string]string{"foo": "bar"})} + + req := &pluginapi.ResourceRequest{ + PodUid: "pod", + PodName: "pod", + ContainerName: "container", + Annotations: map[string]string{ + "foo": "bar", + }, + ResourceName: string(v1.ResourceCPU), + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 32, + }, + } + + hints := &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + }, + } + + err := policy.GetAccompanyResourceTopologyHints(req, hints) + + So(err, ShouldNotBeNil) + So(err.Error(), ShouldEqual, "no available VFs") + }) } func TestDynamicPolicy_AllocateAccompanyResource(t *testing.T) { @@ -309,7 +381,7 @@ func TestDynamicPolicy_AllocateAccompanyResource(t *testing.T) { Convey("dryRun", t, func() { vfState, podEntries := state.GenerateDummyState(2, 2, nil) - 
policy := generateDynamicPolicy(t, false, true, vfState, podEntries) + policy := generateDynamicPolicy(t, true, true, vfState, podEntries) req := &pluginapi.ResourceRequest{ PodUid: "pod", @@ -343,32 +415,6 @@ func TestDynamicPolicy_AllocateAccompanyResource(t *testing.T) { AllocatedQuantity: 32, AllocationResult: "1-32", }, - policy.ResourceName(): { - IsNodeResource: true, - IsScalarResource: true, - AllocatedQuantity: 1, - Annotations: map[string]string{ - netNsAnnotationKey: "/var/run/netns/ns2", - pciAnnotationKey: `[{"address":"0000:40:00.1","repName":"eth0_1","vfName":"eth0_1"}]`, - }, - Devices: []*pluginapi.DeviceSpec{ - { - HostPath: filepath.Join(rdmaDevicePrefix, "umad1"), - ContainerPath: filepath.Join(rdmaDevicePrefix, "umad1"), - Permissions: "rwm", - }, - { - HostPath: filepath.Join(rdmaDevicePrefix, "uverbs1"), - ContainerPath: filepath.Join(rdmaDevicePrefix, "uverbs1"), - Permissions: "rwm", - }, - { - HostPath: rdmaCmPath, - ContainerPath: rdmaCmPath, - Permissions: "rw", - }, - }, - }, }, }) }) @@ -584,4 +630,86 @@ func TestDynamicPolicy_AllocateAccompanyResource(t *testing.T) { So(err, ShouldNotBeNil) So(err.Error(), ShouldContainSubstring, "no available VFs") }) + + Convey("pod without required annotations", t, func() { + vfState, podEntries := state.GenerateDummyState(2, 2, nil) + policy := generateDynamicPolicy(t, false, true, vfState, podEntries) + policy.podFilters = []podFilter{requiredAnnotationFilter(map[string]string{"foo": "bar"})} + + req := &pluginapi.ResourceRequest{ + PodUid: "pod", + PodName: "pod", + ContainerName: "container", + ResourceName: string(v1.ResourceCPU), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0, 1}, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 32, + }, + } + + resp := &pluginapi.ResourceAllocationResponse{ + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + string(v1.ResourceCPU): { + AllocatedQuantity: 32, + AllocationResult: "1-32", + }, + }, + }, + } + + err := policy.AllocateAccompanyResource(req, resp) + So(err, ShouldBeNil) + So(resp.AllocationResult, ShouldResemble, &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + string(v1.ResourceCPU): { + AllocatedQuantity: 32, + AllocationResult: "1-32", + }, + }, + }) + }) + + Convey("pod with required annotations but no available VFs with large size", t, func() { + vfState, podEntries := state.GenerateDummyState(2, 2, map[int]sets.Int{ + 0: sets.NewInt(0, 1), + 1: sets.NewInt(0, 1), + }) + policy := generateDynamicPolicy(t, false, true, vfState, podEntries) + policy.podFilters = []podFilter{requiredAnnotationFilter(map[string]string{"foo": "bar"})} + + req := &pluginapi.ResourceRequest{ + PodUid: "pod", + PodName: "pod", + ContainerName: "container", + Annotations: map[string]string{ + "foo": "bar", + }, + ResourceName: string(v1.ResourceCPU), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0, 1}, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 32, + }, + } + + resp := &pluginapi.ResourceAllocationResponse{ + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + string(v1.ResourceCPU): { + AllocatedQuantity: 32, + AllocationResult: "1-32", + }, + }, + }, + } + + err := policy.AllocateAccompanyResource(req, resp) + + So(err, ShouldNotBeNil) + So(err.Error(), ShouldContainSubstring, "no available VFs") + }) } diff --git 
a/pkg/agent/qrm-plugins/util/consts.go b/pkg/agent/qrm-plugins/util/consts.go index 4b49145699..41c25842f2 100644 --- a/pkg/agent/qrm-plugins/util/consts.go +++ b/pkg/agent/qrm-plugins/util/consts.go @@ -51,6 +51,7 @@ const ( MetricNameMemoryHandleAdvisorContainerEntryFailed = "memory_handle_advisor_container_entry_failed" MetricNameMemoryHandleAdvisorExtraEntryFailed = "memory_handle_advisor_extra_entry_failed" MetricNameMemoryHandleAdvisorMemoryLimit = "memory_handle_advisor_memory_limit" + MetricNameMemoryHandleAdvisorMemoryHigh = "memory_handle_advisor_memory_high" MetricNameMemoryHandleAdvisorDropCache = "memory_handle_advisor_drop_cache" MetricNameMemoryHandleAdvisorCPUSetMems = "memory_handle_advisor_cpuset_mems" MetricNameMemoryHandlerAdvisorMemoryOffload = "memory_handler_advisor_memory_offloading" diff --git a/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go b/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go index 0309201b29..22bacb90ad 100644 --- a/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go +++ b/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go @@ -77,7 +77,7 @@ type kubeletPlugin struct { // NewKubeletReporterPlugin creates a kubelet reporter plugin func NewKubeletReporterPlugin(emitter metrics.MetricEmitter, metaServer *metaserver.MetaServer, - conf *config.Configuration, callback plugin.ListAndWatchCallback, + conf *config.Configuration, callback plugin.ListAndWatchCallback, cache *v1alpha1.GetReportContentResponse, ) (plugin.ReporterPlugin, error) { ctx, cancel := context.WithCancel(context.Background()) p := &kubeletPlugin{ @@ -90,6 +90,10 @@ func NewKubeletReporterPlugin(emitter metrics.MetricEmitter, metaServer *metaser cb: callback, StopControl: process.NewStopControl(time.Time{}), } + if cache != nil { + // initialize with the historical cache restored from checkpoint + p.latestReportContentResponse.Store(cache) + } var podResourcesFilter topology.PodResourcesFilter if conf.EnablePodResourcesFilter { diff --git a/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin_test.go b/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin_test.go index 64562c41e1..4b6a74daee 100644 --- a/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin_test.go +++ b/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin_test.go @@ -48,6 +48,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric" "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util" "github.com/kubewharf/katalyst-core/pkg/util/machine" "github.com/kubewharf/katalyst-core/pkg/util/native" ) @@ -300,7 +301,7 @@ func TestNewKubeletReporterPlugin(t *testing.T) { klog.Infof("Callback called with name: %s, resp: %#v", name, resp) } - plugin, err := NewKubeletReporterPlugin(metrics.DummyMetrics{}, meta, conf, callback) + plugin, err := NewKubeletReporterPlugin(metrics.DummyMetrics{}, meta, conf, callback, nil) assert.NoError(t, err) success := make(chan bool) @@ -336,7 +337,7 @@ func TestGetTopologyPolicyReportContent(t *testing.T) { klog.Infof("Callback called with name: %s, resp: %#v", name, resp) } - plugin, err := NewKubeletReporterPlugin(metrics.DummyMetrics{}, meta, conf, callback) + plugin, err := NewKubeletReporterPlugin(metrics.DummyMetrics{}, meta, conf, callback, nil) assert.NoError(t, err) kubePlugin := plugin.(*kubeletPlugin) @@ -360,7 +361,7 @@ func TestGetTopologyStatusContent(t *testing.T) { klog.Infof("Callback called with name: %s, resp: 
%#v", name, resp) } - plugin, err := NewKubeletReporterPlugin(metrics.DummyMetrics{}, meta, conf, callback) + plugin, err := NewKubeletReporterPlugin(metrics.DummyMetrics{}, meta, conf, callback, nil) assert.NoError(t, err) kubePlugin := plugin.(*kubeletPlugin) @@ -368,3 +369,28 @@ func TestGetTopologyStatusContent(t *testing.T) { _, err = kubePlugin.getReportContent(context.TODO()) assert.NoError(t, err) } + +func Test_kubeletPlugin_WithCache(t *testing.T) { + t.Parallel() + + conf := generateTestConfiguration(t, "") + meta := generateTestMetaServer() + + // mock a cache response + mockCache := &v1alpha1.GetReportContentResponse{ + Content: []*v1alpha1.ReportContent{ + { + GroupVersionKind: &util.CNRGroupVersionKind, + }, + }, + } + + plugin, err := NewKubeletReporterPlugin(metrics.DummyMetrics{}, meta, conf, nil, mockCache) + assert.NoError(t, err) + assert.NotNil(t, plugin) + + // Verify that the cache is properly set during initialization + cachedResponse := plugin.GetCache() + assert.NotNil(t, cachedResponse) + assert.Equal(t, mockCache, cachedResponse) +} diff --git a/pkg/agent/resourcemanager/fetcher/manager.go b/pkg/agent/resourcemanager/fetcher/manager.go index 8bba96404c..a5cb1c1127 100644 --- a/pkg/agent/resourcemanager/fetcher/manager.go +++ b/pkg/agent/resourcemanager/fetcher/manager.go @@ -141,7 +141,17 @@ func (m *ReporterPluginManager) registerInnerReporterPlugins(emitter metrics.Met continue } - curPlugin, err := initFn(emitter, metaServer, conf, callback) + var cache *v1alpha1.GetReportContentResponse + m.mutex.Lock() + // Try to get the cache from the stopped remote endpoint which was restored from the checkpoint. + // This prevents data omission during the initial reporting cycles if the agent restarts + // and the data hasn't fully ready yet. + if old, ok := m.endpoints[pluginName]; ok { + cache = old.GetCache() + } + m.mutex.Unlock() + + curPlugin, err := initFn(emitter, metaServer, conf, callback, cache) if err != nil { errList = append(errList, err) continue @@ -411,15 +421,10 @@ func (m *ReporterPluginManager) getReportContent(cacheFirst bool) (map[string]*v } func (m *ReporterPluginManager) writeCheckpoint(reportResponses map[string]*v1alpha1.GetReportContentResponse) error { - remoteResponses := make(map[string]*v1alpha1.GetReportContentResponse, 0) - // only write remote endpoint response to checkpoint - for name, response := range reportResponses { - if m.innerEndpoints.Has(name) { - continue - } - remoteResponses[name] = response - } - data := checkpoint.New(remoteResponses) + // Write all endpoint responses to the checkpoint to persist their states. + // This enables both remote and inner plugins to restore their cached responses upon agent restart, + // preventing potential data omission during the initial reporting cycles before data is fully ready. 
+ data := checkpoint.New(reportResponses) err := m.checkpointManager.CreateCheckpoint(reporterManagerCheckpoint, data) if err != nil { _ = m.emitter.StoreInt64("reporter_plugin_checkpoint_write_failed", 1, metrics.MetricTypeNameCount) diff --git a/pkg/agent/resourcemanager/fetcher/manager_test.go b/pkg/agent/resourcemanager/fetcher/manager_test.go index 8c40e670c5..b50651d3c4 100644 --- a/pkg/agent/resourcemanager/fetcher/manager_test.go +++ b/pkg/agent/resourcemanager/fetcher/manager_test.go @@ -197,6 +197,61 @@ func TestHealthz(t *testing.T) { _ = p.Stop() } +func TestInnerPluginCheckpointing(t *testing.T) { + t.Parallel() + + socketDir, err := tmpSocketDir() + require.NoError(t, err) + defer os.RemoveAll(socketDir) + + testReporter := reporter.NewReporterManagerStub() + m, err := NewReporterPluginManager(testReporter, metrics.DummyMetrics{}, nil, generateTestConfiguration(socketDir)) + require.NoError(t, err) + + // Mock a response for an inner plugin + mockContent := []*v1alpha1.ReportContent{ + { + GroupVersionKind: &testGroupVersionKind, + Field: []*v1alpha1.ReportField{ + { + FieldType: v1alpha1.FieldType_Spec, + FieldName: "inner_plugin_field", + Value: []byte("inner_plugin_value"), + }, + }, + }, + } + mockResponse := &v1alpha1.GetReportContentResponse{Content: mockContent} + + // Manually add it to the manager's endpoints to simulate a successful report + m.mutex.Lock() + m.innerEndpoints.Insert("system-reporter-plugin") + m.endpoints["system-reporter-plugin"] = plugin.NewStoppedRemoteEndpoint("system-reporter-plugin", mockResponse) + m.mutex.Unlock() + + // Write checkpoint + err = m.writeCheckpoint(map[string]*v1alpha1.GetReportContentResponse{ + "system-reporter-plugin": mockResponse, + }) + require.NoError(t, err) + + // Create a new manager to verify checkpoint reading + m2, err := NewReporterPluginManager(testReporter, metrics.DummyMetrics{}, nil, generateTestConfiguration(socketDir)) + require.NoError(t, err) + + // Verify that the inner plugin's endpoint was restored from the checkpoint + m2.mutex.Lock() + restoredEndpoint, ok := m2.endpoints["system-reporter-plugin"] + m2.mutex.Unlock() + + require.True(t, ok) + require.NotNil(t, restoredEndpoint) + + restoredCache := restoredEndpoint.GetCache() + require.NotNil(t, restoredCache) + reporterContentsEqual(t, mockContent, restoredCache.Content) +} + func setup(t *testing.T, ctx context.Context, content []*v1alpha1.ReportContent, callback plugin.ListAndWatchCallback, socketDir string, pluginSocketName string, reporter reporter.Manager) (registration.AgentPluginHandler, <-chan interface{}, skeleton.GenericPlugin) { m, updateChan := setupReporterManager(t, ctx, content, socketDir, callback, reporter) p := setupReporterPlugin(t, content, socketDir, pluginSocketName) diff --git a/pkg/agent/resourcemanager/fetcher/plugin/plugin.go b/pkg/agent/resourcemanager/fetcher/plugin/plugin.go index 6b4c7e5cfb..eef2e9085f 100644 --- a/pkg/agent/resourcemanager/fetcher/plugin/plugin.go +++ b/pkg/agent/resourcemanager/fetcher/plugin/plugin.go @@ -33,7 +33,7 @@ const ( fakePluginName = "fake-reporter-plugin" ) -type InitFunc func(emitter metrics.MetricEmitter, _ *metaserver.MetaServer, conf *config.Configuration, callback ListAndWatchCallback) (ReporterPlugin, error) +type InitFunc func(emitter metrics.MetricEmitter, _ *metaserver.MetaServer, conf *config.Configuration, callback ListAndWatchCallback, cache *v1alpha1.GetReportContentResponse) (ReporterPlugin, error) // ReporterPlugin performs report actions based on system or kubelet 
information. type ReporterPlugin interface { diff --git a/pkg/agent/resourcemanager/fetcher/system/systemplugin.go b/pkg/agent/resourcemanager/fetcher/system/systemplugin.go index dd4167ae8e..2c057e62f6 100644 --- a/pkg/agent/resourcemanager/fetcher/system/systemplugin.go +++ b/pkg/agent/resourcemanager/fetcher/system/systemplugin.go @@ -70,13 +70,14 @@ type systemPlugin struct { } func NewSystemReporterPlugin(emitter metrics.MetricEmitter, metaServer *metaserver.MetaServer, - conf *config.Configuration, _ plugin.ListAndWatchCallback, + conf *config.Configuration, _ plugin.ListAndWatchCallback, cache *v1alpha1.GetReportContentResponse, ) (plugin.ReporterPlugin, error) { p := &systemPlugin{ - conf: conf, - emitter: emitter, - metaServer: metaServer, - StopControl: process.NewStopControl(time.Time{}), + conf: conf, + emitter: emitter, + metaServer: metaServer, + latestReportContentResponse: cache, // initialize with the historical cache restored from checkpoint + StopControl: process.NewStopControl(time.Time{}), } return p, nil diff --git a/pkg/agent/resourcemanager/fetcher/system/systemplugin_test.go b/pkg/agent/resourcemanager/fetcher/system/systemplugin_test.go index 499aee1348..0cceeed34d 100644 --- a/pkg/agent/resourcemanager/fetcher/system/systemplugin_test.go +++ b/pkg/agent/resourcemanager/fetcher/system/systemplugin_test.go @@ -30,6 +30,7 @@ import ( "k8s.io/client-go/kubernetes/fake" internalfake "github.com/kubewharf/katalyst-api/pkg/client/clientset/versioned/fake" + "github.com/kubewharf/katalyst-api/pkg/protocol/reporterplugin/v1alpha1" "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/options" "github.com/kubewharf/katalyst-core/pkg/client" "github.com/kubewharf/katalyst-core/pkg/config" @@ -37,6 +38,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metaserver" "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric" "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util" ) func generateTestConfiguration(t *testing.T) *config.Configuration { @@ -64,10 +66,41 @@ func Test_systemPlugin_GetReportContent(t *testing.T) { metricsFetcher.SetByStringIndex(consts.MetricCPUCodeName, "Intel_SkyLake") metricsFetcher.SetByStringIndex(consts.MetricInfoIsVM, false) - plugin, err := NewSystemReporterPlugin(metrics.DummyMetrics{}, meta, conf, nil) + plugin, err := NewSystemReporterPlugin(metrics.DummyMetrics{}, meta, conf, nil, nil) assert.NoError(t, err) content, err := plugin.GetReportContent(context.Background()) assert.NoError(t, err) assert.NotNil(t, content) } + +func Test_systemPlugin_WithCache(t *testing.T) { + t.Parallel() + + genericClient := &client.GenericClientSet{ + KubeClient: fake.NewSimpleClientset(), + InternalClient: internalfake.NewSimpleClientset(), + DynamicClient: dynamicfake.NewSimpleDynamicClient(runtime.NewScheme()), + } + conf := generateTestConfiguration(t) + meta, err := metaserver.NewMetaServer(genericClient, metrics.DummyMetrics{}, conf) + assert.NoError(t, err) + + // mock a cache response + mockCache := &v1alpha1.GetReportContentResponse{ + Content: []*v1alpha1.ReportContent{ + { + GroupVersionKind: &util.CNRGroupVersionKind, + }, + }, + } + + plugin, err := NewSystemReporterPlugin(metrics.DummyMetrics{}, meta, conf, nil, mockCache) + assert.NoError(t, err) + assert.NotNil(t, plugin) + + // Verify that the cache is properly set during initialization + cachedResponse := plugin.GetCache() + assert.NotNil(t, cachedResponse) + assert.Equal(t, mockCache, cachedResponse) +} diff --git 
a/pkg/agent/resourcemanager/reporter/manager.go b/pkg/agent/resourcemanager/reporter/manager.go index 5a7cedf5fd..4f5a7d7d33 100644 --- a/pkg/agent/resourcemanager/reporter/manager.go +++ b/pkg/agent/resourcemanager/reporter/manager.go @@ -23,12 +23,14 @@ import ( v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/klog/v2" "github.com/kubewharf/katalyst-api/pkg/protocol/reporterplugin/v1alpha1" "github.com/kubewharf/katalyst-core/pkg/client" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/metaserver" "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/general" ) // Manager indicates the way that resources are updated, and it @@ -95,7 +97,8 @@ func (r *managerImpl) PushContents(ctx context.Context, responses map[string]*v1 for gvk, fields := range reportFieldsByGVK { u, ok := r.reporters[gvk] if !ok || u == nil { - return fmt.Errorf("reporter of gvk %s not found", gvk) + klog.Warningf("reporter of gvk %s not found", gvk) + continue } sort.SliceStable(fields, func(i, j int) bool { @@ -123,6 +126,13 @@ func (r *managerImpl) getReporter(genericClient *client.GenericClientSet, metaSe ) error { var errList []error for gvk, f := range initializers { + if conf != nil && conf.GenericReporterConfiguration != nil { + if !general.IsNameEnabled(gvk.Kind, nil, conf.GenericReporterConfiguration.AgentReporters) { + klog.Infof("reporter %q is disabled", gvk.Kind) + continue + } + } + reporter, err := f(genericClient, metaServer, emitter, conf) if err != nil { errList = append(errList, err) diff --git a/pkg/agent/resourcemanager/reporter/manager_test.go b/pkg/agent/resourcemanager/reporter/manager_test.go index d5c58a4c3b..12fc6f5a68 100644 --- a/pkg/agent/resourcemanager/reporter/manager_test.go +++ b/pkg/agent/resourcemanager/reporter/manager_test.go @@ -201,6 +201,33 @@ func Test_managerImpl_PushContents(t *testing.T) { }, wantErr: false, }, + { + name: "test-2", + fields: fields{ + conf: generateTestConfiguration(t), + reporters: map[v1.GroupVersionKind]Reporter{}, + }, + args: args{ + ctx: context.TODO(), + responses: map[string]*v1alpha1.GetReportContentResponse{ + "agent-2": { + Content: []*v1alpha1.ReportContent{ + { + GroupVersionKind: &testGroupVersionKindFirst, + Field: []*v1alpha1.ReportField{ + { + FieldType: v1alpha1.FieldType_Spec, + FieldName: "fieldName_b", + Value: []byte("Value_b"), + }, + }, + }, + }, + }, + }, + }, + wantErr: false, + }, } for _, tt := range tests { tt := tt diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/helper/memory.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/helper/memory.go index 2b4ad82e72..7994e8e454 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/helper/memory.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/helper/memory.go @@ -148,6 +148,39 @@ func reclaimedContainersFilter(ci *types.ContainerInfo) bool { return ci != nil && ci.QoSLevel == apiconsts.PodAnnotationQoSLevelReclaimedCores } +func nonReclaimedCoresContainersFilter(ci *types.ContainerInfo) bool { + return ci != nil && ci.QoSLevel != apiconsts.PodAnnotationQoSLevelReclaimedCores +} + +func GetNonReclaimedCoresContainers(metaReader metacache.MetaReader, metaServer *metaserver.MetaServer) ([]*types.ContainerInfo, error) { + var errList []error + + nonReclaimedCoresContainers := make([]*types.ContainerInfo, 0) + + metaReader.RangeContainer(func(podUID string, containerName string, containerInfo *types.ContainerInfo) bool { + if 
nonReclaimedCoresContainersFilter(containerInfo) { + ctx := context.Background() + pod, err := metaServer.GetPod(ctx, podUID) + if err != nil { + errList = append(errList, err) + return true + } + + if native.PodIsActive(pod) { + nonReclaimedCoresContainers = append(nonReclaimedCoresContainers, containerInfo) + } + } + return true + }) + + err := errors.NewAggregate(errList) + if err != nil { + return nil, err + } + + return nonReclaimedCoresContainers, nil +} + // GetActualNUMABindingNUMAsForReclaimedCores gets the actual numa binding numas according to the pod annotation // if numa with numa binding result is not -1, it will be added to numa binding numas func GetActualNUMABindingNUMAsForReclaimedCores(metaReader metacache.MetaReader) (machine.CPUSet, error) { diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/helper/memory_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/helper/memory_test.go index 9db22f4703..29cadf148c 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/helper/memory_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/helper/memory_test.go @@ -18,6 +18,7 @@ package helper import ( "reflect" + "sort" "testing" info "github.com/google/cadvisor/info/v1" @@ -700,3 +701,454 @@ func TestGetActualNUMABindingNUMAsForReclaimedCores(t *testing.T) { }) } } + +func TestGetNonReclaimedCoresContainers(t *testing.T) { + t.Parallel() + + containerInfoDedicatedCores := makeContainerInfo("pod0", "default", + "pod0", "container0", + consts.PodAnnotationQoSLevelDedicatedCores, map[string]string{ + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + }, + types.TopologyAwareAssignment{ + 0: machine.NewCPUSet(0), + }, 30<<30) + + containerInfoSharedCores := makeContainerInfo("pod1", "default", + "pod1", "container1", + consts.PodAnnotationQoSLevelSharedCores, nil, + nil, 20<<30) + + containerInfoReclaimedCores := makeContainerInfo("pod2", "default", + "pod2", "container2", + consts.PodAnnotationQoSLevelReclaimedCores, nil, + nil, 20<<30) + + containerInfoSystemCores := makeContainerInfo("pod3", "default", + "pod3", "container3", + consts.PodAnnotationQoSLevelSystemCores, nil, + nil, 10<<30) + + containerInfoDedicatedCores2 := makeContainerInfo("pod4", "default", + "pod4", "container4", + consts.PodAnnotationQoSLevelDedicatedCores, map[string]string{ + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + }, + types.TopologyAwareAssignment{ + 1: machine.NewCPUSet(24), + }, 30<<30) + + tests := []struct { + name string + podList []*v1.Pod + containerInfos []*types.ContainerInfo + wantContainers []*types.ContainerInfo + wantErr bool + }{ + { + name: "No containers", + podList: nil, + containerInfos: nil, + wantContainers: []*types.ContainerInfo{}, + wantErr: false, + }, + { + name: "Only reclaimed cores containers", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + Namespace: "default", + UID: "pod2", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container2"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container2", ContainerID: "container2"}, + }, + }, + }, + }, + containerInfos: 
[]*types.ContainerInfo{containerInfoReclaimedCores}, + wantContainers: []*types.ContainerInfo{}, + wantErr: false, + }, + { + name: "One dedicated cores container with active pod", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod0", + Namespace: "default", + UID: "pod0", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container0"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container0", ContainerID: "container0"}, + }, + }, + }, + }, + containerInfos: []*types.ContainerInfo{containerInfoDedicatedCores}, + wantContainers: []*types.ContainerInfo{containerInfoDedicatedCores}, + wantErr: false, + }, + { + name: "One shared cores container with active pod", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + UID: "pod1", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container1"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container1", ContainerID: "container1"}, + }, + }, + }, + }, + containerInfos: []*types.ContainerInfo{containerInfoSharedCores}, + wantContainers: []*types.ContainerInfo{containerInfoSharedCores}, + wantErr: false, + }, + { + name: "One system cores container with active pod", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod3", + Namespace: "default", + UID: "pod3", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container3"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container3", ContainerID: "container3"}, + }, + }, + }, + }, + containerInfos: []*types.ContainerInfo{containerInfoSystemCores}, + wantContainers: []*types.ContainerInfo{containerInfoSystemCores}, + wantErr: false, + }, + { + name: "Non-reclaimed container with failed pod", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod0", + Namespace: "default", + UID: "pod0", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container0"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodFailed, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container0", ContainerID: "container0"}, + }, + }, + }, + }, + containerInfos: []*types.ContainerInfo{containerInfoDedicatedCores}, + wantContainers: []*types.ContainerInfo{}, + wantErr: false, + }, + { + name: "Non-reclaimed container with succeeded pod", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod0", + Namespace: "default", + UID: "pod0", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container0"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container0", ContainerID: "container0"}, + }, + }, + }, + }, + containerInfos: []*types.ContainerInfo{containerInfoDedicatedCores}, + wantContainers: []*types.ContainerInfo{}, + wantErr: false, + }, + { + name: "Mixed containers with active pods", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod0", + Namespace: "default", + UID: "pod0", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container0"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container0", ContainerID: "container0"}, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + UID: "pod1", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + 
{Name: "container1"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container1", ContainerID: "container1"}, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + Namespace: "default", + UID: "pod2", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container2"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container2", ContainerID: "container2"}, + }, + }, + }, + }, + containerInfos: []*types.ContainerInfo{ + containerInfoDedicatedCores, + containerInfoSharedCores, + containerInfoReclaimedCores, + }, + wantContainers: []*types.ContainerInfo{ + containerInfoDedicatedCores, + containerInfoSharedCores, + }, + wantErr: false, + }, + { + name: "Multiple non-reclaimed containers with active pods", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod0", + Namespace: "default", + UID: "pod0", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container0"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container0", ContainerID: "container0"}, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod4", + Namespace: "default", + UID: "pod4", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container4"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container4", ContainerID: "container4"}, + }, + }, + }, + }, + containerInfos: []*types.ContainerInfo{ + containerInfoDedicatedCores, + containerInfoDedicatedCores2, + }, + wantContainers: []*types.ContainerInfo{ + containerInfoDedicatedCores, + containerInfoDedicatedCores2, + }, + wantErr: false, + }, + { + name: "Non-reclaimed container with pod not found", + podList: []*v1.Pod{}, + containerInfos: []*types.ContainerInfo{containerInfoDedicatedCores}, + wantContainers: nil, + wantErr: true, + }, + { + name: "Mixed containers with some inactive pods", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod0", + Namespace: "default", + UID: "pod0", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container0"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container0", ContainerID: "container0"}, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + UID: "pod1", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container1"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodFailed, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container1", ContainerID: "container1"}, + }, + }, + }, + }, + containerInfos: []*types.ContainerInfo{ + containerInfoDedicatedCores, + containerInfoSharedCores, + }, + wantContainers: []*types.ContainerInfo{containerInfoDedicatedCores}, + wantErr: false, + }, + { + name: "Non-reclaimed container with deleted pod", + podList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod0", + Namespace: "default", + UID: "pod0", + DeletionTimestamp: &metav1.Time{}, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container0"}, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "container0", ContainerID: "container0", State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{}, + }}, + }, + }, + }, + }, + containerInfos: 
[]*types.ContainerInfo{containerInfoDedicatedCores}, + wantContainers: []*types.ContainerInfo{}, + wantErr: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + conf := generateTestConfiguration(t, "/tmp/checkpoint", "/tmp/statefile") + metricsFetcher := metric.NewFakeMetricsFetcher(metrics.DummyMetrics{}) + metaServer := generateTestMetaServer(t, tt.podList, metricsFetcher) + metaReader, err := metacache.NewMetaCacheImp(conf, metricspool.DummyMetricsEmitterPool{}, metricsFetcher) + require.NoError(t, err) + err = metaReader.ClearContainers() + require.NoError(t, err) + + for _, info := range tt.containerInfos { + err := metaReader.SetContainerInfo(info.PodUID, info.ContainerName, info) + assert.NoError(t, err) + } + + gotContainers, err := GetNonReclaimedCoresContainers(metaReader, metaServer) + if (err != nil) != tt.wantErr { + t.Errorf("GetNonReclaimedCoresContainers() error = %v, wantErr %v", err, tt.wantErr) + return + } + + sortSlice := func(cs []*types.ContainerInfo) { + sort.Slice(cs, func(i, j int) bool { + return cs[i].PodUID < cs[j].PodUID + }) + } + sortSlice(gotContainers) + sortSlice(tt.wantContainers) + + if !apiequality.Semantic.DeepEqual(gotContainers, tt.wantContainers) { + t.Errorf("GetNonReclaimedCoresContainers() = %v, want %v", gotContainers, tt.wantContainers) + } + }) + } +} diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go index 0463ac0697..ed3bdbc9cf 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go @@ -1264,7 +1264,10 @@ func TestUpdate(t *testing.T) { ExtraEntries: []types.ExtraMemoryAdvices{ { CgroupPath: "/kubepods/besteffort", - Values: map[string]string{string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.Itoa(240 << 30)}, + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.Itoa(240 << 30), + string(memoryadvisor.ControlKnobKeyMemoryHigh): strconv.FormatInt(int64(0.95*float64(240<<30)), 10), + }, }, }, }, @@ -1318,7 +1321,10 @@ func TestUpdate(t *testing.T) { ExtraEntries: []types.ExtraMemoryAdvices{ { CgroupPath: "/kubepods/besteffort", - Values: map[string]string{string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.Itoa(184 << 30)}, + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.Itoa(184 << 30), + string(memoryadvisor.ControlKnobKeyMemoryHigh): strconv.FormatInt(187690070835, 10), + }, }, }, }, diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go index 7f5aeaf2e6..03c77ac89c 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go @@ -92,6 +92,12 @@ func (p *PolicyNUMAAware) Update() (err error) { return err } + nonReclaimedCoresContainers, err := helper.GetNonReclaimedCoresContainers(p.metaReader, p.metaServer) + if err != nil { + general.Errorf("GetNonReclaimedCoresContainers failed: %v", err) + return err + } + numaReclaimableMemory = make(map[int]float64) for _, numaID := range availNUMAs.ToSliceInt() { data, err = p.metaServer.GetNumaMetric(numaID, consts.MetricMemFreeNuma) @@ -129,16 +135,6 @@ func 
(p *PolicyNUMAAware) Update() (err error) { numaReclaimableMemory[numaID] = numaReclaimable } - for _, container := range reclaimedCoresContainers { - if container.MemoryRequest > 0 && len(container.TopologyAwareAssignments) > 0 { - reclaimableMemory += container.MemoryRequest - reclaimableMemoryPerNuma := container.MemoryRequest / float64(len(container.TopologyAwareAssignments)) - for numaID := range container.TopologyAwareAssignments { - numaReclaimableMemory[numaID] += reclaimableMemoryPerNuma - } - } - } - watermarkScaleFactor, err := p.metaServer.GetNodeMetric(consts.MetricMemScaleFactorSystem) if err != nil { general.Infof("Can not get system watermark scale factor: %v", err) @@ -148,18 +144,72 @@ func (p *PolicyNUMAAware) Update() (err error) { // reserve memory for watermark_scale_factor to make kswapd less happened systemWatermarkReserved := availNUMATotal * watermarkScaleFactor.Value / 10000 - memoryHeadroom := math.Max(reclaimableMemory-systemWatermarkReserved-reservedForAllocate, 0) - reduceRatio := 0.0 + totalReclaimMemoryRequest := 0.0 + totalReclaimMemoryUsage := 0.0 + for _, container := range reclaimedCoresContainers { + if container.MemoryRequest > 0 && len(container.TopologyAwareAssignments) > 0 { + // Get container memory usage + memUsage, errMem := p.metaServer.GetContainerMetric(container.PodUID, container.ContainerName, consts.MetricMemUsageContainer) + if errMem != nil { + general.Infof("Can not get container memory usage, podUID: %v, containerName: %v, %v", container.PodUID, container.ContainerName, errMem) + // If get memory usage failed, use MemoryRequest + memUsage.Value = container.MemoryRequest + } + totalReclaimMemoryRequest += container.MemoryRequest + totalReclaimMemoryUsage += memUsage.Value + } + } + requestRatio := 1.0 + if totalReclaimMemoryUsage > 0 { + requestRatio = totalReclaimMemoryRequest / totalReclaimMemoryUsage + } + + totalNonReclaimMemoryRequest := 0.0 + totalNonReclaimMemoryUsage := 0.0 + for _, container := range nonReclaimedCoresContainers { + if container.MemoryRequest > 0 { + memUsage, errMem := p.metaServer.GetContainerMetric(container.PodUID, container.ContainerName, consts.MetricMemUsageContainer) + if errMem != nil { + general.Infof("Can not get container memory usage, podUID: %v, containerName: %v, %v", container.PodUID, container.ContainerName, errMem) + memUsage.Value = container.MemoryRequest + } + totalNonReclaimMemoryRequest += container.MemoryRequest + totalNonReclaimMemoryUsage += memUsage.Value + } + } + nonReclaimedReservedRatio := 0.0 + if totalReclaimMemoryUsage+totalNonReclaimMemoryUsage > 0 { + nonReclaimedReservedRaw := math.Min(totalNonReclaimMemoryRequest-totalNonReclaimMemoryUsage, totalNonReclaimMemoryRequest*dynamicConfig.RequestBasedRatio) + nonReclaimedReservedRatio = math.Max(nonReclaimedReservedRaw/(totalReclaimMemoryUsage+totalNonReclaimMemoryUsage), 0) + } + nonReclaimedReserved := math.Max((availNUMATotal-reclaimableMemory)*nonReclaimedReservedRatio, 0) + + availMemory := math.Max(reclaimableMemory-systemWatermarkReserved-reservedForAllocate-nonReclaimedReserved, 0) + reclaimableToAvailRatio := 0.0 if reclaimableMemory > 0 { - reduceRatio = memoryHeadroom / reclaimableMemory + reclaimableToAvailRatio = availMemory / reclaimableMemory } + // convert reclaimable memory to memory headroom allNUMAs := p.metaServer.CPUDetails.NUMANodes().ToSliceInt() numaHeadroom := make(map[int]float64, len(allNUMAs)) totalNUMAHeadroom := 0.0 for numaID := range numaReclaimableMemory { - numaHeadroom[numaID] = 
numaReclaimableMemory[numaID] * reduceRatio + numaHeadroom[numaID] = numaReclaimableMemory[numaID] * reclaimableToAvailRatio * requestRatio totalNUMAHeadroom += numaHeadroom[numaID] + } + + for _, container := range reclaimedCoresContainers { + if container.MemoryRequest > 0 && len(container.TopologyAwareAssignments) > 0 { + totalNUMAHeadroom += container.MemoryRequest + headroomPerNuma := container.MemoryRequest / float64(len(container.TopologyAwareAssignments)) + for numaID := range container.TopologyAwareAssignments { + numaHeadroom[numaID] += headroomPerNuma + } + } + } + + for numaID := range numaHeadroom { general.InfoS("numa memory headroom", "NUMA-ID", numaID, "headroom", general.FormatMemoryQuantity(numaHeadroom[numaID])) } @@ -185,12 +235,17 @@ func (p *PolicyNUMAAware) Update() (err error) { general.InfoS("total memory reclaimable", "reclaimableMemory", general.FormatMemoryQuantity(reclaimableMemory), - "memoryHeadroom", general.FormatMemoryQuantity(memoryHeadroom), + "availMemory", general.FormatMemoryQuantity(availMemory), "ResourceUpperBound", general.FormatMemoryQuantity(p.essentials.ResourceUpperBound), "systemWatermarkReserved", general.FormatMemoryQuantity(systemWatermarkReserved), "reservedForAllocate", general.FormatMemoryQuantity(reservedForAllocate), "totalNUMAHeadroom", general.FormatMemoryQuantity(totalNUMAHeadroom), "numaHeadroom", numaHeadroom, + "totalReclaimMemoryRequest", general.FormatMemoryQuantity(totalReclaimMemoryRequest), + "totalReclaimMemoryUsage", general.FormatMemoryQuantity(totalReclaimMemoryUsage), + "totalNonReclaimMemoryRequest", general.FormatMemoryQuantity(totalNonReclaimMemoryRequest), + "totalNonReclaimMemoryUsage", general.FormatMemoryQuantity(totalNonReclaimMemoryUsage), + "requestRatio", requestRatio, ) return nil } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware_test.go index 0d5110514a..8e6adf08ff 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware_test.go @@ -109,7 +109,8 @@ func TestPolicyNUMAAware(t *testing.T) { }, memoryHeadroomConfiguration: &memoryheadroom.MemoryHeadroomConfiguration{ MemoryUtilBasedConfiguration: &memoryheadroom.MemoryUtilBasedConfiguration{ - CacheBasedRatio: 0.5, + CacheBasedRatio: 0.5, + RequestBasedRatio: 0.1, }, }, setFakeMetric: func(store *metric.FakeMetricsFetcher) { @@ -168,7 +169,8 @@ func TestPolicyNUMAAware(t *testing.T) { }, memoryHeadroomConfiguration: &memoryheadroom.MemoryHeadroomConfiguration{ MemoryUtilBasedConfiguration: &memoryheadroom.MemoryUtilBasedConfiguration{ - CacheBasedRatio: 0.5, + CacheBasedRatio: 0.5, + RequestBasedRatio: 0.1, }, }, essentials: types.ResourceEssentials{ @@ -263,7 +265,8 @@ func TestPolicyNUMAAware(t *testing.T) { }, memoryHeadroomConfiguration: &memoryheadroom.MemoryHeadroomConfiguration{ MemoryUtilBasedConfiguration: &memoryheadroom.MemoryUtilBasedConfiguration{ - CacheBasedRatio: 0.5, + CacheBasedRatio: 0.5, + RequestBasedRatio: 0.1, }, }, essentials: types.ResourceEssentials{ @@ -300,8 +303,9 @@ func TestPolicyNUMAAware(t *testing.T) { }, memoryHeadroomConfiguration: &memoryheadroom.MemoryHeadroomConfiguration{ MemoryUtilBasedConfiguration: &memoryheadroom.MemoryUtilBasedConfiguration{ - CacheBasedRatio: 0.5, - MaxOversoldRate: 1.5, + CacheBasedRatio: 0.5, + 
RequestBasedRatio: 0.1, + MaxOversoldRate: 1.5, }, }, setFakeMetric: func(store *metric.FakeMetricsFetcher) { @@ -334,8 +338,9 @@ func TestPolicyNUMAAware(t *testing.T) { }, memoryHeadroomConfiguration: &memoryheadroom.MemoryHeadroomConfiguration{ MemoryUtilBasedConfiguration: &memoryheadroom.MemoryUtilBasedConfiguration{ - CacheBasedRatio: 0.5, - MaxOversoldRate: 0, + CacheBasedRatio: 0.5, + RequestBasedRatio: 0.1, + MaxOversoldRate: 0, }, }, setFakeMetric: func(store *metric.FakeMetricsFetcher) { @@ -397,8 +402,9 @@ func TestPolicyNUMAAware(t *testing.T) { }, memoryHeadroomConfiguration: &memoryheadroom.MemoryHeadroomConfiguration{ MemoryUtilBasedConfiguration: &memoryheadroom.MemoryUtilBasedConfiguration{ - CacheBasedRatio: 0, - MaxOversoldRate: 2, + CacheBasedRatio: 0, + RequestBasedRatio: 0.1, + MaxOversoldRate: 2, }, }, setFakeMetric: func(store *metric.FakeMetricsFetcher) { @@ -415,10 +421,10 @@ func TestPolicyNUMAAware(t *testing.T) { }, }, wantErr: false, - want: resource.MustParse("180Gi"), + want: resource.MustParse("175Gi"), wantNUMA: map[int]resource.Quantity{ 0: resource.MustParse("100Gi"), - 1: resource.MustParse("80Gi"), + 1: resource.MustParse("75Gi"), }, }, } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go index 99c9ff5d1f..6935b329da 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go @@ -44,6 +44,9 @@ const ( reclaimMemoryUnlimited = -1 defaultProcZoneInfoFile = "/proc/zoneinfo" + + // Factor for calculating memory.high based on memory.max, available for cgroup v2 only + memoryHighScaleFactor = 0.95 ) type memoryGuard struct { @@ -110,11 +113,19 @@ func (mg *memoryGuard) GetAdvices() types.InternalMemoryCalculationResult { general.Errorf("failed to get last reconcile result") return types.InternalMemoryCalculationResult{} } + memoryMax := mg.reclaimMemoryLimit.Load() + memoryHigh := int64(float64(memoryMax) * memoryHighScaleFactor) + if memoryMax == reclaimMemoryUnlimited { + memoryHigh = reclaimMemoryUnlimited + } result := types.InternalMemoryCalculationResult{ ExtraEntries: []types.ExtraMemoryAdvices{ { CgroupPath: mg.reclaimRelativeRootCgroupPath, - Values: map[string]string{string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.FormatInt(mg.reclaimMemoryLimit.Load(), 10)}, + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.FormatInt(memoryMax, 10), + string(memoryadvisor.ControlKnobKeyMemoryHigh): strconv.FormatInt(memoryHigh, 10), + }, }, }, } @@ -127,9 +138,17 @@ func (mg *memoryGuard) GetAdvices() types.InternalMemoryCalculationResult { continue } + numaMemoryMax := numaBindingReclaimMemoryLimit[numaID] + numaMemoryHigh := int64(float64(numaMemoryMax) * memoryHighScaleFactor) + if numaMemoryMax == reclaimMemoryUnlimited { + numaMemoryHigh = reclaimMemoryUnlimited + } result.ExtraEntries = append(result.ExtraEntries, types.ExtraMemoryAdvices{ CgroupPath: cgroupPath, - Values: map[string]string{string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.FormatInt(numaBindingReclaimMemoryLimit[numaID], 10)}, + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.FormatInt(numaMemoryMax, 10), + string(memoryadvisor.ControlKnobKeyMemoryHigh): strconv.FormatInt(numaMemoryHigh, 10), + }, }) } } diff --git 
a/pkg/config/agent/dynamic/adminqos/reclaimedresource/memoryheadroom/util_based.go b/pkg/config/agent/dynamic/adminqos/reclaimedresource/memoryheadroom/util_based.go index 16dbb76cbe..c95a919a04 100644 --- a/pkg/config/agent/dynamic/adminqos/reclaimedresource/memoryheadroom/util_based.go +++ b/pkg/config/agent/dynamic/adminqos/reclaimedresource/memoryheadroom/util_based.go @@ -23,6 +23,7 @@ type MemoryUtilBasedConfiguration struct { FreeBasedRatio float64 StaticBasedCapacity float64 CacheBasedRatio float64 + RequestBasedRatio float64 MaxOversoldRate float64 } @@ -52,6 +53,10 @@ func (c *MemoryUtilBasedConfiguration) ApplyConfiguration(conf *crd.DynamicConfi c.CacheBasedRatio = *config.CacheBasedRatio } + if config.RequestBasedRatio != nil { + c.RequestBasedRatio = *config.RequestBasedRatio + } + if config.MaxOversoldRate != nil { c.MaxOversoldRate = *config.MaxOversoldRate } diff --git a/pkg/config/agent/qrm/cpu_plugin.go b/pkg/config/agent/qrm/cpu_plugin.go index 6d8269b5c9..92026f5064 100644 --- a/pkg/config/agent/qrm/cpu_plugin.go +++ b/pkg/config/agent/qrm/cpu_plugin.go @@ -69,6 +69,8 @@ type CPUDynamicPolicyConfig struct { EnableDefaultDedicatedCoresCPUBurst bool // EnableDefaultSharedCoresCPUBurst indicates whether to enable cpu burst for shared cores by default EnableDefaultSharedCoresCPUBurst bool + // EnableCPUBurstForMainContainerOnly indicates whether cpu burst is only enabled for the main container + EnableCPUBurstForMainContainerOnly bool *hintoptimizer.HintOptimizerConfiguration *irqtuner.IRQTunerConfiguration diff --git a/pkg/config/agent/qrm/mb_plugin.go b/pkg/config/agent/qrm/mb_plugin.go index bb2472504c..9df6de6f07 100644 --- a/pkg/config/agent/qrm/mb_plugin.go +++ b/pkg/config/agent/qrm/mb_plugin.go @@ -38,6 +38,7 @@ type MBQRMPluginConfig struct { ResetResctrlOnly bool LocalIsVictimAndTotalIsAllRead bool + ExtraGroupPriorities map[string]int } func NewMBQRMPluginConfig() *MBQRMPluginConfig { diff --git a/pkg/config/agent/qrm/sriov_plugin.go b/pkg/config/agent/qrm/sriov_plugin.go index f413769e7b..65c6431162 100644 --- a/pkg/config/agent/qrm/sriov_plugin.go +++ b/pkg/config/agent/qrm/sriov_plugin.go @@ -41,6 +41,7 @@ type SriovStaticPolicyConfig struct { } type SriovDynamicPolicyConfig struct { + PodRequiredAnnotations map[string]string LargeSizeVFQueueCount int LargeSizeVFCPUThreshold int LargeSizeVFFailOnExhaustion bool diff --git a/pkg/config/agent/reporter/reporter_base.go b/pkg/config/agent/reporter/reporter_base.go index 99223c1a38..0a8724a14d 100644 --- a/pkg/config/agent/reporter/reporter_base.go +++ b/pkg/config/agent/reporter/reporter_base.go @@ -30,6 +30,9 @@ type GenericReporterConfiguration struct { // first item for a particular name wins InnerPlugins []string + // AgentReporters is the list of agent reporters to enable or disable + AgentReporters []string + RefreshLatestCNRPeriod time.Duration // DefaultCNRLabels is the labels for CNR created by reporter diff --git a/pkg/config/controller/lifecycle.go b/pkg/config/controller/lifecycle.go index da8491819b..172530e807 100644 --- a/pkg/config/controller/lifecycle.go +++ b/pkg/config/controller/lifecycle.go @@ -50,6 +50,7 @@ type HealthzConfig struct { type LifeCycleConfig struct { EnableHealthz bool EnableCNCLifecycle bool + EnableCNRLifecycle bool *CNRLifecycleConfig *CNCLifecycleConfig @@ -60,6 +61,7 @@ func NewLifeCycleConfig() *LifeCycleConfig { return &LifeCycleConfig{ EnableHealthz: false, EnableCNCLifecycle: true, + EnableCNRLifecycle: true, CNRLifecycleConfig: &CNRLifecycleConfig{}, 
CNCLifecycleConfig: &CNCLifecycleConfig{}, HealthzConfig: &HealthzConfig{}, diff --git a/pkg/controller/vpa/util/api.go b/pkg/controller/vpa/util/api.go index b221593019..95872c37dc 100644 --- a/pkg/controller/vpa/util/api.go +++ b/pkg/controller/vpa/util/api.go @@ -42,6 +42,8 @@ const ( VPAConditionReasonCalculatedIllegal = "Illegal" VPAConditionReasonPodSpecUpdated = "PodSpecUpdated" VPAConditionReasonPodSpecNoUpdate = "PodSpecNoUpdate" + VPAConditionReasonNoPodsMatched = "NoPodsMatched" + VPAConditionReasonPodsMatched = "PodsMatched" ) // SetVPAConditions is used to set conditions for vpa in local vpa diff --git a/pkg/controller/vpa/vpa.go b/pkg/controller/vpa/vpa.go index 517423be59..4b2cd934e9 100644 --- a/pkg/controller/vpa/vpa.go +++ b/pkg/controller/vpa/vpa.go @@ -371,12 +371,18 @@ func (vc *VPAController) syncVPA(key string) error { workloadLister, ok := vc.workloadLister[gvk] if !ok { klog.Errorf("[vpa] vpa %s/%s without workload lister %v", namespace, name, gvk) + // Set NoPodsMatched to True when the targetRef kind is unsupported + _ = util.UpdateVPAConditions(vc.ctx, vc.vpaUpdater, vpa, apis.NoPodsMatched, core.ConditionTrue, util.VPAConditionReasonNoPodsMatched, fmt.Sprintf("Unsupported targetRef kind: %v", gvk)) return nil } workloadObj, err := katalystutil.GetWorkloadForVPA(vpa, workloadLister) if err != nil { klog.Errorf("[vpa] vpa %s/%s get workload error: %v", namespace, name, err) + if errors.IsNotFound(err) { + // Set NoPodsMatched to True when the workload object cannot be found + _ = util.UpdateVPAConditions(vc.ctx, vc.vpaUpdater, vpa, apis.NoPodsMatched, core.ConditionTrue, util.VPAConditionReasonNoPodsMatched, fmt.Sprintf("TargetRef workload not found: %s %s", gvk.Kind, vpa.Spec.TargetRef.Name)) + } return err } @@ -408,8 +414,25 @@ func (vc *VPAController) syncVPA(key string) error { if err != nil { klog.Errorf("[vpa] failed to get pods by vpa %s, err %v", vpa.Name, err) _ = util.UpdateVPAConditions(vc.ctx, vc.vpaUpdater, vpa, apis.RecommendationUpdated, core.ConditionFalse, util.VPAConditionReasonCalculatedIllegal, "failed to find pods") + if errors.IsNotFound(err) { + // Set NoPodsMatched to True when the workload object cannot be found during pod list retrieval + _ = util.UpdateVPAConditions(vc.ctx, vc.vpaUpdater, vpa, apis.NoPodsMatched, core.ConditionTrue, util.VPAConditionReasonNoPodsMatched, fmt.Sprintf("TargetRef workload not found: %s %s", gvk.Kind, vpa.Spec.TargetRef.Name)) + } return err } + + if len(pods) == 0 { + message := "No pods match this VPA object" + if selector, err := native.GetUnstructuredSelector(workload); err == nil && selector != nil { + message = fmt.Sprintf("No pods match this VPA object, label selector: %s", selector.String()) + } + // Set NoPodsMatched to True when no pods match the targetRef workload + _ = util.UpdateVPAConditions(vc.ctx, vc.vpaUpdater, vpa, apis.NoPodsMatched, core.ConditionTrue, util.VPAConditionReasonNoPodsMatched, message) + } else { + // Set NoPodsMatched to False when there are pods matching the targetRef workload + _ = util.UpdateVPAConditions(vc.ctx, vc.vpaUpdater, vpa, apis.NoPodsMatched, core.ConditionFalse, util.VPAConditionReasonPodsMatched, "Pods match this VPA object") + } + klog.V(4).Infof("[vpa] syncing vpa %s with %d pods", name, len(pods)) _ = vc.metricsEmitter.StoreInt64(metricNameVAPControlVPAPodCount, int64(len(pods)), metrics.MetricTypeNameRaw, tags...) 
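As a reading aid (not part of the patch), the sketch below condenses the condition logic the syncVPA changes above introduce: the NoPodsMatched condition is set to True when the targetRef kind is unsupported, the workload cannot be found, or the workload matches no pods, and flipped back to False once pods are found. The helper name decideNoPodsMatched and its plain-string selector parameter are hypothetical simplifications; the reason constants and messages are taken from the diff.

package main

import (
	"fmt"

	core "k8s.io/api/core/v1"
)

const (
	// Reason strings added to pkg/controller/vpa/util/api.go in this change.
	reasonNoPodsMatched = "NoPodsMatched" // util.VPAConditionReasonNoPodsMatched
	reasonPodsMatched   = "PodsMatched"   // util.VPAConditionReasonPodsMatched
)

// decideNoPodsMatched returns the status, reason and message for the
// NoPodsMatched condition given the number of matched pods and an optional
// label-selector string (empty when the selector could not be resolved).
func decideNoPodsMatched(podCount int, selectorStr string) (core.ConditionStatus, string, string) {
	if podCount == 0 {
		msg := "No pods match this VPA object"
		if selectorStr != "" {
			msg = fmt.Sprintf("No pods match this VPA object, label selector: %s", selectorStr)
		}
		return core.ConditionTrue, reasonNoPodsMatched, msg
	}
	return core.ConditionFalse, reasonPodsMatched, "Pods match this VPA object"
}

func main() {
	status, reason, msg := decideNoPodsMatched(0, "app=web")
	fmt.Println(status, reason, msg) // True NoPodsMatched No pods match this VPA object, label selector: app=web

	status, reason, msg = decideNoPodsMatched(3, "app=web")
	fmt.Println(status, reason, msg) // False PodsMatched Pods match this VPA object
}

Writing the condition on both branches (True when the pod list is empty, False once pods exist) keeps the status from going stale after a previously empty selector starts matching pods, which is why the else branch above records PodsMatched instead of leaving the condition untouched.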
diff --git a/pkg/controller/vpa/vpa_status.go b/pkg/controller/vpa/vpa_status.go
index af001474ac..5d5fb3e505 100644
--- a/pkg/controller/vpa/vpa_status.go
+++ b/pkg/controller/vpa/vpa_status.go
@@ -325,7 +325,14 @@ func (vs *vpaStatusController) syncVPA(key string) error {
 // are updated to the expected resources in their annotations
 func (vs *vpaStatusController) setRecommendationAppliedCondition(vpa *apis.KatalystVerticalPodAutoscaler, pods []*v1.Pod) error {
 	failedCount := 0
+	activePodCount := 0
 	for _, pod := range pods {
+		if !native.PodIsActive(pod) {
+			klog.V(6).Infof("[vpa-status] pod %s is not active, skip", native.GenerateUniqObjectNameKey(pod))
+			continue
+		}
+
+		activePodCount += 1
 		if !katalystutil.CheckPodSpecUpdated(pod) {
 			failedCount += 1
 		}
@@ -337,7 +344,7 @@ func (vs *vpaStatusController) setRecommendationAppliedCondition(vpa *apis.Katal
 			return err
 		}
 	} else {
-		msg := fmt.Sprintf("failed to update %d pods, total %d pods", failedCount, len(pods))
+		msg := fmt.Sprintf("failed to update %d pods, active %d pods, total %d pods", failedCount, activePodCount, len(pods))
 		err := util.SetVPAConditions(vpa, apis.RecommendationApplied, v1.ConditionFalse, util.VPAConditionReasonPodSpecNoUpdate, msg)
 		if err != nil {
 			return err
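
setRecommendationAppliedCondition now ignores inactive pods when deciding whether a recommendation has been applied, and the failure message reports failed/active/total counts. A simplified self-contained sketch of just the counting; the booleans stand in for native.PodIsActive and katalystutil.CheckPodSpecUpdated:

package main

import "fmt"

// pod is a stand-in for *v1.Pod: active replaces native.PodIsActive and
// updated replaces katalystutil.CheckPodSpecUpdated.
type pod struct {
	active  bool
	updated bool
}

// appliedMessage reproduces the counting introduced above: inactive pods are
// skipped entirely, and failures are reported against active and total counts.
func appliedMessage(pods []pod) string {
	failed, active := 0, 0
	for _, p := range pods {
		if !p.active {
			continue
		}
		active++
		if !p.updated {
			failed++
		}
	}
	if failed == 0 {
		return "" // the condition becomes PodSpecUpdated with an empty message
	}
	return fmt.Sprintf("failed to update %d pods, active %d pods, total %d pods", failed, active, len(pods))
}

func main() {
	pods := []pod{{active: true}, {active: false}, {active: true, updated: true}}
	fmt.Println(appliedMessage(pods)) // failed to update 1 pods, active 2 pods, total 3 pods
}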
diff --git a/pkg/controller/vpa/vpa_status_test.go b/pkg/controller/vpa/vpa_status_test.go
new file mode 100644
index 0000000000..797f0dbbc4
--- /dev/null
+++ b/pkg/controller/vpa/vpa_status_test.go
@@ -0,0 +1,236 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vpa
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	apis "github.com/kubewharf/katalyst-api/pkg/apis/autoscaling/v1alpha1"
+	apiconsts "github.com/kubewharf/katalyst-api/pkg/consts"
+	"github.com/kubewharf/katalyst-core/pkg/controller/vpa/util"
+)
+
+func TestSetRecommendationAppliedCondition(t *testing.T) {
+	t.Parallel()
+	vs := &vpaStatusController{}
+
+	for _, tc := range []struct {
+		name            string
+		pods            []*v1.Pod
+		expectedStatus  v1.ConditionStatus
+		expectedReason  string
+		expectedMessage string
+	}{
+		{
+			name: "all pods updated",
+			pods: []*v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							apiconsts.PodAnnotationInplaceUpdateResourcesKey: `{"c1":{"requests":{"cpu":"1","memory":"1Gi"}}}`,
+						},
+					},
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+					},
+				},
+			},
+			expectedStatus:  v1.ConditionTrue,
+			expectedReason:  util.VPAConditionReasonPodSpecUpdated,
+			expectedMessage: "",
+		},
+		{
+			name: "skip inactive pods",
+			pods: []*v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							// This pod is not updated, but it is inactive so it should be skipped
+						},
+					},
+					Status: v1.PodStatus{
+						Phase: v1.PodSucceeded,
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							apiconsts.PodAnnotationInplaceUpdateResourcesKey: `{"c1":{"requests":{"cpu":"1","memory":"1Gi"}}}`,
+						},
+					},
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+					},
+				},
+			},
+			expectedStatus:  v1.ConditionTrue,
+			expectedReason:  util.VPAConditionReasonPodSpecUpdated,
+			expectedMessage: "",
+		},
+		{
+			name: "failed to update pods with inactive ones",
+			pods: []*v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							apiconsts.PodAnnotationInplaceUpdateResourcesKey: `{"c1":{"requests":{"cpu":"1","memory":"1Gi"}}}`,
+						},
+					},
+					Spec: v1.PodSpec{
+						Containers: []v1.Container{
+							{
+								Name: "c1",
+								Resources: v1.ResourceRequirements{
+									Requests: v1.ResourceList{
+										// Different from annotation to make CheckPodSpecUpdated return false
+									},
+								},
+							},
+						},
+					},
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							apiconsts.PodAnnotationInplaceUpdateResourcesKey: `{"c1":{"requests":{"cpu":"1","memory":"1Gi"}}}`,
+						},
+					},
+					Spec: v1.PodSpec{
+						Containers: []v1.Container{
+							{
+								Name: "c1",
+								Resources: v1.ResourceRequirements{
+									Requests: v1.ResourceList{},
+								},
+							},
+						},
+					},
+					Status: v1.PodStatus{
+						Phase: v1.PodFailed,
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							apiconsts.PodAnnotationInplaceUpdateResourcesKey: `{"c1":{"requests":{"cpu":"1","memory":"1Gi"}}}`,
+						},
+						DeletionTimestamp: &metav1.Time{},
+					},
+					Spec: v1.PodSpec{
+						Containers: []v1.Container{
+							{
+								Name: "c1",
+								Resources: v1.ResourceRequirements{
+									Requests: v1.ResourceList{},
+								},
+							},
+						},
+					},
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+						ContainerStatuses: []v1.ContainerStatus{
+							{
+								State: v1.ContainerState{
+									Terminated: &v1.ContainerStateTerminated{},
+								},
+							},
+						},
+					},
+				},
+			},
+			expectedStatus:  v1.ConditionFalse,
+			expectedReason:  util.VPAConditionReasonPodSpecNoUpdate,
+			expectedMessage: "failed to update 1 pods, active 1 pods, total 3 pods",
+		},
+		{
+			name: "all active pods failed to update",
+			pods: []*v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							apiconsts.PodAnnotationInplaceUpdateResourcesKey: `{"c1":{"requests":{"cpu":"1","memory":"1Gi"}}}`,
+						},
+					},
+					Spec: v1.PodSpec{
+						Containers: []v1.Container{
+							{
+								Name: "c1",
+								Resources: v1.ResourceRequirements{
+									Requests: v1.ResourceList{
+										// Different from annotation to make CheckPodSpecUpdated return false
+									},
+								},
+							},
+						},
+					},
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							apiconsts.PodAnnotationInplaceUpdateResourcesKey: `{"c1":{"requests":{"cpu":"1","memory":"1Gi"}}}`,
+						},
+					},
+					Spec: v1.PodSpec{
+						Containers: []v1.Container{
+							{
+								Name: "c1",
+								Resources: v1.ResourceRequirements{
+									Requests: v1.ResourceList{
+										// Different from annotation to make CheckPodSpecUpdated return false
+									},
+								},
+							},
+						},
+					},
+					Status: v1.PodStatus{
+						Phase: v1.PodPending,
+					},
+				},
+			},
+			expectedStatus:  v1.ConditionFalse,
+			expectedReason:  util.VPAConditionReasonPodSpecNoUpdate,
+			expectedMessage: "failed to update 2 pods, active 2 pods, total 2 pods",
+		},
+	} {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+			vpa := &apis.KatalystVerticalPodAutoscaler{
+				Status: apis.KatalystVerticalPodAutoscalerStatus{},
+			}
+
+			err := vs.setRecommendationAppliedCondition(vpa, tc.pods)
+			assert.NoError(t, err)
+
+			assert.Len(t, vpa.Status.Conditions, 1)
+			assert.Equal(t, tc.expectedStatus, vpa.Status.Conditions[0].Status)
+			assert.Equal(t, tc.expectedReason, vpa.Status.Conditions[0].Reason)
+			assert.Equal(t, tc.expectedMessage, vpa.Status.Conditions[0].Message)
+			assert.Equal(t, apis.RecommendationApplied, vpa.Status.Conditions[0].Type)
+		})
+	}
+}
diff --git a/pkg/controller/vpa/vpa_test.go b/pkg/controller/vpa/vpa_test.go
index 9dd7b04f6a..ef6507a6d4 100644
--- a/pkg/controller/vpa/vpa_test.go
+++ b/pkg/controller/vpa/vpa_test.go
@@ -132,6 +132,12 @@ func TestVPAControllerSyncVPA(t *testing.T) {
 				Status: v1.ConditionTrue,
 				Reason: util.VPAConditionReasonUpdated,
 			},
+			{
+				Type:    apis.NoPodsMatched,
+				Status:  v1.ConditionFalse,
+				Reason:  util.VPAConditionReasonPodsMatched,
+				Message: "Pods match this VPA object",
+			},
 		},
 	},
 }
@@ -186,6 +192,12 @@ func TestVPAControllerSyncVPA(t *testing.T) {
 				Status: v1.ConditionTrue,
 				Reason: util.VPAConditionReasonUpdated,
 			},
+			{
+				Type:    apis.NoPodsMatched,
+				Status:  v1.ConditionFalse,
+				Reason:  util.VPAConditionReasonPodsMatched,
+				Message: "Pods match this VPA object",
+			},
 		},
 	},
 }
@@ -230,12 +242,51 @@ func TestVPAControllerSyncVPA(t *testing.T) {
 				Status: v1.ConditionTrue,
 				Reason: util.VPAConditionReasonUpdated,
 			},
+			{
+				Type:    apis.NoPodsMatched,
+				Status:  v1.ConditionFalse,
+				Reason:  util.VPAConditionReasonPodsMatched,
+				Message: "Pods match this VPA object",
+			},
 		},
 	},
 }

 	newPod3 := pod1.DeepCopy()

+	vpa4 := vpa3.DeepCopy()
+	vpaNew4 := vpaNew3.DeepCopy()
+	vpaNew4.Status.Conditions = []apis.VerticalPodAutoscalerCondition{
+		{
+			Type:   apis.RecommendationUpdated,
+			Status: v1.ConditionTrue,
+			Reason: util.VPAConditionReasonUpdated,
+		},
+		{
+			Type:    apis.NoPodsMatched,
+			Status:  v1.ConditionTrue,
+			Reason:  util.VPAConditionReasonNoPodsMatched,
+			Message: "No pods match this VPA object, label selector: workload=sts1",
+		},
+	}
+
+	vpa5 := vpa3.DeepCopy()
+	vpa5.Spec.TargetRef.Name = "sts2"
+	vpaNew5 := vpa5.DeepCopy()
+	vpaNew5.Status.Conditions = []apis.VerticalPodAutoscalerCondition{
+		{
+			Type:   apis.RecommendationUpdated,
+			Status: v1.ConditionTrue,
+			Reason: util.VPAConditionReasonUpdated,
+		},
+		{
+			Type:    apis.NoPodsMatched,
+			Status:  v1.ConditionTrue,
+			Reason:  util.VPAConditionReasonNoPodsMatched,
+			Message: "TargetRef workload not found: StatefulSet sts2",
+		},
+	}
+
 	for _, tc := range []struct {
 		name    string
 		object  runtime.Object
@@ -243,6 +294,7 @@ func TestVPAControllerSyncVPA(t *testing.T) {
 		newPods []*v1.Pod
 		vpa     *apis.KatalystVerticalPodAutoscaler
 		vpaNew  *apis.KatalystVerticalPodAutoscaler
+		wantErr bool
 	}{
 		{
 			name: "delete owner reference",
@@ -337,6 +389,61 @@ func TestVPAControllerSyncVPA(t *testing.T) {
 				newPod3,
 			},
 		},
+		{
+			name:   "no pods matched with valid selector",
+			vpa:    vpa4,
+			vpaNew: vpaNew4,
+			object: &appsv1.StatefulSet{
+				TypeMeta: metav1.TypeMeta{
+					Kind:       "StatefulSet",
+					APIVersion: "apps/v1",
+				},
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "sts1",
+					Namespace: "default",
+					Annotations: map[string]string{
+						apiconsts.WorkloadAnnotationVPAEnabledKey: apiconsts.WorkloadAnnotationVPAEnabled,
+					},
+				},
+				Spec: appsv1.StatefulSetSpec{
+					Selector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"workload": "sts1",
+						},
+					},
+				},
+			},
+			pods:    []*v1.Pod{},
+			newPods: []*v1.Pod{},
+		},
+		{
+			name:    "target ref workload not found",
+			vpa:     vpa5,
+			vpaNew:  vpaNew5,
+			wantErr: true,
+			object: &appsv1.StatefulSet{
+				TypeMeta: metav1.TypeMeta{
+					Kind:       "StatefulSet",
+					APIVersion: "apps/v1",
+				},
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "sts1",
+					Namespace: "default",
+					Annotations: map[string]string{
+						apiconsts.WorkloadAnnotationVPAEnabledKey: apiconsts.WorkloadAnnotationVPAEnabled,
+					},
+				},
+				Spec: appsv1.StatefulSetSpec{
+					Selector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"workload": "sts1",
+						},
+					},
+				},
+			},
+			pods:    []*v1.Pod{},
+			newPods: []*v1.Pod{},
+		},
 	} {
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
@@ -380,7 +487,11 @@ func TestVPAControllerSyncVPA(t *testing.T) {
 			assert.True(t, synced)

 			err = vpaController.syncVPA(key)
-			assert.NoError(t, err)
+			if tc.wantErr {
+				assert.Error(t, err)
+			} else {
+				assert.NoError(t, err)
+			}

 			for _, pod := range tc.newPods {
 				p, err := controlCtx.Client.KubeClient.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{})
@@ -392,6 +503,19 @@ func TestVPAControllerSyncVPA(t *testing.T) {
 				Get(context.TODO(), tc.vpaNew.Name, metav1.GetOptions{})
 			assert.NoError(t, err)
 			assert.Equal(t, tc.vpaNew.GetObjectMeta(), vpa.GetObjectMeta())
+
+			for _, expectedCond := range tc.vpaNew.Status.Conditions {
+				found := false
+				for _, actCond := range vpa.Status.Conditions {
+					if actCond.Type == expectedCond.Type {
+						found = true
+						assert.Equal(t, expectedCond.Status, actCond.Status)
+						assert.Equal(t, expectedCond.Reason, actCond.Reason)
+						assert.Equal(t, expectedCond.Message, actCond.Message)
+					}
+				}
+				assert.True(t, found, "condition %s not found", expectedCond.Type)
+			}
 		})
 	}
 }
diff --git a/pkg/util/cgroup/common/types.go b/pkg/util/cgroup/common/types.go
index d92f4cef10..a681afd55e 100644
--- a/pkg/util/cgroup/common/types.go
+++ b/pkg/util/cgroup/common/types.go
@@ -73,7 +73,9 @@ type MemoryData struct {
 	// MinInBytes for memory.min
 	// cgroup memory that can never be reclaimed by kswapd.
 	MinInBytes int64
-	WmarkRatio int32
+	// HighInBytes for memory.high
+	HighInBytes int64
+	WmarkRatio  int32
 	// SwapMaxInBytes < 0 means disable cgroup-level swap
 	SwapMaxInBytes int64
 }
diff --git a/pkg/util/cgroup/manager/v2/fs_linux.go b/pkg/util/cgroup/manager/v2/fs_linux.go
index 0ac25b2831..f614a9f97a 100644
--- a/pkg/util/cgroup/manager/v2/fs_linux.go
+++ b/pkg/util/cgroup/manager/v2/fs_linux.go
@@ -73,6 +73,18 @@ func (m *manager) ApplyMemory(absCgroupPath string, data *common.MemoryData) err
 		}
 	}

+	if data.HighInBytes != 0 {
+		memoryHighValue := "max"
+		if data.HighInBytes > 0 {
+			memoryHighValue = numToStr(data.HighInBytes)
+		}
+		if err, applied, oldData := common.InstrumentedWriteFileIfChange(absCgroupPath, "memory.high", memoryHighValue); err != nil {
+			return err
+		} else if applied {
+			klog.Infof("[CgroupV2] apply memory high successfully, cgroupPath: %s, data: %v, old data: %v\n", absCgroupPath, memoryHighValue, oldData)
+		}
+	}
+
 	if data.WmarkRatio != 0 {
 		newRatio := fmt.Sprintf("%d", data.WmarkRatio)
 		if err, applied, oldData := common.InstrumentedWriteFileIfChange(absCgroupPath, "memory.wmark_ratio", newRatio); err != nil {
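
The new memory.high branch in ApplyMemory treats HighInBytes as tri-state: 0 skips the write, a negative value writes the literal "max" (no high limit), and a positive value writes that byte count. A tiny stand-alone sketch of just that value mapping; the helper name is illustrative and not part of the manager:

package main

import "fmt"

// memoryHighValue mirrors the branch added to ApplyMemory above: 0 means
// "do not touch memory.high", a negative value writes the literal "max",
// and a positive value writes the byte count.
func memoryHighValue(highInBytes int64) (value string, write bool) {
	if highInBytes == 0 {
		return "", false
	}
	if highInBytes < 0 {
		return "max", true
	}
	return fmt.Sprintf("%d", highInBytes), true
}

func main() {
	fmt.Println(memoryHighValue(0))       // "" false -> write skipped
	fmt.Println(memoryHighValue(-1))      // "max" true
	fmt.Println(memoryHighValue(6 << 30)) // "6442450944" true
}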