From 671f1fde9d45d9cc854e9d3fd65b4a943cc14f78 Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Wed, 4 Mar 2026 14:07:42 -0800 Subject: [PATCH 1/3] test: minor fix and add a test (#480) --- pkg/controllers/updaterun/initialization.go | 5 +- .../initialization_integration_test.go | 75 ++++++++++++++++++- pkg/controllers/updaterun/suite_test.go | 15 ++-- 3 files changed, 83 insertions(+), 12 deletions(-) diff --git a/pkg/controllers/updaterun/initialization.go b/pkg/controllers/updaterun/initialization.go index a977c7de2..f77c5f2a9 100644 --- a/pkg/controllers/updaterun/initialization.go +++ b/pkg/controllers/updaterun/initialization.go @@ -623,9 +623,8 @@ func (r *Reconciler) getResourceSnapshotObjs(ctx context.Context, placement plac } if latestResourceSnapshot == nil { - err := fmt.Errorf("no resource snapshot created for placement `%s`", placementKey) - klog.ErrorS(err, "Failed to create resource snapshot", "placement", placementKey, "updateRun", updateRunRef) - return nil, err + err := fmt.Errorf("no resource snapshot created for placement `%s` but there is no error returned by getOrCreate", placementKey) + return nil, controller.NewUnexpectedBehaviorError(err) } // Return the master snapshot directly rather than listing from the cache, because diff --git a/pkg/controllers/updaterun/initialization_integration_test.go b/pkg/controllers/updaterun/initialization_integration_test.go index 41ff96c09..e285da5ff 100644 --- a/pkg/controllers/updaterun/initialization_integration_test.go +++ b/pkg/controllers/updaterun/initialization_integration_test.go @@ -36,6 +36,7 @@ import ( placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/utils" "github.com/kubefleet-dev/kubefleet/pkg/utils/condition" + "github.com/kubefleet-dev/kubefleet/pkg/utils/resource" ) var ( @@ -49,6 +50,7 @@ var ( const ( regionEastus = "eastus" regionWestus = "westus" + falseString = "false" ) var _ = Describe("Updaterun initialization tests", func() { @@ -946,12 +948,12 @@ var _ = Describe("Updaterun initialization tests", func() { It("Should create a new resource snapshot at next index even when multiple pre-existing snapshots exist and no index specified", func() { By("Creating a pre-existing resource snapshot at index 0") - resourceSnapshot.Labels[placementv1beta1.IsLatestSnapshotLabel] = "false" + resourceSnapshot.Labels[placementv1beta1.IsLatestSnapshotLabel] = falseString Expect(k8sClient.Create(ctx, resourceSnapshot)).To(Succeed()) By("Creating a another pre-existing resource snapshot at index 1") resourceSnapshot2.Name = testCRPName + "-1-snapshot" - resourceSnapshot2.Labels[placementv1beta1.IsLatestSnapshotLabel] = "false" + resourceSnapshot2.Labels[placementv1beta1.IsLatestSnapshotLabel] = falseString resourceSnapshot2.Labels[placementv1beta1.ResourceIndexLabel] = "1" Expect(k8sClient.Create(ctx, resourceSnapshot2)).To(Succeed()) @@ -1006,6 +1008,75 @@ var _ = Describe("Updaterun initialization tests", func() { By("Checking update run status metrics are emitted") validateUpdateRunMetricsEmitted(generateInitializationSucceededMetric(placementv1beta1.StateInitialize, updateRun)) }) + + It("Should not create a new resource snapshot when latest snapshot hash matches and no index specified", func() { + By("Computing the resource hash using the resource selector resolver") + _, selectedResources, _, err := resourceSelectorResolver.SelectResourcesForPlacement(crp) + Expect(err).NotTo(HaveOccurred()) + snapshotSpec := &placementv1beta1.ResourceSnapshotSpec{SelectedResources: selectedResources} + resourceHash, err := resource.HashOf(snapshotSpec) + Expect(err).NotTo(HaveOccurred()) + + By("Creating a pre-existing resource snapshot at index 0") + resourceSnapshot.Labels[placementv1beta1.IsLatestSnapshotLabel] = falseString + Expect(k8sClient.Create(ctx, resourceSnapshot)).To(Succeed()) + + By("Creating another pre-existing resource snapshot at index 1") + resourceSnapshot2.Name = testCRPName + "-1-snapshot" + resourceSnapshot2.Labels[placementv1beta1.IsLatestSnapshotLabel] = falseString + resourceSnapshot2.Labels[placementv1beta1.ResourceIndexLabel] = "1" + Expect(k8sClient.Create(ctx, resourceSnapshot2)).To(Succeed()) + + By("Creating a latest pre-existing resource snapshot at index 2 with matching hash") + resourceSnapshot3.Name = testCRPName + "-2-snapshot" + resourceSnapshot3.Labels[placementv1beta1.ResourceIndexLabel] = "2" + resourceSnapshot3.Annotations[placementv1beta1.ResourceGroupHashAnnotation] = resourceHash + resourceSnapshot3.Spec.SelectedResources = snapshotSpec.SelectedResources + Expect(k8sClient.Create(ctx, resourceSnapshot3)).To(Succeed()) + + By("Creating a new cluster resource override") + Expect(k8sClient.Create(ctx, clusterResourceOverride)).To(Succeed()) + + By("Creating a new clusterStagedUpdateRun without specifying resourceSnapshotIndex") + updateRun.Spec.ResourceSnapshotIndex = "" + Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) + + By("Validating the initialization succeeded and reused the existing resource snapshot at index 2") + Eventually(func() error { + if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil { + return err + } + initCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) + if initCond == nil { + return fmt.Errorf("initialization condition not found yet") + } + if !condition.IsConditionStatusTrue(initCond, updateRun.GetGeneration()) { + return fmt.Errorf("initialization condition status = %v, want True, message = %s", initCond.Status, initCond.Message) + } + if updateRun.Status.ResourceSnapshotIndexUsed != "2" { + return fmt.Errorf("resourceSnapshotIndexUsed = %s, want `2`", updateRun.Status.ResourceSnapshotIndexUsed) + } + return nil + }, timeout, interval).Should(Succeed(), "failed to validate initialization succeeded with existing snapshot") + + By("Validating the clusterStagedUpdateRun initialized consistently") + Consistently(func() error { + if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil { + return err + } + initCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) + if !condition.IsConditionStatusTrue(initCond, updateRun.GetGeneration()) { + return fmt.Errorf("initialization condition changed unexpectedly") + } + if updateRun.Status.ResourceSnapshotIndexUsed != "2" { + return fmt.Errorf("resourceSnapshotIndexUsed = %s, want `2`", updateRun.Status.ResourceSnapshotIndexUsed) + } + return nil + }, duration, interval).Should(Succeed(), "initialization should remain successful") + + By("Checking update run status metrics are emitted") + validateUpdateRunMetricsEmitted(generateInitializationSucceededMetric(placementv1beta1.StateInitialize, updateRun)) + }) }) }) diff --git a/pkg/controllers/updaterun/suite_test.go b/pkg/controllers/updaterun/suite_test.go index 22c16aa8b..c13088882 100644 --- a/pkg/controllers/updaterun/suite_test.go +++ b/pkg/controllers/updaterun/suite_test.go @@ -48,12 +48,13 @@ import ( ) var ( - cfg *rest.Config - mgr manager.Manager - testEnv *envtest.Environment - k8sClient client.Client - ctx context.Context - cancel context.CancelFunc + cfg *rest.Config + mgr manager.Manager + testEnv *envtest.Environment + k8sClient client.Client + ctx context.Context + cancel context.CancelFunc + resourceSelectorResolver controller.ResourceSelectorResolver ) const ( @@ -121,7 +122,7 @@ var _ = BeforeSuite(func() { }, nil) // Setup our main reconciler. - resourceSelectorResolver := controller.ResourceSelectorResolver{ + resourceSelectorResolver = controller.ResourceSelectorResolver{ RestMapper: mgr.GetRESTMapper(), InformerManager: dynamicInformerManager, ResourceConfig: utils.NewResourceConfig(false), From e68d9111e5f39094d4f31f71449925a1699776df Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:09:21 -0800 Subject: [PATCH 2/3] chore: bump step-security/harden-runner from 2.14.2 to 2.15.0 (#468) Bumps [step-security/harden-runner](https://github.com/step-security/harden-runner) from 2.14.2 to 2.15.0. - [Release notes](https://github.com/step-security/harden-runner/releases) - [Commits](https://github.com/step-security/harden-runner/compare/5ef0c079ce82195b2a36a210272d6b661572d83e...a90bcbc6539c36a85cdfeb73f7e2f433735f215b) --- updated-dependencies: - dependency-name: step-security/harden-runner dependency-version: 2.15.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Ryan Zhang --- .github/workflows/codespell.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 1bd5a15ae..7ebd6f413 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@5ef0c079ce82195b2a36a210272d6b661572d83e # v2.14.2 + uses: step-security/harden-runner@a90bcbc6539c36a85cdfeb73f7e2f433735f215b # v2.15.0 with: egress-policy: audit From 3d066d84d176b5acfa462ac96aa6f47dd8ae6cf0 Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Thu, 5 Mar 2026 06:24:39 +0800 Subject: [PATCH 3/3] feat: improve the default setup for hub agent leader election to allow better scalability/stability (#414) --- cmd/hubagent/main.go | 40 +++++++++-- cmd/hubagent/options/leaderelection.go | 90 ++++++++++++++++++++++++- cmd/hubagent/options/options_test.go | 82 +++++++++++++++++++--- cmd/hubagent/options/validation.go | 5 ++ cmd/hubagent/options/validation_test.go | 11 +++ 5 files changed, 208 insertions(+), 20 deletions(-) diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go index dfe7299ce..bdf9d9635 100644 --- a/cmd/hubagent/main.go +++ b/cmd/hubagent/main.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" "k8s.io/klog/v2" clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" ctrl "sigs.k8s.io/controller-runtime" @@ -101,8 +102,18 @@ func main() { // Set up controller-runtime logger ctrl.SetLogger(zap.New(zap.UseDevMode(true))) - config := ctrl.GetConfigOrDie() - config.QPS, config.Burst = float32(opts.CtrlMgrOpts.HubQPS), opts.CtrlMgrOpts.HubBurst + // Create separate configs for the general access purpose and the leader election purpose. + // + // This aims to improve the availability of the hub agent; originally all access to the API + // server would share the same rate limiting configuration, and under adverse conditions (e.g., + // large volume of concurrent placements) controllers would exhause all tokens in the rate limiter, + // and effectively starve the leader election process (the runtime can no longer renew leases), + // which would trigger the hub agent to restart even though the system remains functional. + defaultCfg := ctrl.GetConfigOrDie() + leaderElectionCfg := rest.CopyConfig(defaultCfg) + + defaultCfg.QPS, defaultCfg.Burst = float32(opts.CtrlMgrOpts.HubQPS), opts.CtrlMgrOpts.HubBurst + leaderElectionCfg.QPS, leaderElectionCfg.Burst = float32(opts.LeaderElectionOpts.LeaderElectionQPS), opts.LeaderElectionOpts.LeaderElectionBurst mgrOpts := ctrl.Options{ Scheme: scheme, @@ -110,12 +121,27 @@ func main() { SyncPeriod: &opts.CtrlMgrOpts.ResyncPeriod.Duration, DefaultTransform: cache.TransformStripManagedFields(), }, - LeaderElection: opts.LeaderElectionOpts.LeaderElect, - LeaderElectionID: "136224848560.hub.fleet.azure.com", - LeaderElectionNamespace: opts.LeaderElectionOpts.ResourceNamespace, + LeaderElection: opts.LeaderElectionOpts.LeaderElect, + LeaderElectionConfig: leaderElectionCfg, + // If leader election is enabled, the hub agent by default uses a setup + // with a lease duration of 60 secs, a renew deadline of 45 secs, and a retry period of 5 secs. + // This setup gives the hub agent up to 9 attempts/45 seconds to renew its leadership lease + // before it loses the leadership and restarts. + // + // These values are set significantly higher than the controller-runtime defaults + // (15 seconds, 10 seconds, and 2 seconds respectively), as under heavy loads the hub agent + // might have difficulty renewing its lease in time due to API server side latencies, which + // might further lead to unexpected leadership losses (even when it is the only candidate + // running) and restarts. + // + // Note (chenyu1): a minor side effect with the higher values is that when the agent does restart, + // (or in the future when we do run multiple hub agent replicas), the new leader might have to wait a bit + // longer (up to 60 seconds) to acquire the leadership, which should still be acceptable in most scenarios. LeaseDuration: &opts.LeaderElectionOpts.LeaseDuration.Duration, RenewDeadline: &opts.LeaderElectionOpts.RenewDeadline.Duration, RetryPeriod: &opts.LeaderElectionOpts.RetryPeriod.Duration, + LeaderElectionID: "136224848560.hub.fleet.azure.com", + LeaderElectionNamespace: opts.LeaderElectionOpts.ResourceNamespace, HealthProbeBindAddress: opts.CtrlMgrOpts.HealthProbeBindAddress, Metrics: metricsserver.Options{ BindAddress: opts.CtrlMgrOpts.MetricsBindAddress, @@ -128,7 +154,7 @@ func main() { if opts.CtrlMgrOpts.EnablePprof { mgrOpts.PprofBindAddress = fmt.Sprintf(":%d", opts.CtrlMgrOpts.PprofPort) } - mgr, err := ctrl.NewManager(config, mgrOpts) + mgr, err := ctrl.NewManager(defaultCfg, mgrOpts) if err != nil { klog.ErrorS(err, "unable to start controller manager.") exitWithErrorFunc() @@ -186,7 +212,7 @@ func main() { } ctx := ctrl.SetupSignalHandler() - if err := workload.SetupControllers(ctx, &wg, mgr, config, opts); err != nil { + if err := workload.SetupControllers(ctx, &wg, mgr, defaultCfg, opts); err != nil { klog.ErrorS(err, "unable to set up controllers") exitWithErrorFunc() } diff --git a/cmd/hubagent/options/leaderelection.go b/cmd/hubagent/options/leaderelection.go index a16237582..9c4ec271d 100644 --- a/cmd/hubagent/options/leaderelection.go +++ b/cmd/hubagent/options/leaderelection.go @@ -18,6 +18,8 @@ package options import ( "flag" + "fmt" + "strconv" "time" "github.com/kubefleet-dev/kubefleet/pkg/utils" @@ -52,6 +54,18 @@ type LeaderElectionOptions struct { // The namespace of the resource object that will be used to lock during leader election cycles. // This option only applies if leader election is enabled. ResourceNamespace string + + // The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager + // for leader election purposes. This sets up a separate client-side throttling mechanism specifically + // for lease related operations, mostly to avoid an adverse situation where normal operations + // in the controller manager starve the lease related operations, and thus cause unexpected leadership losses. + LeaderElectionQPS float64 + + // The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager + // for leader election purposes. This sets up a separate client-side throttling mechanism specifically + // for lease related operations, mostly to avoid an adverse situation where normal operations + // in the controller manager starve the lease related operations, and thus cause unexpected leadership losses. + LeaderElectionBurst int } // AddFlags adds flags for LeaderElectionOptions to the specified FlagSet. @@ -68,7 +82,7 @@ func (o *LeaderElectionOptions) AddFlags(flags *flag.FlagSet) { flags.DurationVar( &o.LeaseDuration.Duration, "leader-lease-duration", - 15*time.Second, + 60*time.Second, "The duration of a leader election lease. This is the period where a non-leader candidate will wait after observing a leadership renewal before attempting to acquire leadership of the current leader. And it is also effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. The option only applies if leader election is enabled.", ) @@ -76,7 +90,7 @@ func (o *LeaderElectionOptions) AddFlags(flags *flag.FlagSet) { flags.DurationVar( &o.RenewDeadline.Duration, "leader-renew-deadline", - 10*time.Second, + 45*time.Second, "The interval between attempts by the acting master to renew a leadership slot before it stops leading. This must be less than or equal to the lease duration. The option only applies if leader election is enabled", ) @@ -84,7 +98,7 @@ func (o *LeaderElectionOptions) AddFlags(flags *flag.FlagSet) { flags.DurationVar( &o.RetryPeriod.Duration, "leader-retry-period", - 2*time.Second, + 5*time.Second, "The duration the clients should wait between attempting acquisition and renewal of a leadership. The option only applies if leader election is enabled", ) @@ -95,4 +109,74 @@ func (o *LeaderElectionOptions) AddFlags(flags *flag.FlagSet) { utils.FleetSystemNamespace, "The namespace of the resource object that will be used to lock during leader election cycles. The option only applies if leader election is enabled.", ) + + flags.Var( + newLeaderElectionQPSValueWithValidation(250, &o.LeaderElectionQPS), + "leader-election-qps", + "The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager for leader election purposes. This sets up a separate client-side throttling mechanism specifically for lease related operations, mostly to avoid an adverse situation where normal operations in the controller manager starve the lease related operations, and thus cause unexpected leadership losses. Defaults to 250. Use a positive float64 value in the range [10.0, 1000.0], or set a less or equal to zero value to disable client-side throttling.") + + flags.Var( + newLeaderElectionBurstValueWithValidation(1000, &o.LeaderElectionBurst), + "leader-election-burst", + "The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager for leader election purposes. This sets up a separate client-side throttling mechanism specifically for lease related operations, mostly to avoid an adverse situation where normal operations in the controller manager starve the lease related operations, and thus cause unexpected leadership losses. Defaults to 1000. Use a positive int value in the range [10, 2000].") +} + +// A list of flag variables that allow pluggable validation logic when parsing the input args. + +type LeaderElectionQPSValueWithValidation float64 + +func (v *LeaderElectionQPSValueWithValidation) String() string { + return fmt.Sprintf("%f", *v) +} + +func (v *LeaderElectionQPSValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + qps, err := strconv.ParseFloat(s, 64) + if err != nil { + return fmt.Errorf("failed to parse float64 value: %w", err) + } + + if qps <= 0.0 { + // Disable client-side throttling. + *v = -1.0 + return nil + } + + if qps < 10.0 || qps > 1000.0 { + return fmt.Errorf("QPS limit is set to an invalid value (%f), must be a value in the range [10.0, 1000.0]", qps) + } + *v = LeaderElectionQPSValueWithValidation(qps) + return nil +} + +func newLeaderElectionQPSValueWithValidation(defaultVal float64, p *float64) *LeaderElectionQPSValueWithValidation { + *p = defaultVal + return (*LeaderElectionQPSValueWithValidation)(p) +} + +type LeaderElectionBurstValueWithValidation int + +func (v *LeaderElectionBurstValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *LeaderElectionBurstValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + burst, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse int value: %w", err) + } + + if burst < 10 || burst > 2000 { + return fmt.Errorf("burst limit is set to an invalid value (%d), must be a value in the range [10, 2000]", burst) + } + *v = LeaderElectionBurstValueWithValidation(burst) + return nil +} + +func newLeaderElectionBurstValueWithValidation(defaultVal int, p *int) *LeaderElectionBurstValueWithValidation { + *p = defaultVal + return (*LeaderElectionBurstValueWithValidation)(p) } diff --git a/cmd/hubagent/options/options_test.go b/cmd/hubagent/options/options_test.go index ca84d5208..c638f2d12 100644 --- a/cmd/hubagent/options/options_test.go +++ b/cmd/hubagent/options/options_test.go @@ -42,11 +42,13 @@ func TestLeaderElectionOpts(t *testing.T) { flagSetName: "allDefault", args: []string{}, wantLeaderElectionOpts: LeaderElectionOptions{ - LeaderElect: false, - LeaseDuration: metav1.Duration{Duration: 15 * time.Second}, - RenewDeadline: metav1.Duration{Duration: 10 * time.Second}, - RetryPeriod: metav1.Duration{Duration: 2 * time.Second}, - ResourceNamespace: utils.FleetSystemNamespace, + LeaderElect: false, + LeaseDuration: metav1.Duration{Duration: 60 * time.Second}, + RenewDeadline: metav1.Duration{Duration: 45 * time.Second}, + RetryPeriod: metav1.Duration{Duration: 5 * time.Second}, + ResourceNamespace: utils.FleetSystemNamespace, + LeaderElectionQPS: 250.0, + LeaderElectionBurst: 1000, }, }, { @@ -58,15 +60,75 @@ func TestLeaderElectionOpts(t *testing.T) { "--leader-renew-deadline=20s", "--leader-retry-period=5s", "--leader-election-namespace=test-namespace", + "--leader-election-qps=500", + "--leader-election-burst=1500", }, wantLeaderElectionOpts: LeaderElectionOptions{ - LeaderElect: true, - LeaseDuration: metav1.Duration{Duration: 30 * time.Second}, - RenewDeadline: metav1.Duration{Duration: 20 * time.Second}, - RetryPeriod: metav1.Duration{Duration: 5 * time.Second}, - ResourceNamespace: "test-namespace", + LeaderElect: true, + LeaseDuration: metav1.Duration{Duration: 30 * time.Second}, + RenewDeadline: metav1.Duration{Duration: 20 * time.Second}, + RetryPeriod: metav1.Duration{Duration: 5 * time.Second}, + ResourceNamespace: "test-namespace", + LeaderElectionQPS: 500.0, + LeaderElectionBurst: 1500, }, }, + { + name: "negative leader election QPS value", + flagSetName: "qpsNegative", + args: []string{"--leader-election-qps=-5"}, + wantLeaderElectionOpts: LeaderElectionOptions{ + LeaderElect: false, + LeaseDuration: metav1.Duration{Duration: 60 * time.Second}, + RenewDeadline: metav1.Duration{Duration: 45 * time.Second}, + RetryPeriod: metav1.Duration{Duration: 5 * time.Second}, + ResourceNamespace: utils.FleetSystemNamespace, + LeaderElectionQPS: -1, + LeaderElectionBurst: 1000, + }, + }, + { + name: "leader election QPS parse error", + flagSetName: "qpsParseError", + args: []string{"--leader-election-qps=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse float64 value", + }, + { + name: "leader election QPS out of range (too small)", + flagSetName: "qpsOutOfRangeTooSmall", + args: []string{"--leader-election-qps=9.9"}, + wantErred: true, + wantErrMsgSubStr: "QPS limit is set to an invalid value", + }, + { + name: "leader election QPS out of range (too large)", + flagSetName: "qpsOutOfRangeTooLarge", + args: []string{"--leader-election-qps=1000.1"}, + wantErred: true, + wantErrMsgSubStr: "QPS limit is set to an invalid value", + }, + { + name: "leader election burst parse error", + flagSetName: "burstParseError", + args: []string{"--leader-election-burst=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse int value", + }, + { + name: "leader election burst out of range (too small)", + flagSetName: "burstOutOfRangeTooSmall", + args: []string{"--leader-election-burst=9"}, + wantErred: true, + wantErrMsgSubStr: "burst limit is set to an invalid value", + }, + { + name: "leader election burst out of range (too large)", + flagSetName: "burstOutOfRangeTooLarge", + args: []string{"--leader-election-burst=2001"}, + wantErred: true, + wantErrMsgSubStr: "burst limit is set to an invalid value", + }, } for _, tc := range testCases { diff --git a/cmd/hubagent/options/validation.go b/cmd/hubagent/options/validation.go index 6431dfcce..5770be49d 100644 --- a/cmd/hubagent/options/validation.go +++ b/cmd/hubagent/options/validation.go @@ -33,6 +33,11 @@ func (o *Options) Validate() field.ErrorList { errs = append(errs, field.Invalid(newPath.Child("HubBurst"), o.CtrlMgrOpts.HubBurst, "The burst limit for client-side throttling must be greater than or equal to its QPS limit")) } + // Cross-field validation for leader election options. + if float64(o.LeaderElectionOpts.LeaderElectionBurst) < float64(o.LeaderElectionOpts.LeaderElectionQPS) { + errs = append(errs, field.Invalid(newPath.Child("LeaderElectionBurst"), o.LeaderElectionOpts.LeaderElectionBurst, "The burst limit for client-side throttling of leader election related operations must be greater than or equal to its QPS limit")) + } + // Cross-field validation for webhook options. // Note: this validation logic is a bit weird in the sense that the system accepts diff --git a/cmd/hubagent/options/validation_test.go b/cmd/hubagent/options/validation_test.go index 2e222acb7..b67d84571 100644 --- a/cmd/hubagent/options/validation_test.go +++ b/cmd/hubagent/options/validation_test.go @@ -40,6 +40,10 @@ func newTestOptions(modifyOptions ModifyOptions) Options { HubQPS: 250, HubBurst: 1000, }, + LeaderElectionOpts: LeaderElectionOptions{ + LeaderElectionQPS: 250.0, + LeaderElectionBurst: 1000, + }, WebhookOpts: WebhookOptions{ ClientConnectionType: "url", ServiceName: testWebhookServiceName, @@ -78,6 +82,13 @@ func TestValidateControllerManagerConfiguration(t *testing.T) { }), want: field.ErrorList{field.Invalid(newPath.Child("HubBurst"), 50, "The burst limit for client-side throttling must be greater than or equal to its QPS limit")}, }, + "invalid: leader election burst value is less than its QPS value": { + opt: newTestOptions(func(option *Options) { + option.LeaderElectionOpts.LeaderElectionQPS = 100 + option.LeaderElectionOpts.LeaderElectionBurst = 50 + }), + want: field.ErrorList{field.Invalid(newPath.Child("LeaderElectionBurst"), 50, "The burst limit for client-side throttling of leader election related operations must be greater than or equal to its QPS limit")}, + }, "WebhookServiceName is empty": { opt: newTestOptions(func(option *Options) { option.WebhookOpts.EnableWebhooks = true