From a7b45febc0c5a3243d769137e82d023ec8771fdf Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Fri, 13 Mar 2026 09:52:13 +0100 Subject: [PATCH 1/5] OCPBUGS-78154: Use HA leader election defaults for MCO on SNO. --- cmd/common/helpers.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmd/common/helpers.go b/cmd/common/helpers.go index a7ad807128..356f9b3b0d 100644 --- a/cmd/common/helpers.go +++ b/cmd/common/helpers.go @@ -70,7 +70,11 @@ func GetLeaderElectionConfig(restcfg *rest.Config) configv1.LeaderElection { if infra, err := clusterstatus.GetClusterInfraStatus(context.TODO(), restcfg); err == nil && infra != nil { if infra.ControlPlaneTopology == configv1.SingleReplicaTopologyMode { - return leaderelection.LeaderElectionSNOConfig(defaultLeaderElection) + // MCO runs a single replica — no lease contention on SNO. + // Use HA defaults to avoid ~5min recovery delay after reboots. + // See: https://issues.redhat.com/browse/OCPBUGS-78154 + klog.Infof("SNO topology detected, using HA leader election defaults for faster lease recovery (LeaseDuration=%s, RetryPeriod=%s)", + defaultLeaderElection.LeaseDuration.Duration, defaultLeaderElection.RetryPeriod.Duration) } } else { klog.Warningf("unable to get cluster infrastructure status, using HA cluster values for leader election: %v", err) From f016a8e8d26c461b3e6d7138c854476b95895313 Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Fri, 13 Mar 2026 09:58:06 +0100 Subject: [PATCH 2/5] Trim comment in GetLeaderElectionConfig. --- cmd/common/helpers.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/common/helpers.go b/cmd/common/helpers.go index 356f9b3b0d..b4ad2d9618 100644 --- a/cmd/common/helpers.go +++ b/cmd/common/helpers.go @@ -71,8 +71,6 @@ func GetLeaderElectionConfig(restcfg *rest.Config) configv1.LeaderElection { if infra, err := clusterstatus.GetClusterInfraStatus(context.TODO(), restcfg); err == nil && infra != nil { if infra.ControlPlaneTopology == configv1.SingleReplicaTopologyMode { // MCO runs a single replica — no lease contention on SNO. - // Use HA defaults to avoid ~5min recovery delay after reboots. - // See: https://issues.redhat.com/browse/OCPBUGS-78154 klog.Infof("SNO topology detected, using HA leader election defaults for faster lease recovery (LeaseDuration=%s, RetryPeriod=%s)", defaultLeaderElection.LeaseDuration.Duration, defaultLeaderElection.RetryPeriod.Duration) } From 165da5a9a880740520539b0e7fc67c27d346b15f Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Fri, 13 Mar 2026 15:56:37 +0100 Subject: [PATCH 3/5] OCPBUGS-78154: Add useDefaultTimings flag to GetLeaderElectionConfig Refactor GetLeaderElectionConfig to accept a flag that allows callers to use HA default leader election timings regardless of cluster topology. All MCO components (MCC, MCO, MOB) pass true to avoid inflated SNO lease timings that cause ~5.5min delay after node reboot. Co-Authored-By: Claude Opus 4.6 --- cmd/common/helpers.go | 13 ++++++++----- cmd/machine-config-controller/start.go | 2 +- cmd/machine-config-operator/start.go | 2 +- cmd/machine-os-builder/start.go | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cmd/common/helpers.go b/cmd/common/helpers.go index b4ad2d9618..058c331378 100644 --- a/cmd/common/helpers.go +++ b/cmd/common/helpers.go @@ -58,8 +58,9 @@ func CreateResourceLock(cb *clients.Builder, componentNamespace, componentName s return lock } -// GetLeaderElectionConfig returns leader election configs defaults based on the cluster topology -func GetLeaderElectionConfig(restcfg *rest.Config) configv1.LeaderElection { +// GetLeaderElectionConfig returns leader election configs defaults based on the cluster topology. +// When useDefaultTimings is true, HA default timings are used regardless of topology. +func GetLeaderElectionConfig(restcfg *rest.Config, useDefaultTimings bool) configv1.LeaderElection { // Defaults follow conventions // https://github.com/openshift/enhancements/blob/master/CONVENTIONS.md#high-availability @@ -68,11 +69,13 @@ func GetLeaderElectionConfig(restcfg *rest.Config) configv1.LeaderElection { "", "", ) + if useDefaultTimings { + return defaultLeaderElection + } + if infra, err := clusterstatus.GetClusterInfraStatus(context.TODO(), restcfg); err == nil && infra != nil { if infra.ControlPlaneTopology == configv1.SingleReplicaTopologyMode { - // MCO runs a single replica — no lease contention on SNO. - klog.Infof("SNO topology detected, using HA leader election defaults for faster lease recovery (LeaseDuration=%s, RetryPeriod=%s)", - defaultLeaderElection.LeaseDuration.Duration, defaultLeaderElection.RetryPeriod.Duration) + return leaderelection.LeaderElectionSNOConfig(defaultLeaderElection) } } else { klog.Warningf("unable to get cluster infrastructure status, using HA cluster values for leader election: %v", err) diff --git a/cmd/machine-config-controller/start.go b/cmd/machine-config-controller/start.go index f9a022d230..a968ce371b 100644 --- a/cmd/machine-config-controller/start.go +++ b/cmd/machine-config-controller/start.go @@ -181,7 +181,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { <-ctx.Done() } - leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig()) + leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig(), true) leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{ Lock: common.CreateResourceLock(cb, startOpts.resourceLockNamespace, componentName), diff --git a/cmd/machine-config-operator/start.go b/cmd/machine-config-operator/start.go index e5b0ef76df..c88d20f0ee 100644 --- a/cmd/machine-config-operator/start.go +++ b/cmd/machine-config-operator/start.go @@ -143,7 +143,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { <-ctx.Done() } - leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig()) + leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig(), true) leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{ Lock: common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName), diff --git a/cmd/machine-os-builder/start.go b/cmd/machine-os-builder/start.go index a1c20f795c..2c025afbda 100644 --- a/cmd/machine-os-builder/start.go +++ b/cmd/machine-os-builder/start.go @@ -72,7 +72,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { ctrl.Run(ctx, 3) } - leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig()) + leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig(), true) leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{ Lock: common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName), From 65d648b8aaa0ea72a4fac0bcbc4d70923b6889f7 Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Fri, 13 Mar 2026 16:22:34 +0100 Subject: [PATCH 4/5] OCPBUGS-78154: Log when using HA default leader election timings Add info log when useDefaultTimings flag bypasses topology-specific timings for better runtime visibility during troubleshooting. Co-Authored-By: Claude Opus 4.6 --- cmd/common/helpers.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/common/helpers.go b/cmd/common/helpers.go index 058c331378..3c98c35664 100644 --- a/cmd/common/helpers.go +++ b/cmd/common/helpers.go @@ -70,6 +70,8 @@ func GetLeaderElectionConfig(restcfg *rest.Config, useDefaultTimings bool) confi ) if useDefaultTimings { + klog.Infof("Using HA default leader election timings (LeaseDuration=%s, RetryPeriod=%s)", + defaultLeaderElection.LeaseDuration.Duration, defaultLeaderElection.RetryPeriod.Duration) return defaultLeaderElection } From 34cd2aa29a7d4e08a4208e79942663414db6d7b3 Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Fri, 13 Mar 2026 17:30:09 +0100 Subject: [PATCH 5/5] OCPBUGS-78154: Add GetDefaultLeaderElectionConfig for MCC Add a dedicated function that returns HA default leader election timings regardless of cluster topology. Only MCC uses it, since it runs a single replica with no lease contention and the inflated SNO timings add ~5.5min of unnecessary recovery latency after node reboot. Co-Authored-By: Claude Opus 4.6 --- cmd/common/helpers.go | 24 +++++++++++++++--------- cmd/machine-config-controller/start.go | 2 +- cmd/machine-config-operator/start.go | 2 +- cmd/machine-os-builder/start.go | 2 +- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/cmd/common/helpers.go b/cmd/common/helpers.go index 3c98c35664..32adddefdd 100644 --- a/cmd/common/helpers.go +++ b/cmd/common/helpers.go @@ -58,9 +58,8 @@ func CreateResourceLock(cb *clients.Builder, componentNamespace, componentName s return lock } -// GetLeaderElectionConfig returns leader election configs defaults based on the cluster topology. -// When useDefaultTimings is true, HA default timings are used regardless of topology. -func GetLeaderElectionConfig(restcfg *rest.Config, useDefaultTimings bool) configv1.LeaderElection { +// GetLeaderElectionConfig returns leader election configs defaults based on the cluster topology +func GetLeaderElectionConfig(restcfg *rest.Config) configv1.LeaderElection { // Defaults follow conventions // https://github.com/openshift/enhancements/blob/master/CONVENTIONS.md#high-availability @@ -69,12 +68,6 @@ func GetLeaderElectionConfig(restcfg *rest.Config, useDefaultTimings bool) confi "", "", ) - if useDefaultTimings { - klog.Infof("Using HA default leader election timings (LeaseDuration=%s, RetryPeriod=%s)", - defaultLeaderElection.LeaseDuration.Duration, defaultLeaderElection.RetryPeriod.Duration) - return defaultLeaderElection - } - if infra, err := clusterstatus.GetClusterInfraStatus(context.TODO(), restcfg); err == nil && infra != nil { if infra.ControlPlaneTopology == configv1.SingleReplicaTopologyMode { return leaderelection.LeaderElectionSNOConfig(defaultLeaderElection) @@ -86,6 +79,19 @@ func GetLeaderElectionConfig(restcfg *rest.Config, useDefaultTimings bool) confi return defaultLeaderElection } +// GetDefaultLeaderElectionConfig returns HA default leader election timings regardless of +// cluster topology. Use this for components that run a single replica and have no lease +// contention, where the inflated SNO timings only add unnecessary recovery latency. +func GetDefaultLeaderElectionConfig() configv1.LeaderElection { + defaultLeaderElection := leaderelection.LeaderElectionDefaulting( + configv1.LeaderElection{}, + "", "", + ) + klog.Infof("Using HA default leader election timings (LeaseDuration=%s, RetryPeriod=%s)", + defaultLeaderElection.LeaseDuration.Duration, defaultLeaderElection.RetryPeriod.Duration) + return defaultLeaderElection +} + // SignalHandler catches SIGINT/SIGTERM signals and makes sure the passed context gets cancelled when those signals happen. This allows us to use a // context to shut down our operations cleanly when we are signalled to shutdown. func SignalHandler(runCancel context.CancelFunc) { diff --git a/cmd/machine-config-controller/start.go b/cmd/machine-config-controller/start.go index a968ce371b..71a547e63c 100644 --- a/cmd/machine-config-controller/start.go +++ b/cmd/machine-config-controller/start.go @@ -181,7 +181,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { <-ctx.Done() } - leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig(), true) + leaderElectionCfg := common.GetDefaultLeaderElectionConfig() leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{ Lock: common.CreateResourceLock(cb, startOpts.resourceLockNamespace, componentName), diff --git a/cmd/machine-config-operator/start.go b/cmd/machine-config-operator/start.go index c88d20f0ee..e5b0ef76df 100644 --- a/cmd/machine-config-operator/start.go +++ b/cmd/machine-config-operator/start.go @@ -143,7 +143,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { <-ctx.Done() } - leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig(), true) + leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig()) leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{ Lock: common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName), diff --git a/cmd/machine-os-builder/start.go b/cmd/machine-os-builder/start.go index 2c025afbda..a1c20f795c 100644 --- a/cmd/machine-os-builder/start.go +++ b/cmd/machine-os-builder/start.go @@ -72,7 +72,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { ctrl.Run(ctx, 3) } - leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig(), true) + leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig()) leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{ Lock: common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName),