From fdf1f44bc7f54c2ee168d9799fd24238971ff1ac Mon Sep 17 00:00:00 2001 From: Dalia Khater Date: Tue, 3 Mar 2026 02:22:18 -0600 Subject: [PATCH] Add etcd size limit validation for rendered MachineConfigs Fixes a bug where MachineConfigPools get stuck in degraded state with "etcdserver: request is too large" errors when rendered MachineConfigs exceed etcd's 1.5 MiB request size limit. Changes: - Add MaxMachineConfigSize constant (1572864 bytes) in constants.go - Add ValidateMachineConfigSize() function in helpers.go that: * Validates rendered MC size before sending to etcd * Returns clear error message with remediation guidance * Logs warning when size exceeds 80% of limit * Provides debug logging of MC size usage - Call validation in render controller before MC create/update This prevents the operator from attempting to write oversized MCs to etcd, provides early detection with helpful error messages, and avoids wasting retry attempts. The error message specifically mentions large registry mirror configurations (ImageDigestMirrorSet/ICSP) as the primary cause and suggests reducing their size. --- pkg/controller/common/constants.go | 5 +++ pkg/controller/common/helpers.go | 37 ++++++++++++++++++++++ pkg/controller/render/render_controller.go | 5 +++ 3 files changed, 47 insertions(+) diff --git a/pkg/controller/common/constants.go b/pkg/controller/common/constants.go index 31ba8b1040..1f890d0eb8 100644 --- a/pkg/controller/common/constants.go +++ b/pkg/controller/common/constants.go @@ -160,6 +160,11 @@ const ( // Note: Update units in status_test.go when the following are bumped RHCOSVersionBootImageSkewLimit = "9.2" OCPVersionBootImageSkewLimit = "4.13.0" + + // MaxMachineConfigSize is the maximum size for a MachineConfig object in bytes. + // This matches etcd's default request size limit of 1.5 MiB (1572864 bytes). 
+	// Reference: https://issues.redhat.com/browse/OCPBUGS-62619
+	MaxMachineConfigSize = 1572864
 )
 
 // Commonly-used MCO ConfigMap names
diff --git a/pkg/controller/common/helpers.go b/pkg/controller/common/helpers.go
index 54bb1a4305..8bd49484f5 100644
--- a/pkg/controller/common/helpers.go
+++ b/pkg/controller/common/helpers.go
@@ -468,6 +468,43 @@ func ValidateMachineConfig(cfg mcfgv1.MachineConfigSpec) error {
 	return nil
 }
 
+// ValidateMachineConfigSize returns an error when mc is nil, cannot be marshaled,
+// or has a JSON encoding larger than MaxMachineConfigSize (etcd's default 1.5 MiB
+// request limit), which would otherwise fail with "etcdserver: request is too
+// large". Sizes above 80% of the limit pass but are logged as a warning.
+func ValidateMachineConfigSize(mc *mcfgv1.MachineConfig) error {
+	if mc == nil {
+		return fmt.Errorf("cannot validate size of nil MachineConfig")
+	}
+
+	// Measure the JSON encoding, used here as the size of the object sent to etcd.
+	data, err := json.Marshal(mc)
+	if err != nil {
+		return fmt.Errorf("failed to marshal MachineConfig: %w", err)
+	}
+
+	size := len(data)
+	if size > MaxMachineConfigSize {
+		return fmt.Errorf("rendered MachineConfig %s is too large (%d bytes, max %d bytes). "+
+			"This will exceed etcd's size limit. Consider reducing the number or size of MachineConfigs, "+
+			"particularly large registry mirror configurations (ImageDigestMirrorSet/ImageContentSourcePolicy)",
+			mc.Name, size, MaxMachineConfigSize)
+	}
+
+	percentUsed := float64(size) / float64(MaxMachineConfigSize) * 100
+	klog.V(4).Infof("MachineConfig %s size: %d bytes (%.2f%% of %d byte limit)",
+		mc.Name, size, percentUsed, MaxMachineConfigSize)
+
+	// Warn once usage passes 80% of the limit (integer math: multiply, then divide).
+	if size > MaxMachineConfigSize*4/5 {
+		klog.Warningf("MachineConfig %s is approaching size limit: %d bytes (%.2f%% of %d byte limit). "+
+			"Consider reducing MachineConfig size to avoid hitting the limit.",
+			mc.Name, size, percentUsed, MaxMachineConfigSize)
+	}
+
+	return nil
+}
+
 // Validates that a given MachineConfig's extensions are supported.
 func ValidateMachineConfigExtensions(cfg mcfgv1.MachineConfigSpec) error {
 	return validateExtensions(cfg.Extensions)
diff --git a/pkg/controller/render/render_controller.go b/pkg/controller/render/render_controller.go
index 1d78e369ea..440ccabaa2 100644
--- a/pkg/controller/render/render_controller.go
+++ b/pkg/controller/render/render_controller.go
@@ -602,6 +602,11 @@ func (ctrl *Controller) syncGeneratedMachineConfig(pool *mcfgv1.MachineConfigPoo
 		return fmt.Errorf("could not generate rendered MachineConfig: %w", err)
 	}
 
+	// Validate that the generated MachineConfig does not exceed etcd size limits
+	if err := ctrlcommon.ValidateMachineConfigSize(generated); err != nil {
+		return fmt.Errorf("size validation failed: %w", err)
+	}
+
 	// Collect metric when OSImageURL was overridden
 	var isOSImageURLOverridden bool
 	if generated.Spec.OSImageURL != ctrlcommon.GetBaseImageContainer(&cc.Spec, osImageStreamSet) {