Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/machine-config-controller/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ func runStartCmd(_ *cobra.Command, _ []string) {
ctrlctx.KubeNamespacedInformerFactory.Core().V1().Secrets(),
ctrlctx.KubeNamespacedInformerFactory.Core().V1().ConfigMaps(),
ctrlctx.ConfigInformerFactory.Config().V1().Infrastructures(),
ctrlctx.FeatureGatesHandler,
)
if err != nil {
klog.Fatalf("unable to start cert rotation controller: %v", err)
Expand Down
161 changes: 157 additions & 4 deletions pkg/controller/certrotation/certrotation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,21 @@ package certrotationcontroller
import (
"bytes"
"context"
"crypto/x509"
"encoding/json"
"encoding/pem"
"fmt"
"time"

"github.com/vincent-petithory/dataurl"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/labels"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
coreinformersv1 "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/kubernetes"
Expand All @@ -24,10 +28,12 @@ import (
"k8s.io/utils/clock"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/api/features"
configclientset "github.com/openshift/client-go/config/clientset/versioned"
machineclientset "github.com/openshift/client-go/machine/clientset/versioned"

"github.com/openshift/library-go/pkg/controller/factory"
"github.com/openshift/library-go/pkg/crypto"
"github.com/openshift/library-go/pkg/operator/certrotation"
"github.com/openshift/library-go/pkg/operator/events"

Expand All @@ -46,6 +52,7 @@ const (
mcsCARefresh = 8 * oneYear
mcsTLSKeyExpiry = mcsCAExpiry
mcsTLSKeyRefresh = mcsCARefresh
iriTLSKeyExpiry = mcsCAExpiry
workQueueKey = "key"
)

Expand All @@ -70,6 +77,8 @@ type CertRotationController struct {
recorder events.Recorder

cachesToSync []cache.InformerSynced

featureGatesHandler ctrlcommon.FeatureGatesHandler
}

// New returns a new cert rotation controller.
Expand All @@ -82,6 +91,7 @@ func New(
mcoSecretInformer coreinformersv1.SecretInformer,
mcoConfigMapInfomer coreinformersv1.ConfigMapInformer,
infraInformer configinformers.InfrastructureInformer,
featureGatesHandler ctrlcommon.FeatureGatesHandler,
) (*CertRotationController, error) {

recorder := events.NewLoggingEventRecorder(componentName, clock.RealClock{})
Expand All @@ -106,8 +116,9 @@ func New(
hostnamesQueue: workqueue.NewTypedRateLimitingQueueWithConfig(
workqueue.DefaultTypedControllerRateLimiter[string](),
workqueue.TypedRateLimitingQueueConfig[string]{Name: "Hostnames"}),
infraInformer: infraInformer,
infraLister: infraInformer.Lister(),
infraInformer: infraInformer,
infraLister: infraInformer.Lister(),
featureGatesHandler: featureGatesHandler,
}

// The cert controller will begin creating "machine-config-server-ca" configmap & secret in the MCO namespace.
Expand Down Expand Up @@ -331,11 +342,14 @@ func (c *CertRotationController) addConfigMap(obj interface{}) {
return
}

klog.Infof("configMap %s added, reconciling all user data secrets", configMap.Name)
klog.Infof("configMap %s added, reconciling user data secrets and IRI certificate", configMap.Name)

go func() {
c.reconcileUserDataSecrets()
}()
go func() {
c.reconcileIRICertificate()
}()
Comment on lines 348 to +352
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I forgot about these being individual go routines in my original implementation. There is a very small chance of a race here on create or updates. Given that the CA rotates once every 8 years or so, the risk should be minimal. Even if there is a race, the fresh GETs on the CA should result in valid creates/updates.

If we want to be careful we can combine these into one thread and set up a mutex, but IMO that's not a blocker for merging this. The safest way to do this would be via a workqueue like our other controllers, but again definitely not a blocker, just me lamenting my original decisions 😄

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about serializing them? Ie:

go func() {
		c.reconcileUserDataSecrets()
		c.reconcileIRICertificate()
}()

}

func (c *CertRotationController) updateConfigMap(oldCM, newCM interface{}) {
Expand All @@ -353,12 +367,15 @@ func (c *CertRotationController) updateConfigMap(oldCM, newCM interface{}) {
return
}

klog.Infof("configMap %s updated, reconciling all user data secrets", oldConfigMap.Name)
klog.Infof("configMap %s updated, reconciling user data secrets and IRI certificate", oldConfigMap.Name)

// Reconcile all user data secrets
go func() {
c.reconcileUserDataSecrets()
}()
go func() {
c.reconcileIRICertificate()
}()
}

func (c *CertRotationController) addSecret(obj interface{}) {
Expand Down Expand Up @@ -467,3 +484,139 @@ func (c *CertRotationController) reconcileSecret(secret corev1.Secret) error {
klog.Infof("Successfully modified %s secret \n", secret.Name)
return nil
}

func (c *CertRotationController) reconcileIRICertificate() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this internal method is very long, please evaluate to refactor it in smaller methods

if !c.featureGatesHandler.Enabled(features.FeatureGateNoRegistryClusterInstall) {
klog.V(4).Infof("Skipping IRI certificate reconciliation: %s feature gate is not enabled", features.FeatureGateNoRegistryClusterInstall)
return
}
klog.Infof("Reconciling IRI certificate")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is also required to check for the presence of the IRI cluster resource. If not present, it means that the feature is not enabled

// Get the MCS CA secret (fresh get, not from lister)
caSecret, err := c.kubeClient.CoreV1().Secrets(ctrlcommon.MCONamespace).Get(context.TODO(), ctrlcommon.MachineConfigServerCAName, metav1.GetOptions{})
if err != nil {
klog.Errorf("Cannot get MCS CA secret for IRI cert reconciliation: %v", err)
return
}

caCert := caSecret.Data[corev1.TLSCertKey]
caKey := caSecret.Data[corev1.TLSPrivateKeyKey]
if len(caCert) == 0 || len(caKey) == 0 {
klog.Errorf("MCS CA secret %s is missing cert or key data", ctrlcommon.MachineConfigServerCAName)
return
}

// Load the CA
ca, err := crypto.GetCAFromBytes(caCert, caKey)
if err != nil {
klog.Errorf("Cannot load MCS CA for IRI cert generation: %v", err)
return
}

// Check if the existing IRI cert is already valid under the current CA
iriSecret, err := c.kubeClient.CoreV1().Secrets(ctrlcommon.MCONamespace).Get(context.TODO(), ctrlcommon.InternalReleaseImageTLSSecretName, metav1.GetOptions{})
secretExists := err == nil
if secretExists {
if c.isIRICertValid(iriSecret, ca) {
klog.Infof("IRI TLS certificate is still valid under the current MCS CA, skipping rotation")
return
}
} else if !errors.IsNotFound(err) {
klog.Errorf("Cannot get IRI TLS secret: %v", err)
return
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this whole block could be simplified with the happy path, and secretExists removed. Ie:

iriSecret, err := c.kubeClient.CoreV1().Secrets(ctrlcommon.MCONamespace).Get(context.TODO(), ctrlcommon.InternalReleaseImageTLSSecretName, metav1.GetOptions{})
if err != nil && !errors.IsNotFound(err) {
        klog.Errorf("Cannot get IRI TLS secret: %v", err)
		return
}
if iriSecret != nil && c.isIRICertValid(iriSecret, ca) {
   	klog.Infof("IRI TLS certificate is still valid under the current MCS CA, skipping rotation")
    return
}

Later you could simply check for iriSecret != nil

// Get hostnames from the dynamic serving rotation (includes api-int hostname and platform VIPs)
Copy link
Contributor

@djoshy djoshy Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the hostnames from the infra object required for the IRI cert? I was under the impression it wasn't.

It's possible the infra hostnames can get updated outside of the configmap being updated(very unlikely, but plauisble), that could result in the hostnames in the IRI cert would be stale. The MCS TLS cert handles this as it feeds of the dynamic serving rotation so I thought it would be worth mentioning:

CertCreator: &certrotation.ServingRotation{
Hostnames: c.hostnamesRotation.GetHostnames,
HostnamesChanged: c.hostnamesRotation.hostnamesChanged,
},

If we do want infra hostnames changes to be accounted for, we can call reconcileIRICertificate when the hostname queue gets updated, and I would also recommend adding a check for the IPs in isIRICertValid(). If not, we can just take out this bit that uses the hostnames from the dynamic serving rotation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

During the installation, when the initial IRI TLS certificate is generated (see here), we don't use hostnames, just localhost and the apiInt url.
The entry point for the IRI (logical) service is - and must remain - the apiInt. The localhost must be supported as a special case to allow masters to consume their own local registries in case of reboot / disconnect (ie when the apiInt is not reachable for any reason). So to summarize I don't think hostname are really required, but just apiInt and localHost (same behavior of the installer asset)

hostnames := c.hostnamesRotation.GetHostnames()
if len(hostnames) == 0 {
klog.Errorf("No hostnames available for IRI cert generation")
return
}
// IRI registry also serves locally on each master node, matching the installer's SANs
hostnames = append(hostnames, "localhost", "127.0.0.1", "::1")

// Generate a new IRI TLS certificate signed by the MCS CA
certConfig, err := ca.MakeServerCert(sets.New(hostnames...), iriTLSKeyExpiry)
if err != nil {
klog.Errorf("Cannot generate IRI TLS certificate: %v", err)
return
}

certPEM, keyPEM, err := certConfig.GetPEMBytes()
if err != nil {
klog.Errorf("Cannot get PEM bytes for IRI TLS certificate: %v", err)
return
}

if !secretExists {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to take into account this case? It doesn't seem a valid one (at least here): if the secret does not exist for any reason, the IRI controller will be broken and won't work as well

// Create new secret
newSecret := &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: ctrlcommon.InternalReleaseImageTLSSecretName,
Namespace: ctrlcommon.MCONamespace,
},
Type: corev1.SecretTypeTLS,
Data: map[string][]byte{
corev1.TLSCertKey: certPEM,
corev1.TLSPrivateKeyKey: keyPEM,
},
}
if _, err := c.kubeClient.CoreV1().Secrets(ctrlcommon.MCONamespace).Create(context.TODO(), newSecret, metav1.CreateOptions{}); err != nil {
klog.Errorf("Cannot create IRI TLS secret: %v", err)
return
}
klog.Infof("Successfully created IRI TLS secret %s", ctrlcommon.InternalReleaseImageTLSSecretName)
return
}

// Update existing secret
updatedSecret := iriSecret.DeepCopy()
updatedSecret.Data[corev1.TLSCertKey] = certPEM
updatedSecret.Data[corev1.TLSPrivateKeyKey] = keyPEM
if _, err := c.kubeClient.CoreV1().Secrets(ctrlcommon.MCONamespace).Update(context.TODO(), updatedSecret, metav1.UpdateOptions{}); err != nil {
klog.Errorf("Cannot update IRI TLS secret: %v", err)
return
}
klog.Infof("Successfully updated IRI TLS secret %s", ctrlcommon.InternalReleaseImageTLSSecretName)
}

// isIRICertValid checks whether the existing IRI certificate is signed by the given CA
// and has not expired.
func (c *CertRotationController) isIRICertValid(iriSecret *corev1.Secret, ca *crypto.CA) bool {
certPEM := iriSecret.Data[corev1.TLSCertKey]
if len(certPEM) == 0 {
return false
}

// Decode the first PEM block (the leaf certificate)
block, _ := pem.Decode(certPEM)
if block == nil {
klog.Warningf("Cannot decode PEM from existing IRI TLS certificate")
return false
}

iriCert, err := x509.ParseCertificate(block.Bytes)
if err != nil {
klog.Warningf("Cannot parse existing IRI TLS certificate: %v", err)
return false
}

// Build a cert pool with the current CA to verify against
caPool := x509.NewCertPool()
for _, caCert := range ca.Config.Certs {
caPool.AddCert(caCert)
}

// Verify the IRI cert is signed by the current CA and is not expired
_, err = iriCert.Verify(x509.VerifyOptions{
Roots: caPool,
CurrentTime: time.Now(),
KeyUsages: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
})
if err != nil {
klog.Infof("Existing IRI TLS certificate is not valid under current MCS CA: %v", err)
return false
}

return true
}
Loading