From d38ac014f5d427a9db82d95eabdfa6b3fc651e47 Mon Sep 17 00:00:00 2001 From: eshulman2 Date: Wed, 28 Jan 2026 16:40:11 +0200 Subject: [PATCH 1/3] Add resyncPeriod option for drift detection This change introduces a new `resyncPeriod` field in `ManagedOptions` that enables periodic reconciliation of resources to detect and correct drift from the desired state in OpenStack. Motivation: Resources managed by ORC can drift from their desired state due to external modifications in OpenStack. Without drift detection, these changes go unnoticed until the next spec change triggers reconciliation. The resyncPeriod option allows users to configure periodic checks to detect and remediate such drift automatically. Implementation: - Add `resyncPeriod` field (metav1.Duration) to ManagedOptions API - Add GetResyncPeriod() helper method for safe access with nil handling - Modify shouldReconcile() in the generic controller to check if enough time has passed since the last successful reconciliation - Schedule next resync when periodic resync is enabled and no other reschedule is pending - Update all CRDs, OpenAPI schema, and documentation Usage: ```yaml spec: managedOptions: resyncPeriod: 1h # Reconcile every hour to detect drift ``` If not specified or set to 0, periodic resync is disabled (default behavior unchanged). Closes: https://github.com/k-orc/openstack-resource-controller/issues/655 Co-Authored-By: Claude Opus 4.5 --- api/v1alpha1/controller_options.go | 26 ++++++++++++ api/v1alpha1/zz_generated.deepcopy.go | 41 +++++++++++-------- cmd/models-schema/zz_generated.openapi.go | 8 ++++ .../bases/openstack.k-orc.cloud_domains.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_flavors.yaml | 8 ++++ .../openstack.k-orc.cloud_floatingips.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_groups.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_images.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_keypairs.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_networks.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_ports.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_projects.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_roles.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_routers.yaml | 8 ++++ .../openstack.k-orc.cloud_securitygroups.yaml | 8 ++++ .../openstack.k-orc.cloud_servergroups.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_servers.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_services.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_subnets.yaml | 8 ++++ .../bases/openstack.k-orc.cloud_volumes.yaml | 8 ++++ .../openstack.k-orc.cloud_volumetypes.yaml | 8 ++++ .../generic/reconciler/controller.go | 38 +++++++++++++---- .../api/v1alpha1/managedoptions.go | 12 +++++- .../applyconfiguration/internal/internal.go | 5 +++ website/docs/crd-reference.md | 1 + website/docs/user-guide/index.md | 41 +++++++++++++++++++ 26 files changed, 290 insertions(+), 26 deletions(-) diff --git a/api/v1alpha1/controller_options.go b/api/v1alpha1/controller_options.go index d0908b10e..e38ad0d4d 100644 --- a/api/v1alpha1/controller_options.go +++ b/api/v1alpha1/controller_options.go @@ -16,6 +16,12 @@ limitations under the License. package v1alpha1 +import ( + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + // +kubebuilder:validation:Enum:=managed;unmanaged type ManagementPolicy string @@ -53,6 +59,14 @@ type ManagedOptions struct { // +kubebuilder:default:=delete // +optional OnDelete OnDelete `json:"onDelete,omitempty"` + + // resyncPeriod specifies the interval after which a successfully + // reconciled resource will be reconciled again to detect drift from the + // desired state. Set to 0 to disable periodic resync. If not specified, + // the default is 10 hours. + // +kubebuilder:default:="10h" + // +optional + ResyncPeriod *metav1.Duration `json:"resyncPeriod,omitempty"` //nolint:kubeapilinter } // GetOnDelete returns the delete behaviour from ManagedOptions. If called on a @@ -63,3 +77,15 @@ func (o *ManagedOptions) GetOnDelete() OnDelete { } return o.OnDelete } + +// DefaultResyncPeriod is the default interval for periodic resync to detect drift (10 hours). +const DefaultResyncPeriod = 10 * time.Hour + +// GetResyncPeriod returns the resync period from ManagedOptions. If called on a +// nil receiver or if ResyncPeriod is not set, it returns the default of 10 hours. +func (o *ManagedOptions) GetResyncPeriod() time.Duration { + if o == nil || o.ResyncPeriod == nil { + return DefaultResyncPeriod + } + return o.ResyncPeriod.Duration +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 1024ea46a..edfc6f2d1 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -300,7 +300,7 @@ func (in *DomainSpec) DeepCopyInto(out *DomainSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -707,7 +707,7 @@ func (in *FlavorSpec) DeepCopyInto(out *FlavorSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -971,7 +971,7 @@ func (in *FloatingIPSpec) DeepCopyInto(out *FloatingIPSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -1188,7 +1188,7 @@ func (in *GroupSpec) DeepCopyInto(out *GroupSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -1700,7 +1700,7 @@ func (in *ImageSpec) DeepCopyInto(out *ImageSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -1928,7 +1928,7 @@ func (in *KeyPairSpec) DeepCopyInto(out *KeyPairSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -1978,6 +1978,11 @@ func (in *KeyPairStatus) DeepCopy() *KeyPairStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ManagedOptions) DeepCopyInto(out *ManagedOptions) { *out = *in + if in.ResyncPeriod != nil { + in, out := &in.ResyncPeriod, &out.ResyncPeriod + *out = new(v1.Duration) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManagedOptions. @@ -2257,7 +2262,7 @@ func (in *NetworkSpec) DeepCopyInto(out *NetworkSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -2608,7 +2613,7 @@ func (in *PortSpec) DeepCopyInto(out *PortSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -2836,7 +2841,7 @@ func (in *ProjectSpec) DeepCopyInto(out *ProjectSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -3073,7 +3078,7 @@ func (in *RoleSpec) DeepCopyInto(out *RoleSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -3447,7 +3452,7 @@ func (in *RouterSpec) DeepCopyInto(out *RouterSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -3760,7 +3765,7 @@ func (in *SecurityGroupSpec) DeepCopyInto(out *SecurityGroupSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -4055,7 +4060,7 @@ func (in *ServerGroupSpec) DeepCopyInto(out *ServerGroupSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -4366,7 +4371,7 @@ func (in *ServerSpec) DeepCopyInto(out *ServerSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -4623,7 +4628,7 @@ func (in *ServiceSpec) DeepCopyInto(out *ServiceSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -4967,7 +4972,7 @@ func (in *SubnetSpec) DeepCopyInto(out *SubnetSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -5308,7 +5313,7 @@ func (in *VolumeSpec) DeepCopyInto(out *VolumeSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } @@ -5575,7 +5580,7 @@ func (in *VolumeTypeSpec) DeepCopyInto(out *VolumeTypeSpec) { if in.ManagedOptions != nil { in, out := &in.ManagedOptions, &out.ManagedOptions *out = new(ManagedOptions) - **out = **in + (*in).DeepCopyInto(*out) } out.CloudCredentialsRef = in.CloudCredentialsRef } diff --git a/cmd/models-schema/zz_generated.openapi.go b/cmd/models-schema/zz_generated.openapi.go index 42f004b03..f6dcfeadb 100644 --- a/cmd/models-schema/zz_generated.openapi.go +++ b/cmd/models-schema/zz_generated.openapi.go @@ -3774,9 +3774,17 @@ func schema_openstack_resource_controller_v2_api_v1alpha1_ManagedOptions(ref com Format: "", }, }, + "resyncPeriod": { + SchemaProps: spec.SchemaProps{ + Description: "resyncPeriod specifies the interval after which a successfully reconciled resource will be reconciled again to detect drift from the desired state. Set to 0 to disable periodic resync. If not specified, the default is 10 hours.", + Ref: ref("k8s.io/apimachinery/pkg/apis/meta/v1.Duration"), + }, + }, }, }, }, + Dependencies: []string{ + "k8s.io/apimachinery/pkg/apis/meta/v1.Duration"}, } } diff --git a/config/crd/bases/openstack.k-orc.cloud_domains.yaml b/config/crd/bases/openstack.k-orc.cloud_domains.yaml index 893ff831c..f9acb41bb 100644 --- a/config/crd/bases/openstack.k-orc.cloud_domains.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_domains.yaml @@ -125,6 +125,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_flavors.yaml b/config/crd/bases/openstack.k-orc.cloud_flavors.yaml index d1c930aa2..ae3b474b8 100644 --- a/config/crd/bases/openstack.k-orc.cloud_flavors.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_flavors.yaml @@ -137,6 +137,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_floatingips.yaml b/config/crd/bases/openstack.k-orc.cloud_floatingips.yaml index 686b41c71..a0a5cbcb6 100644 --- a/config/crd/bases/openstack.k-orc.cloud_floatingips.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_floatingips.yaml @@ -209,6 +209,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_groups.yaml b/config/crd/bases/openstack.k-orc.cloud_groups.yaml index e62f33c1c..ebe56583d 100644 --- a/config/crd/bases/openstack.k-orc.cloud_groups.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_groups.yaml @@ -126,6 +126,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_images.yaml b/config/crd/bases/openstack.k-orc.cloud_images.yaml index 6b5dce9b9..109aa537e 100644 --- a/config/crd/bases/openstack.k-orc.cloud_images.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_images.yaml @@ -139,6 +139,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_keypairs.yaml b/config/crd/bases/openstack.k-orc.cloud_keypairs.yaml index 969748e90..b83c290f7 100644 --- a/config/crd/bases/openstack.k-orc.cloud_keypairs.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_keypairs.yaml @@ -121,6 +121,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_networks.yaml b/config/crd/bases/openstack.k-orc.cloud_networks.yaml index bba47e4c8..d431a45b5 100644 --- a/config/crd/bases/openstack.k-orc.cloud_networks.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_networks.yaml @@ -195,6 +195,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_ports.yaml b/config/crd/bases/openstack.k-orc.cloud_ports.yaml index c14173d71..c51acacd9 100644 --- a/config/crd/bases/openstack.k-orc.cloud_ports.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_ports.yaml @@ -209,6 +209,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_projects.yaml b/config/crd/bases/openstack.k-orc.cloud_projects.yaml index ab550af2b..470b3b4f4 100644 --- a/config/crd/bases/openstack.k-orc.cloud_projects.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_projects.yaml @@ -165,6 +165,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_roles.yaml b/config/crd/bases/openstack.k-orc.cloud_roles.yaml index 98cb4993d..7a43e41e7 100644 --- a/config/crd/bases/openstack.k-orc.cloud_roles.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_roles.yaml @@ -126,6 +126,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_routers.yaml b/config/crd/bases/openstack.k-orc.cloud_routers.yaml index 870aee9db..c19e2fb83 100644 --- a/config/crd/bases/openstack.k-orc.cloud_routers.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_routers.yaml @@ -190,6 +190,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_securitygroups.yaml b/config/crd/bases/openstack.k-orc.cloud_securitygroups.yaml index 13cef5e35..98bfee8a9 100644 --- a/config/crd/bases/openstack.k-orc.cloud_securitygroups.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_securitygroups.yaml @@ -190,6 +190,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_servergroups.yaml b/config/crd/bases/openstack.k-orc.cloud_servergroups.yaml index ad2eafd83..873f3bbd3 100644 --- a/config/crd/bases/openstack.k-orc.cloud_servergroups.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_servergroups.yaml @@ -121,6 +121,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_servers.yaml b/config/crd/bases/openstack.k-orc.cloud_servers.yaml index 2a882bad3..f7f06f67c 100644 --- a/config/crd/bases/openstack.k-orc.cloud_servers.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_servers.yaml @@ -171,6 +171,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_services.yaml b/config/crd/bases/openstack.k-orc.cloud_services.yaml index 9e6a16416..79b563306 100644 --- a/config/crd/bases/openstack.k-orc.cloud_services.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_services.yaml @@ -126,6 +126,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_subnets.yaml b/config/crd/bases/openstack.k-orc.cloud_subnets.yaml index e0445ee80..354dfcd4f 100644 --- a/config/crd/bases/openstack.k-orc.cloud_subnets.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_subnets.yaml @@ -237,6 +237,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_volumes.yaml b/config/crd/bases/openstack.k-orc.cloud_volumes.yaml index eeaf10a8b..12392695f 100644 --- a/config/crd/bases/openstack.k-orc.cloud_volumes.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_volumes.yaml @@ -136,6 +136,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/config/crd/bases/openstack.k-orc.cloud_volumetypes.yaml b/config/crd/bases/openstack.k-orc.cloud_volumetypes.yaml index c92df01fc..8080366f8 100644 --- a/config/crd/bases/openstack.k-orc.cloud_volumetypes.yaml +++ b/config/crd/bases/openstack.k-orc.cloud_volumetypes.yaml @@ -130,6 +130,14 @@ spec: - delete - detach type: string + resyncPeriod: + default: 10h + description: |- + resyncPeriod specifies the interval after which a successfully + reconciled resource will be reconciled again to detect drift from the + desired state. Set to 0 to disable periodic resync. If not specified, + the default is 10 hours. + type: string type: object managementPolicy: default: managed diff --git a/internal/controllers/generic/reconciler/controller.go b/internal/controllers/generic/reconciler/controller.go index 7571519cd..793585444 100644 --- a/internal/controllers/generic/reconciler/controller.go +++ b/internal/controllers/generic/reconciler/controller.go @@ -19,6 +19,7 @@ package reconciler import ( "context" "fmt" + "time" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" @@ -135,15 +136,16 @@ func (c *Controller[ // status indicates that no further reconciliation is required. // // Specifically it looks at the Progressing condition. It has the following behaviour: -// - Progressing condition is not present -> reconcile -// - Progressing condition is present and True -> reconcile -// - Progressing condition is present and False, but observedGeneration is old -> reconcile -// - Progressing condition is false and observedGeneration is up to date -> do not reconcile +// - Progressing condition is not present -> reconcile +// - Progressing condition is present and True -> reconcile +// - Progressing condition is present and False, but observedGeneration is old -> reconcile +// - Progressing condition is false and observedGeneration is up to date -> do not reconcile +// (unless periodic resync is enabled and enough time has passed since the last sync) // // If shouldReconcile is preventing an object from being reconciled which should // be reconciled, consider if that object's actuator is correctly returning a // ProgressStatus indicating that the reconciliation should continue. -func shouldReconcile(obj orcv1alpha1.ObjectWithConditions) bool { +func shouldReconcile(obj orcv1alpha1.ObjectWithConditions, resyncPeriod time.Duration) bool { progressing := meta.FindStatusCondition(obj.GetConditions(), orcv1alpha1.ConditionProgressing) if progressing == nil { return true @@ -153,7 +155,17 @@ func shouldReconcile(obj orcv1alpha1.ObjectWithConditions) bool { return true } - return progressing.ObservedGeneration != obj.GetGeneration() + if progressing.ObservedGeneration != obj.GetGeneration() { + return true + } + + // At this point, Progressing is False and generation is up to date. + // For periodic resync, check if enough time has passed since the last sync. + if resyncPeriod > 0 { + return time.Since(progressing.LastTransitionTime.Time) >= resyncPeriod + } + + return false } func (c *Controller[ @@ -165,10 +177,13 @@ func (c *Controller[ ]) reconcileNormal(ctx context.Context, objAdapter interfaces.APIObjectAdapter[orcObjectPT, resourceSpecT, filterT]) (reconcileStatus progress.ReconcileStatus) { log := ctrl.LoggerFrom(ctx) + // Get the resync period from the object's managed options + resyncPeriod := objAdapter.GetManagedOptions().GetResyncPeriod() + // We do this here rather than in a predicate because predicates only cover // a single watch. Doing it here means we cover all sources of // reconciliation, including our dependencies. - if !shouldReconcile(objAdapter.GetObject()) { + if !shouldReconcile(objAdapter.GetObject(), resyncPeriod) { log.V(logging.Verbose).Info("Status is up to date: not reconciling") return reconcileStatus } @@ -229,6 +244,15 @@ func (c *Controller[ } } + // If periodic resync is enabled and we're not already rescheduling for + // another reason, schedule the next resync to detect drift. + if resyncPeriod > 0 { + needsReschedule, _ := reconcileStatus.NeedsReschedule() + if !needsReschedule { + reconcileStatus = reconcileStatus.WithRequeue(resyncPeriod) + } + } + return reconcileStatus } diff --git a/pkg/clients/applyconfiguration/api/v1alpha1/managedoptions.go b/pkg/clients/applyconfiguration/api/v1alpha1/managedoptions.go index 092ab7883..0c64d1b0d 100644 --- a/pkg/clients/applyconfiguration/api/v1alpha1/managedoptions.go +++ b/pkg/clients/applyconfiguration/api/v1alpha1/managedoptions.go @@ -20,12 +20,14 @@ package v1alpha1 import ( apiv1alpha1 "github.com/k-orc/openstack-resource-controller/v2/api/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) // ManagedOptionsApplyConfiguration represents a declarative configuration of the ManagedOptions type for use // with apply. type ManagedOptionsApplyConfiguration struct { - OnDelete *apiv1alpha1.OnDelete `json:"onDelete,omitempty"` + OnDelete *apiv1alpha1.OnDelete `json:"onDelete,omitempty"` + ResyncPeriod *v1.Duration `json:"resyncPeriod,omitempty"` } // ManagedOptionsApplyConfiguration constructs a declarative configuration of the ManagedOptions type for use with @@ -41,3 +43,11 @@ func (b *ManagedOptionsApplyConfiguration) WithOnDelete(value apiv1alpha1.OnDele b.OnDelete = &value return b } + +// WithResyncPeriod sets the ResyncPeriod field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ResyncPeriod field is set to the value of the last call. +func (b *ManagedOptionsApplyConfiguration) WithResyncPeriod(value v1.Duration) *ManagedOptionsApplyConfiguration { + b.ResyncPeriod = &value + return b +} diff --git a/pkg/clients/applyconfiguration/internal/internal.go b/pkg/clients/applyconfiguration/internal/internal.go index dfb83d47f..482c725bd 100644 --- a/pkg/clients/applyconfiguration/internal/internal.go +++ b/pkg/clients/applyconfiguration/internal/internal.go @@ -1004,6 +1004,9 @@ var schemaYAML = typed.YAMLObject(`types: - name: onDelete type: scalar: string + - name: resyncPeriod + type: + namedType: io.k8s.apimachinery.pkg.apis.meta.v1.Duration - name: com.github.k-orc.openstack-resource-controller.v2.api.v1alpha1.Network map: fields: @@ -3289,6 +3292,8 @@ var schemaYAML = typed.YAMLObject(`types: type: scalar: string default: "" +- name: io.k8s.apimachinery.pkg.apis.meta.v1.Duration + scalar: string - name: io.k8s.apimachinery.pkg.apis.meta.v1.FieldsV1 map: elementType: diff --git a/website/docs/crd-reference.md b/website/docs/crd-reference.md index f1664a185..32416f124 100644 --- a/website/docs/crd-reference.md +++ b/website/docs/crd-reference.md @@ -1698,6 +1698,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `onDelete` _[OnDelete](#ondelete)_ | onDelete specifies the behaviour of the controller when the ORC
object is deleted. Options are `delete` - delete the OpenStack resource;
`detach` - do not delete the OpenStack resource. If not specified, the
default is `delete`. | delete | Enum: [delete detach]
| +| `resyncPeriod` _[Duration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.29/#duration-v1-meta)_ | resyncPeriod specifies the interval after which a successfully
reconciled resource will be reconciled again to detect drift from the
desired state. Set to 0 to disable periodic resync. If not specified,
the default is 10 hours. | 10h | | #### ManagementPolicy diff --git a/website/docs/user-guide/index.md b/website/docs/user-guide/index.md index 4945fcdef..741a771c3 100644 --- a/website/docs/user-guide/index.md +++ b/website/docs/user-guide/index.md @@ -100,6 +100,47 @@ spec: # ... ``` +### Drift Detection + +ORC periodically reconciles managed resources to detect and correct drift from the desired state. This ensures that if someone modifies an OpenStack resource outside of ORC (e.g., through the OpenStack CLI or dashboard), ORC will detect the change and restore the resource to match the Kubernetes specification. + +**By default, drift detection is enabled with a resync period of 10 hours (`10h`).** The `managedOptions.resyncPeriod` field controls how often ORC checks for drift using standard duration format (e.g., `1h`, `30m`, `24h`): + +| Value | Description | +|-------|-------------| +| `10h` | Check for drift every 10 hours. This is the default. | +| `1h` | Check for drift every hour. | +| `30m` | Check for drift every 30 minutes. | +| `0` | Disable periodic drift detection. Resources are only reconciled when their spec changes. | + +```yaml +spec: + managementPolicy: managed + managedOptions: + resyncPeriod: 1h # Check for drift every hour + resource: + # ... +``` + +!!! note + + Drift detection only applies to managed resources. Unmanaged resources are never modified by ORC, so drift detection does not apply to them. + +!!! tip + + For resources that are frequently modified outside of ORC, consider using a shorter resync period. For stable resources, the default of 10 hours is usually sufficient. + +!!! warning + + Be aware of the side effects of drift detection, especially with a low `resyncPeriod`: + + - **Increased OpenStack API load**: Each drift detection cycle queries the OpenStack API. A low resyncPeriod across many resources can generate significant API traffic and may trigger rate limiting. + - **Controller resource consumption**: Frequent reconciliation increases CPU and memory usage on the ORC controller. + - **Potential conflicts**: If resources are actively being modified by external systems (other controllers, automation scripts, or manual operations), frequent drift correction can cause conflicts or unexpected behavior. + - **Network overhead**: Each reconciliation involves network calls to OpenStack, which adds latency and bandwidth usage. + + Consider your environment's scale and requirements when configuring resyncPeriod. For most use cases, the default of 10 hours provides a good balance between drift detection and resource efficiency. + ### Resource References ORC resources reference each other using `*Ref` fields. These references: From 52be701f0a4df5ef9c3f12e66e09ae1522f51f27 Mon Sep 17 00:00:00 2001 From: eshulman2 Date: Wed, 28 Jan 2026 19:40:03 +0200 Subject: [PATCH 2/3] Recreate resource on external deletion Recreate resource in case it was deleted by something external to ORC. this solves the issue when deleting with external but does raise concerns I am afraid that in case of split brain or other edge case the resource will be created over and over causing a catastrophic failure --- .../generic/reconciler/controller.go | 6 ++++-- .../generic/reconciler/resource_actions.go | 17 +++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/internal/controllers/generic/reconciler/controller.go b/internal/controllers/generic/reconciler/controller.go index 793585444..56ca8512f 100644 --- a/internal/controllers/generic/reconciler/controller.go +++ b/internal/controllers/generic/reconciler/controller.go @@ -219,8 +219,10 @@ func (c *Controller[ return reconcileStatus.WithError(fmt.Errorf("oResource is not set, but no wait events or error")) } - if objAdapter.GetStatusID() == nil { - resourceID := actuator.GetResourceID(osResource) + resourceID := actuator.GetResourceID(osResource) + statusID := objAdapter.GetStatusID() + if statusID == nil || *statusID != resourceID { + // Update status ID if not set, or if it differs (e.g., after recreation due to drift detection) if err := status.SetStatusID(ctx, c, objAdapter.GetObject(), resourceID, c.statusWriter); err != nil { return reconcileStatus.WithError(err) } diff --git a/internal/controllers/generic/reconciler/resource_actions.go b/internal/controllers/generic/reconciler/resource_actions.go index 49f4598b1..6e1092843 100644 --- a/internal/controllers/generic/reconciler/resource_actions.go +++ b/internal/controllers/generic/reconciler/resource_actions.go @@ -70,17 +70,26 @@ func GetOrCreateOSResource[ osResource, reconcileStatus := actuator.GetOSResourceByID(ctx, *resourceID) if needsReschedule, err := reconcileStatus.NeedsReschedule(); needsReschedule { if orcerrors.IsNotFound(err) { - // An OpenStack resource we previously referenced has been deleted unexpectedly. We can't recover from this. - return osResource, progress.WrapError( - orcerrors.Terminal(orcv1alpha1.ConditionReasonUnrecoverableError, "resource has been deleted from OpenStack")) + // An OpenStack resource we previously referenced has been deleted unexpectedly. + // For managed resources, we can recover by recreating the resource. + // For imported resources, this is an unrecoverable error. + if objAdapter.GetManagementPolicy() == orcv1alpha1.ManagementPolicyManaged && objAdapter.GetImportID() == nil && objAdapter.GetImportFilter() == nil { + log.V(logging.Info).Info("Resource has been deleted from OpenStack, will recreate", "ID", *resourceID) + // Fall through to creation by not returning here. + // The status ID will be updated after the new resource is created. + } else { + return osResource, progress.WrapError( + orcerrors.Terminal(orcv1alpha1.ConditionReasonUnrecoverableError, "resource has been deleted from OpenStack")) + } } else { return osResource, reconcileStatus } } if osResource != nil { log.V(logging.Verbose).Info("Got existing OpenStack resource", "ID", actuator.GetResourceID(osResource)) + return osResource, nil } - return osResource, nil + // osResource is nil, fall through to creation } // Import by ID From 7d192625b76c4ec803d20ab7d803f2675690ee6a Mon Sep 17 00:00:00 2001 From: eshulman2 Date: Mon, 2 Feb 2026 15:45:30 +0200 Subject: [PATCH 3/3] KEP --- .../development/proposals/drift-detection.md | 431 ++++++++++++++++++ website/mkdocs.yml | 2 + 2 files changed, 433 insertions(+) create mode 100644 website/docs/development/proposals/drift-detection.md diff --git a/website/docs/development/proposals/drift-detection.md b/website/docs/development/proposals/drift-detection.md new file mode 100644 index 000000000..b04479516 --- /dev/null +++ b/website/docs/development/proposals/drift-detection.md @@ -0,0 +1,431 @@ +# OEP-001: Drift Detection and Automatic Reconciliation + +## Table of Contents + +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories](#user-stories) + - [Design Details](#design-details) + - [Periodic Resync Mechanism](#periodic-resync-mechanism) + - [Configuration Hierarchy](#configuration-hierarchy) + - [Resource Recreation on External Deletion](#resource-recreation-on-external-deletion) + - [Field Coverage](#field-coverage) + - [API Changes](#api-changes) +- [Risks and Mitigations](#risks-and-mitigations) + - [Split-Brain Scenarios](#split-brain-scenarios) + - [API Rate Limiting](#api-rate-limiting) + - [Controller Resource Consumption](#controller-resource-consumption) + - [Conflicts with External Systems](#conflicts-with-external-systems) +- [Implementation](#implementation) + - [Phase 1: Per-Resource Configuration (Current)](#phase-1-per-resource-configuration-current) + - [Phase 2: Resource-Type Level Configuration](#phase-2-resource-type-level-configuration) + - [Phase 3: ORC-Wide Configuration](#phase-3-orc-wide-configuration) +- [Test Plan](#test-plan) +- [Graduation Criteria](#graduation-criteria) +- [Production Readiness Checklist](#production-readiness-checklist) +- [Alternatives Considered](#alternatives-considered) +- [References](#references) + +## Summary + +This proposal introduces drift detection and automatic reconciliation for ORC managed resources. The feature enables ORC to periodically check OpenStack resources for changes made outside of ORC (via CLI, dashboard, or other tools) and automatically restore them to match the desired state defined in the Kubernetes specification. + +Additionally, managed resources that are deleted externally from OpenStack will be automatically recreated by ORC, ensuring the declared state is maintained. + +## Motivation + +In production environments, OpenStack resources may be modified outside of ORC through various means: + +- Direct OpenStack CLI/SDK operations +- OpenStack Horizon dashboard +- Other automation tools or controllers +- Manual emergency interventions +- Third-party integrations + +Without drift detection, these changes go unnoticed until they cause issues, leading to configuration drift between the declared Kubernetes state and the actual OpenStack state. This undermines the declarative model that ORC provides. + +Similar Kubernetes controllers for cloud resources have implemented drift detection: + +- **AWS Controllers for Kubernetes (ACK)**: Uses a 10-hour default resync period for drift recovery +- **Azure Service Operator (ASO)**: Uses a 1-hour default resync period with configurable intervals + +### Goals + +1. **Ensure state consistency**: Managed resources in OpenStack should match the desired state declared in Kubernetes +2. **Detect external modifications**: Identify when OpenStack resources are modified outside of ORC +3. **Automatic correction**: Restore drifted resources to their desired state without manual intervention +4. **Resource recreation**: Recreate managed resources that are deleted externally from OpenStack +5. **Configurable frequency**: Allow operators to tune the resync interval based on their requirements +6. **Hierarchical configuration**: Support configuration at ORC-wide, resource-type, and per-resource levels +7. **Minimal API impact**: Avoid excessive OpenStack API calls that could trigger rate limiting + +### Non-Goals + +1. **Real-time drift detection**: Event-driven detection of changes (this would require OpenStack webhooks or polling at very short intervals) +2. **Drift reporting without correction**: Alerting on drift without taking corrective action (future enhancement) +3. **Selective field reconciliation**: Allowing some fields to drift while correcting others +4. **Conflict resolution with merge semantics**: Merging external changes with desired state +5. **Drift detection for unmanaged resources**: Unmanaged resources are explicitly not modified by ORC + +## Proposal + +### User Stories + +#### Story 1: Automatic State Restoration + +As a platform operator, I want ORC to automatically restore OpenStack resources to their declared state when someone modifies them through the OpenStack CLI, so that my infrastructure remains consistent with my GitOps repository. + +#### Story 2: Resource Recreation + +As a developer, I want ORC to automatically recreate a managed network if it's accidentally deleted from OpenStack, so that my applications don't experience prolonged outages. + +#### Story 3: Configurable Resync Frequency + +As a platform operator managing critical resources, I want to configure a shorter resync period for high-priority resources while using a longer period for stable infrastructure, so that I can balance between drift detection responsiveness and API usage. + +#### Story 4: Disable Drift Detection + +As a platform operator, I want to disable periodic resync for specific resources that are frequently modified by external systems (with full awareness of the implications), so that ORC doesn't constantly revert expected changes. + +### Design Details + +#### Periodic Resync Mechanism + +The drift detection mechanism works by periodically triggering a full reconciliation of managed resources: + +1. **Trigger**: After a resource reaches a stable state (Progressing=False), ORC schedules a resync after `resyncPeriod` duration +2. **Fetch**: On resync, ORC fetches the current state of the OpenStack resource +3. **Compare**: The current state is compared against the desired state in the Kubernetes spec +4. **Update**: If drift is detected, ORC updates the OpenStack resource to match the desired state +5. **Reschedule**: After successful reconciliation, the next resync is scheduled + +The resync check is implemented in the `shouldReconcile` function: + +```go +func shouldReconcile(obj orcv1alpha1.ObjectWithConditions, resyncPeriod time.Duration) bool { + progressing := meta.FindStatusCondition(obj.GetConditions(), orcv1alpha1.ConditionProgressing) + if progressing == nil { + return true + } + if progressing.Status == metav1.ConditionTrue { + return true + } + if progressing.ObservedGeneration != obj.GetGeneration() { + return true + } + // For periodic resync, check if enough time has passed since the last sync + if resyncPeriod > 0 { + return time.Since(progressing.LastTransitionTime.Time) >= resyncPeriod + } + return false +} +``` + +#### Configuration Hierarchy + +Drift detection supports a three-level configuration hierarchy, with more specific configurations taking precedence: + +| Level | Scope | Configuration Location | Precedence | +|-------|-------|----------------------|------------| +| ORC-wide | All resources across all types | Controller deployment configuration | Lowest | +| Resource-type | All resources of a specific type (e.g., all Networks) | CRD-level configuration or controller flags | Medium | +| Per-resource | Individual resource instance | `spec.managedOptions.resyncPeriod` on the CR | Highest | + +**Resolution order**: Per-resource → Resource-type → ORC-wide → Default (10h) + +Example per-resource configuration: + +```yaml +apiVersion: openstack.k-orc.cloud/v1alpha1 +kind: Network +metadata: + name: critical-network +spec: + cloudCredentialsRef: + secretName: openstack-clouds + cloudName: openstack + managementPolicy: managed + managedOptions: + resyncPeriod: 1h # Check for drift every hour + resource: + description: Critical application network +``` + +**Default value**: 10 hours (`10h`) + +This default was chosen to balance between: +- Detecting drift in a reasonable timeframe +- Avoiding excessive OpenStack API load +- Aligning with similar controllers (ACK uses 10h default) + +**Disable resync**: Set `resyncPeriod: 0` to disable periodic drift detection for a resource. + +#### Resource Recreation on External Deletion + +When a managed resource is deleted from OpenStack but the ORC object still exists: + +1. **Detection**: On the next reconciliation (triggered by resync or spec change), ORC attempts to fetch the resource by the ID stored in `status.id` +2. **Not Found**: If the resource is not found and the ORC object: + - Has `managementPolicy: managed` + - Was NOT imported by ID or filter (i.e., was originally created by ORC) +3. **Recreation**: ORC creates a new OpenStack resource matching the desired spec +4. **Status Update**: The new resource ID is stored in `status.id` + +For **imported resources** that are deleted externally, this is a terminal error because: +- The resource was not created by ORC +- Recreating it would not restore the original resource +- The import reference (ID or filter) would become invalid + +```go +if objAdapter.GetManagementPolicy() == orcv1alpha1.ManagementPolicyManaged && + objAdapter.GetImportID() == nil && + objAdapter.GetImportFilter() == nil { + log.V(logging.Info).Info("Resource has been deleted from OpenStack, will recreate", "ID", *resourceID) + // Fall through to creation +} else { + return osResource, progress.WrapError( + orcerrors.Terminal(orcv1alpha1.ConditionReasonUnrecoverableError, + "resource has been deleted from OpenStack")) +} +``` + +#### Field Coverage + +Drift detection covers all **mutable fields** that ORC actuators implement update operations for. This includes: + +- Resource names and descriptions +- Tags and metadata +- Network/subnet configuration (where mutable) +- Security group rules +- Other resource-specific mutable attributes + +**Immutable fields** are not subject to drift correction because: +- OpenStack does not allow modifying them +- Changing them would require resource recreation (which is a separate consideration) + +**Implementation requirement**: Before this feature graduates to stable, all actuator implementations must be audited to ensure they cover all mutable fields in their update reconcilers. + +### API Changes + +The `ManagedOptions` struct is extended with a `resyncPeriod` field: + +```go +type ManagedOptions struct { + // onDelete specifies the behaviour of the controller when the ORC + // object is deleted. Options are `delete` - delete the OpenStack resource; + // `detach` - do not delete the OpenStack resource. If not specified, the + // default is `delete`. + // +kubebuilder:default:=delete + // +optional + OnDelete OnDelete `json:"onDelete,omitempty"` + + // resyncPeriod specifies the interval after which a successfully + // reconciled resource will be reconciled again to detect drift from the + // desired state. Set to 0 to disable periodic resync. If not specified, + // the default is 10 hours. + // +kubebuilder:default:="10h" + // +optional + ResyncPeriod *metav1.Duration `json:"resyncPeriod,omitempty"` +} +``` + +## Risks and Mitigations + +### Split-Brain Scenarios + +**Risk**: Multiple controllers or systems may be managing the same OpenStack resources, leading to conflicts where changes are repeatedly overwritten. + +**Mitigations**: + +1. **Retry with backoff**: Implement exponential backoff when update conflicts are detected +2. **Conflict detection**: Detect when a resource is being modified rapidly and log warnings +3. **Documentation**: Clearly document that ORC should be the sole manager of resources it creates +4. **Eventual consistency**: Accept that in conflict scenarios, the most recent writer wins, and ORC will eventually restore desired state +5. **Condition reporting**: Report conflicts in resource conditions for observability + +### API Rate Limiting + +**Risk**: Frequent resync across many resources could trigger OpenStack API rate limiting. + +**Mitigations**: + +1. **Conservative default**: 10-hour default resync period minimizes API calls +2. **Jittering**: Add random jitter to resync times to avoid thundering herd +3. **Batching**: Consider batching status checks where OpenStack APIs support it +4. **Configurable per-resource**: Allow operators to disable or lengthen resync for stable resources +5. **Resource-type configuration**: Allow operators to set defaults per resource type + +### Controller Resource Consumption + +**Risk**: Frequent reconciliation increases CPU and memory usage on the ORC controller. + +**Mitigations**: + +1. **Hash-based comparison**: Before triggering a full reconciliation, compute a hash of the OpenStack resource state (using deterministic sorted JSON serialization) and compare it against the previous hash stored in `status.observedStateHash`. Only proceed with update operations if the hashes differ. This avoids expensive field-by-field comparison and update API calls when no drift has occurred. + + ```yaml + status: + id: "12345678-1234-1234-1234-123456789abc" + observedStateHash: "sha256:a1b2c3d4..." # Hash of last observed OpenStack state + ``` + +2. **Conservative default**: 10-hour default limits reconciliation frequency +3. **Monitoring**: Expose metrics for reconciliation frequency and duration +4. **Horizontal scaling**: ORC already supports running multiple controller replicas with leader election (`--leader-elect` flag), distributing controller load across nodes + +### Conflicts with External Systems + +**Risk**: If resources are intentionally managed by external systems (e.g., autoscalers, other controllers), drift correction can cause unexpected behavior. + +**Mitigations**: + +1. **Disable option**: Allow `resyncPeriod: 0` to disable drift detection +2. **Unmanaged policy**: Use `managementPolicy: unmanaged` for externally managed resources +3. **Documentation**: Clearly document the implications of drift detection +4. **Warning in user guide**: Warn about side effects of low resyncPeriod values + +## Implementation + +### Phase 1: Per-Resource Configuration (Current) + +- [x] Add `resyncPeriod` field to `ManagedOptions` +- [x] Implement `shouldReconcile` function with resync check +- [x] Schedule resync after successful reconciliation +- [x] Recreate managed resources deleted externally +- [x] Terminal error for imported resources deleted externally +- [x] Documentation in user guide +- [ ] Audit all actuators for mutable field coverage +- [ ] Add jitter to resync scheduling +- [ ] Implement hash-based state comparison with `status.observedStateHash` + +### Phase 2: Resource-Type Level Configuration + +- [ ] Design resource-type configuration mechanism (CRD annotations, controller flags, or ConfigMap) +- [ ] Implement configuration inheritance (per-resource overrides resource-type) +- [ ] Add validation for configuration values +- [ ] Update documentation + +### Phase 3: ORC-Wide Configuration + +- [ ] Design ORC-wide configuration mechanism (controller deployment flags or ConfigMap) +- [ ] Implement full configuration hierarchy resolution +- [ ] Add configuration reload without controller restart (if ConfigMap-based) +- [ ] Update documentation + +## Test Plan + +### Unit Tests + +- [ ] `shouldReconcile` returns true when resync period has elapsed +- [ ] `shouldReconcile` returns false before resync period has elapsed +- [ ] `shouldReconcile` respects `resyncPeriod: 0` (disabled) +- [ ] `GetResyncPeriod` returns default when not specified +- [ ] Configuration hierarchy resolution + +### Integration Tests + +- [ ] Resource is reconciled after resync period elapses +- [ ] Resource drift is corrected on resync +- [ ] Deleted managed resource is recreated +- [ ] Imported resource deletion results in terminal error +- [ ] Resync scheduling includes jitter + +### E2E Tests + +- [ ] Modify OpenStack resource via CLI, verify ORC corrects drift +- [ ] Delete OpenStack resource, verify ORC recreates it +- [ ] Verify resync period is honored (with shortened test value) + +## Graduation Criteria + +This feature should graduate alongside other ORC APIs: + +### Alpha (Current) + +- Feature implemented behind default configuration +- Basic documentation available +- Unit and integration tests passing + +### Beta + +- All actuators audited for mutable field coverage +- Resource-type level configuration implemented +- E2E tests covering main scenarios +- User feedback incorporated +- Jitter implementation complete + +### Stable + +- ORC-wide configuration implemented +- Full documentation including best practices +- Observability improvements (metrics, if beneficial) +- Production usage validated +- No outstanding drift detection issues + +## Production Readiness Checklist + +- [ ] All mutable fields covered by actuator update reconcilers +- [ ] Hash-based state comparison implemented to minimize unnecessary updates +- [ ] Jitter added to prevent thundering herd +- [ ] Documentation includes guidance on resyncPeriod selection +- [ ] Warnings documented for low resyncPeriod values +- [ ] Resource-type and ORC-wide configuration available +- [ ] Conflict detection and logging implemented + +## Alternatives Considered + +### Event-Driven Drift Detection + +**Description**: Use OpenStack notifications (Oslo messaging) to detect changes in real-time. + +**Pros**: +- Immediate drift detection +- No polling overhead + +**Cons**: +- Requires OpenStack notification infrastructure +- Complex to implement and maintain +- Not all OpenStack deployments have notifications enabled +- Additional infrastructure dependency + +**Decision**: Not pursued for initial implementation. May be considered as future enhancement. + +### Drift Detection Without Correction + +**Description**: Detect and report drift without automatically correcting it. + +**Pros**: +- Safer for mixed-management scenarios +- Allows human review before changes + +**Cons**: +- Requires additional alerting/monitoring setup +- Delays remediation +- Adds operational burden + +**Decision**: Not included in initial implementation. Could be added as a separate management policy option in the future. + +### Watch-Based Detection + +**Description**: Implement a watcher that periodically lists all resources from OpenStack and compares. + +**Pros**: +- Single API call can check multiple resources +- Potentially more efficient for large numbers of resources + +**Cons**: +- List operations can be expensive +- Harder to implement with proper filtering +- Different resource types require different list calls + +**Decision**: Not pursued. Per-resource reconciliation is simpler and integrates naturally with controller-runtime. + +## References + +- [AWS Controllers for Kubernetes - Drift Recovery](https://aws-controllers-k8s.github.io/community/docs/user-docs/drift-recovery/) +- [Azure Service Operator - ADR-2022-11: Resource Reconciliation](https://azure.github.io/azure-service-operator/design/ADR-2022-11-Reconcile-Interval/) +- [Kubernetes Enhancement Proposal Template](https://github.com/kubernetes/enhancements/tree/master/keps/NNNN-kep-template) +- ORC User Guide: [Drift Detection](../../../user-guide/index.md#drift-detection) diff --git a/website/mkdocs.yml b/website/mkdocs.yml index e71fbe9e9..7970b9da5 100644 --- a/website/mkdocs.yml +++ b/website/mkdocs.yml @@ -26,6 +26,8 @@ nav: - Coding Standards: development/coding-standards.md - Interfaces: development/godoc/generic-interfaces.md - ReconcileStatus: development/godoc/reconcile-status.md + - Design Proposals: + - Drift Detection: development/proposals/drift-detection.md - Changelog: changelog.md repo_url: https://github.com/k-orc/openstack-resource-controller extra: