Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 1 addition & 21 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -579,27 +579,7 @@ func main() {
if slices.Contains(mainConfig.EnabledControllers, "failover-reservations-controller") {
setupLog.Info("enabling controller", "controller", "failover-reservations-controller")
failoverConfig := conf.GetConfigOrDie[failover.FailoverConfig]()

// Apply defaults for unset values
defaults := failover.DefaultConfig()
if failoverConfig.DatasourceName == "" {
failoverConfig.DatasourceName = defaults.DatasourceName
}
if failoverConfig.SchedulerURL == "" {
failoverConfig.SchedulerURL = defaults.SchedulerURL
}
if failoverConfig.ReconcileInterval == 0 {
failoverConfig.ReconcileInterval = defaults.ReconcileInterval
}
if failoverConfig.Creator == "" {
failoverConfig.Creator = defaults.Creator
}
if failoverConfig.FlavorFailoverRequirements == nil {
failoverConfig.FlavorFailoverRequirements = defaults.FlavorFailoverRequirements
}
if failoverConfig.RevalidationInterval == 0 {
failoverConfig.RevalidationInterval = defaults.RevalidationInterval
}
failoverConfig.ApplyDefaults()

// DatasourceName is still required - check after applying defaults
if failoverConfig.DatasourceName == "" {
Expand Down
25 changes: 13 additions & 12 deletions helm/bundles/cortex-nova/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -175,22 +175,23 @@ cortex-scheduling-controllers:
# Maps flavor name patterns (glob) to required failover count
# Example: {"hana_*": 2, "m1.xlarge": 1}
flavorFailoverRequirements:
"*": 2
"*": 1
# How often to check for missing failover reservations (periodic bulk reconciliation)
# 35s = 35000000000 nanoseconds
reconcileInterval: 35000000000
# Used when maxVMsToProcess limits processing, allows faster catch-up
# 100ms = 100000000 nanoseconds
shortReconcileInterval: 100000000
# Tag for failover reservations (for identification and cleanup)
creator: cortex-failover-controller
# Limits VMs processed per cycle. Set to 0 to process all VMs.
maxVMsToProcess: 25
reconcileInterval: 15m
# Used when maxVMsToProcess limits processing, allows faster catch-up and for the first reconcile
shortReconcileInterval: 1m
# Maximum number of VMs to process in one periodic reconciliation loop
maxVMsToProcess: 5
# Minimum successful reservations to use short interval
minSuccessForShortInterval: 1
# Maximum failures allowed to still use short interval
maxFailuresForShortInterval: 99
# If true, uses hypervisor CRD as source of truth for VM location instead of postgres
trustHypervisorLocation: true
# How often to re-validate acknowledged failover reservations
# 30m = 1800000000000 nanoseconds
revalidationInterval: 1800000000000
revalidationInterval: 30m
# Prevents creating multiple new reservations on the same hypervisor per cycle
limitOneNewReservationPerHypervisor: false

cortex-knowledge-controllers:
<<: *cortex
Expand Down
101 changes: 88 additions & 13 deletions internal/scheduling/reservations/failover/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@

package failover

import "time"
import (
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// FailoverConfig defines the configuration for failover reservation management.
type FailoverConfig struct {
Expand All @@ -13,7 +17,8 @@ type FailoverConfig struct {
FlavorFailoverRequirements map[string]int `json:"flavorFailoverRequirements"`

// ReconcileInterval is how often to check for missing failover reservations.
ReconcileInterval time.Duration `json:"reconcileInterval"`
// Supports Go duration strings like "30s", "1m", "15m".
ReconcileInterval metav1.Duration `json:"reconcileInterval"`

// Creator tag for failover reservations (for identification and cleanup).
Creator string `json:"creator"`
Expand All @@ -27,14 +32,23 @@ type FailoverConfig struct {
SchedulerURL string `json:"schedulerURL"`

// MaxVMsToProcess limits the number of VMs to process per reconciliation cycle.
// Set to 0 or negative to process all VMs (default behavior).
// Set to a negative value to process all VMs. A value of 0 counts as unset:
// ApplyDefaults replaces it with the default from DefaultConfig (30).
// Useful for debugging and testing with large VM counts.
MaxVMsToProcess int `json:"maxVMsToProcess"`

// ShortReconcileInterval is used when MaxVMsToProcess limits processing.
// This allows faster catch-up when there are more VMs to process.
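// Set to 0 to use ReconcileInterval.
// NOTE(review): ApplyDefaults replaces a zero value with the default (100ms),
// so a configured 0 may never take effect — confirm the intended behavior.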
ShortReconcileInterval time.Duration `json:"shortReconcileInterval"`
// Supports Go duration strings like "100ms", "1s", "1m".
ShortReconcileInterval metav1.Duration `json:"shortReconcileInterval"`

// MinSuccessForShortInterval is the minimum number of successful reservations (created + reused)
// required to use ShortReconcileInterval. Default: 1. Use 0 to require no minimum.
MinSuccessForShortInterval *int `json:"minSuccessForShortInterval"`

// MaxFailuresForShortInterval is the maximum number of failures allowed to still use
// ShortReconcileInterval. Default: 99. Use 0 to allow no failures.
MaxFailuresForShortInterval *int `json:"maxFailuresForShortInterval"`

// TrustHypervisorLocation when true, uses the hypervisor CRD as the source of truth
// for VM location instead of postgres (OSEXTSRVATTRHost). This is useful when there
Expand All @@ -49,19 +63,80 @@ type FailoverConfig struct {
// After a reservation is acknowledged, it will be re-validated after this interval
// to ensure the reservation host is still valid for all allocated VMs.
// Default: 30 minutes
RevalidationInterval time.Duration `json:"revalidationInterval"`
// Supports Go duration strings like "15m", "30m", "1h".
RevalidationInterval metav1.Duration `json:"revalidationInterval"`

// LimitOneNewReservationPerHypervisor when true, prevents creating multiple new
// reservations on the same hypervisor within a single reconcile cycle.
// This helps spread reservations across hypervisors.
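// Default: true in DefaultConfig. ApplyDefaults does not touch this field, so
// the value decoded from the configuration is used as-is (presumably false
// when the key is omitted — TODO confirm).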
LimitOneNewReservationPerHypervisor bool `json:"limitOneNewReservationPerHypervisor"`

// VMSelectionRotationInterval controls how often the VM selection offset rotates
// when MaxVMsToProcess limits processing. Every N reconcile cycles, the offset
// rotates to process different VMs. This ensures all VMs eventually get processed.
// Default: 4 (rotate every 4th reconcile cycle). Use 0 to disable rotation.
VMSelectionRotationInterval *int `json:"vmSelectionRotationInterval"`
}

// intPtr allocates a fresh int holding i and returns its address.
// Every call yields a distinct pointer.
func intPtr(i int) *int {
	p := new(int)
	*p = i
	return p
}

// ApplyDefaults overwrites every unset setting on c with the matching value
// from DefaultConfig and leaves settings that already have a value alone.
//
// "Unset" means the zero value: an empty string, a zero duration, a nil map,
// a nil *int, or — for MaxVMsToProcess — the integer 0.
// TrustHypervisorLocation and LimitOneNewReservationPerHypervisor are not
// modified here.
func (c *FailoverConfig) ApplyDefaults() {
	d := DefaultConfig()

	// Small per-type helpers: assign the fallback only when the target is unset.
	orString := func(dst *string, fallback string) {
		if *dst == "" {
			*dst = fallback
		}
	}
	// metav1.Duration wraps a single time.Duration, so copying that field
	// copies the whole value.
	orDuration := func(dst *time.Duration, fallback time.Duration) {
		if *dst == 0 {
			*dst = fallback
		}
	}
	orIntPtr := func(dst **int, fallback *int) {
		if *dst == nil {
			*dst = fallback
		}
	}

	// Strings.
	orString(&c.DatasourceName, d.DatasourceName)
	orString(&c.SchedulerURL, d.SchedulerURL)
	orString(&c.Creator, d.Creator)

	// Durations.
	orDuration(&c.ReconcileInterval.Duration, d.ReconcileInterval.Duration)
	orDuration(&c.ShortReconcileInterval.Duration, d.ShortReconcileInterval.Duration)
	orDuration(&c.RevalidationInterval.Duration, d.RevalidationInterval.Duration)

	// Optional integers, where nil means "not configured".
	orIntPtr(&c.MinSuccessForShortInterval, d.MinSuccessForShortInterval)
	orIntPtr(&c.MaxFailuresForShortInterval, d.MaxFailuresForShortInterval)
	orIntPtr(&c.VMSelectionRotationInterval, d.VMSelectionRotationInterval)

	// Remaining one-off fields.
	if c.FlavorFailoverRequirements == nil {
		c.FlavorFailoverRequirements = d.FlavorFailoverRequirements
	}
	if c.MaxVMsToProcess == 0 {
		c.MaxVMsToProcess = d.MaxVMsToProcess
	}
}

// DefaultConfig returns a FailoverConfig populated with the built-in defaults.
//
// Each call builds a new value, including a new requirements map and new int
// pointers, so the result does not share state with earlier calls.
// Every field appears exactly once; duration fields are metav1.Duration values.
func DefaultConfig() FailoverConfig {
	return FailoverConfig{
		FlavorFailoverRequirements:          map[string]int{"*": 2}, // by default all VMs get 2 failover reservations
		ReconcileInterval:                   metav1.Duration{Duration: 30 * time.Second},
		ShortReconcileInterval:              metav1.Duration{Duration: 100 * time.Millisecond},
		MinSuccessForShortInterval:          intPtr(1),
		MaxFailuresForShortInterval:         intPtr(99),
		MaxVMsToProcess:                     30,
		Creator:                             "cortex-failover-controller",
		DatasourceName:                      "nova-servers", // we have the server and flavor data source (both store in same postgres and same secret but still)
		SchedulerURL:                        "http://localhost:8080/scheduler/nova/external",
		TrustHypervisorLocation:             false,
		RevalidationInterval:                metav1.Duration{Duration: 30 * time.Minute},
		LimitOneNewReservationPerHypervisor: true,
		VMSelectionRotationInterval:         intPtr(4),
	}
}
6 changes: 6 additions & 0 deletions internal/scheduling/reservations/failover/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@ import (

"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
"github.com/go-logr/logr"
"github.com/google/uuid"
)

// WithNewGlobalRequestID creates a new context with a failover-prefixed global request ID.
func WithNewGlobalRequestID(ctx context.Context) context.Context {
return reservations.WithGlobalRequestID(ctx, "failover-"+uuid.New().String())
}

// LoggerFromContext returns a logger with greq and req values from the context.
// This creates a child logger with the request tracking values pre-attached,
// so you don't need to repeat them in every log call.
Expand Down
Loading
Loading