From 09463c332b28a3c1c674d52f83952eabe59b30e2 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 13:36:31 +0100 Subject: [PATCH 1/6] fix cr controller race condition --- .../commitments/api_change_commitments.go | 22 ++++++--- .../api_change_commitments_test.go | 33 +++++++++---- .../reservations/commitments/controller.go | 48 ++++++++----------- .../commitments/controller_test.go | 30 +++++++++--- 4 files changed, 81 insertions(+), 52 deletions(-) diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 1ef003e30..e8e2ec172 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -376,18 +376,26 @@ func watchReservationsUntilReady( switch readyCond.Status { case metav1.ConditionTrue: - // check if host is not set in spec or status: if so, no capacity left to schedule the reservation + // Only consider truly ready if Status.Host is populated + // This handles the two-phase reconcile pattern where Ready=True + // is set atomically with Status.Host in the controller if current.Spec.TargetHost == "" || current.Status.Host == "" { allAreReady = false - failedReservations = append(failedReservations, current) - logger.Info("insufficient capacity for reservation", "reservation", current.Name, "reason", readyCond.Reason, "message", readyCond.Message, "targetHostInSpec", current.Spec.TargetHost, "hostInStatus", current.Status.Host) - } else { - // Reservation is successfully scheduled, no further action needed - logger.Info("reservation ready", "reservation", current.Name, "host", current.Spec.TargetHost) + stillWaiting = append(stillWaiting, res) + continue } + // Reservation is successfully scheduled + logger.Info("reservation ready", "reservation", current.Name, "host", current.Status.Host) case metav1.ConditionFalse: - failedReservations = append(failedReservations, res) + // Check the reason - NoHostsFound means no capacity + if readyCond.Reason == "NoHostsFound" || readyCond.Reason == "NoHostsAvailable" { + failedReservations = append(failedReservations, current) + logger.Info("insufficient capacity for reservation", "reservation", current.Name, "reason", readyCond.Reason, "message", readyCond.Message) + } else { + // Other failure reasons + logger.Info("reservation failed", "reservation", current.Name, "reason", readyCond.Reason, "message", readyCond.Message) + } case metav1.ConditionUnknown: allAreReady = false stillWaiting = append(stillWaiting, res) diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go index 4d378af55..d996c498e 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -1277,7 +1277,8 @@ func (env *CommitmentTestEnv) processNewReservation(res *v1alpha1.Reservation) { } } -// markReservationSchedulerProcessedStatus updates a reservation to have Ready=True status (scheduling can be succeeded or not - depending on host status) +// markReservationSchedulerProcessedStatus updates a reservation status based on scheduling result. +// If host is non-empty, sets Ready=True (success). If host is empty, sets Ready=False with NoHostsFound (failure). func (env *CommitmentTestEnv) markReservationSchedulerProcessedStatus(res *v1alpha1.Reservation, host string) { ctx := context.Background() @@ -1288,16 +1289,28 @@ func (env *CommitmentTestEnv) markReservationSchedulerProcessedStatus(res *v1alp return } - // Then update status + // Then update status - Ready=True only if host was found, Ready=False otherwise res.Status.Host = host - res.Status.Conditions = []metav1.Condition{ - { - Type: v1alpha1.ReservationConditionReady, - Status: metav1.ConditionTrue, - Reason: "ReservationActive", - Message: "Reservation is ready (set by test controller)", - LastTransitionTime: metav1.Now(), - }, + if host != "" { + res.Status.Conditions = []metav1.Condition{ + { + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "ReservationActive", + Message: "Reservation is ready (set by test controller)", + LastTransitionTime: metav1.Now(), + }, + } + } else { + res.Status.Conditions = []metav1.Condition{ + { + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionFalse, + Reason: "NoHostsFound", + Message: "No hosts with sufficient capacity (set by test controller)", + LastTransitionTime: metav1.Now(), + }, + } } if err := env.K8sClient.Status().Update(ctx, res); err != nil { env.T.Logf("Warning: Failed to update reservation status: %v", err) diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index f68e35cde..59b11a9bd 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -135,13 +135,17 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr // Sync Spec values to Status fields for non-pre-allocated reservations // This ensures the observed state reflects the desired state from Spec - needsStatusUpdate := false + // When TargetHost is set in Spec but not synced to Status, this means + // the scheduler found a host and we need to mark the reservation as ready. if res.Spec.TargetHost != "" && res.Status.Host != res.Spec.TargetHost { - res.Status.Host = res.Spec.TargetHost - needsStatusUpdate = true - } - if needsStatusUpdate { old := res.DeepCopy() + res.Status.Host = res.Spec.TargetHost + meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "ReservationActive", + Message: "reservation is successfully scheduled", + }) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion @@ -152,7 +156,9 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr // Object was deleted, no need to continue return ctrl.Result{}, nil } - logger.Info("synced spec to status", "host", res.Status.Host) + logger.Info("synced spec to status and marked ready", "host", res.Status.Host) + // Return and let next reconcile handle allocation verification + return ctrl.Result{}, nil } // Get project ID from CommittedResourceReservation spec if available. @@ -259,7 +265,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr } if len(scheduleResp.Hosts) == 0 { - logger.Info("no hosts found for reservation") + logger.Info("no hosts found for reservation", "reservation", res.Name, "flavorName", resourceName) old := res.DeepCopy() meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ Type: v1alpha1.ReservationConditionReady, @@ -280,11 +286,12 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil // No need to requeue, we didn't find a host. } - // Update the reservation with the found host (idx 0) + // Update the reservation Spec with the found host (idx 0) + // Only update Spec here - the Status will be synced in the next reconcile cycle + // This avoids race conditions from doing two patches in one reconcile host := scheduleResp.Hosts[0] logger.Info("found host for reservation", "host", host) - // First update Spec old := res.DeepCopy() res.Spec.TargetHost = host if err := r.Patch(ctx, &res, client.MergeFrom(old)); err != nil { @@ -297,26 +304,9 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil } - // Then update Status - old = res.DeepCopy() - meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ - Type: v1alpha1.ReservationConditionReady, - Status: metav1.ConditionTrue, - Reason: "ReservationActive", - Message: "reservation is successfully scheduled", - }) - res.Status.Host = host - patch := client.MergeFrom(old) - if err := r.Status().Patch(ctx, &res, patch); err != nil { - // Ignore not-found errors during background deletion - if client.IgnoreNotFound(err) != nil { - logger.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err - } - // Object was deleted, no need to continue - return ctrl.Result{}, nil - } - return ctrl.Result{}, nil // No need to requeue, the reservation is now active. + // The Spec patch will trigger a re-reconcile, which will sync Status in the + // "Sync Spec values to Status" section above + return ctrl.Result{}, nil } // reconcileAllocations verifies all allocations in Spec against actual Nova VM state. diff --git a/internal/scheduling/reservations/commitments/controller_test.go b/internal/scheduling/reservations/commitments/controller_test.go index 7433165a9..5af5dfca9 100644 --- a/internal/scheduling/reservations/commitments/controller_test.go +++ b/internal/scheduling/reservations/commitments/controller_test.go @@ -292,21 +292,39 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t }, } + // First reconcile: schedules the reservation and sets Spec.TargetHost result, err := reconciler.Reconcile(context.Background(), req) - if err != nil { - t.Errorf("reconcileInstanceReservation() error = %v", err) + t.Errorf("First reconcile error = %v", err) return } + if result.RequeueAfter > 0 { + t.Errorf("Expected no requeue after first reconcile but got %v", result.RequeueAfter) + } + // Verify Spec.TargetHost is set after first reconcile + var afterFirstReconcile v1alpha1.Reservation + if err = client.Get(context.Background(), req.NamespacedName, &afterFirstReconcile); err != nil { + t.Errorf("Failed to get reservation after first reconcile: %v", err) + return + } + if afterFirstReconcile.Spec.TargetHost != "test-host-1" { + t.Errorf("Expected Spec.TargetHost=%v after first reconcile, got %v", "test-host-1", afterFirstReconcile.Spec.TargetHost) + } + + // Second reconcile: syncs Spec.TargetHost to Status and sets Ready=True + result, err = reconciler.Reconcile(context.Background(), req) + if err != nil { + t.Errorf("Second reconcile error = %v", err) + return + } if result.RequeueAfter > 0 { - t.Errorf("Expected no requeue but got %v", result.RequeueAfter) + t.Errorf("Expected no requeue after second reconcile but got %v", result.RequeueAfter) } - // Verify the reservation status + // Verify the reservation status after second reconcile var updated v1alpha1.Reservation - err = client.Get(context.Background(), req.NamespacedName, &updated) - if err != nil { + if err = client.Get(context.Background(), req.NamespacedName, &updated); err != nil { t.Errorf("Failed to get updated reservation: %v", err) return } From 9b92e08dbb34ff23ebde7ae2c0022fe31e145290 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 13:58:03 +0100 Subject: [PATCH 2/6] logs --- api/v1alpha1/reservation_types.go | 7 +++++++ .../commitments/api_change_commitments.go | 11 ++++++----- .../reservations/commitments/controller.go | 18 +++++++++++------- .../commitments/reservation_manager.go | 12 ++++++------ .../reservations/commitments/state.go | 2 ++ 5 files changed, 32 insertions(+), 18 deletions(-) diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index 4a7fe5cf2..97950a395 100644 --- a/api/v1alpha1/reservation_types.go +++ b/api/v1alpha1/reservation_types.go @@ -36,6 +36,13 @@ const ( ReservationTypeLabelFailover = "failover" ) +// Annotation keys for Reservation metadata. +const ( + // AnnotationCreatorRequestID tracks the request ID that created this reservation. + // Used for end-to-end traceability across API calls, controller reconciles, and scheduler invocations. + AnnotationCreatorRequestID = "reservations.cortex.cloud/creator-request-id" +) + // CommittedResourceAllocation represents a workload's assignment to a committed resource reservation slot. // The workload could be a VM (Nova/IronCore), Pod (Kubernetes), or other resource. type CommittedResourceAllocation struct { diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index e8e2ec172..818692e60 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -132,7 +132,7 @@ func (api *HTTPAPI) processCommitmentChanges(ctx context.Context, w http.Respons manager := NewReservationManager(api.client) requireRollback := false failedCommitments := make(map[string]string) // commitmentUUID to reason for failure, for better response messages in case of rollback - logger.Info("processing commitment change request", "availabilityZone", req.AZ, "dryRun", req.DryRun, "affectedProjects", len(req.ByProject)) + creatorRequestID := reservations.GlobalRequestIDFromContext(ctx) knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) @@ -194,8 +194,7 @@ ProcessLoop: } for _, commitment := range resourceChanges.Commitments { - // Additional per-commitment validation if needed - logger.Info("processing commitment change", "commitmentUUID", commitment.UUID, "projectID", projectID, "resourceName", resourceName, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none")) + logger.V(1).Info("processing commitment", "commitmentUUID", commitment.UUID, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none")) // TODO add configurable upper limit validation for commitment size (number of instances) to prevent excessive reservation creation // TODO add domain @@ -247,8 +246,10 @@ ProcessLoop: requireRollback = true break ProcessLoop } + // Set creator request ID for traceability across controller reconciles + stateDesired.CreatorRequestID = creatorRequestID - logger.Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldState", stateBefore, "desiredState", stateDesired) + logger.V(1).Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldMemory", stateBefore.TotalMemoryBytes, "desiredMemory", stateDesired.TotalMemoryBytes) touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi") if err != nil { @@ -257,7 +258,7 @@ ProcessLoop: requireRollback = true break ProcessLoop } - logger.Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) + logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) reservationsToWatch = append(reservationsToWatch, touchedReservations...) } } diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index 59b11a9bd..e03958151 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -50,7 +50,8 @@ type CommitmentReservationController struct { // Note: This controller only handles commitment reservations, as filtered by the predicate. func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { ctx = WithNewGlobalRequestID(ctx) - logger := LoggerFromContext(ctx).WithValues("component", "controller", "reservation", req.Name, "namespace", req.Namespace) + logger := LoggerFromContext(ctx).WithValues("component", "controller", "reservation", req.Name) + // Fetch the reservation object. var res v1alpha1.Reservation if err := r.Get(ctx, req.NamespacedName, &res); err != nil { @@ -58,6 +59,11 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, client.IgnoreNotFound(err) } + // Extract creator request ID from annotation for end-to-end traceability + if creatorReq := res.Annotations[v1alpha1.AnnotationCreatorRequestID]; creatorReq != "" { + logger = logger.WithValues("creatorReq", creatorReq) + } + // filter for CR reservations resourceName := "" if res.Spec.CommittedResourceReservation != nil { @@ -86,7 +92,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr } if meta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) { - logger.Info("reservation is active, verifying allocations") + logger.V(1).Info("reservation is active, verifying allocations") // Verify all allocations in Spec against actual VM state from database if err := r.reconcileAllocations(ctx, &res); err != nil { @@ -323,7 +329,7 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte // Skip if no allocations to verify if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { - logger.Info("no allocations to verify", "reservation", res.Name) + logger.V(1).Info("no allocations to verify", "reservation", res.Name) return nil } @@ -349,9 +355,8 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte actualHost := server.OSEXTSRVATTRHost newStatusAllocations[vmUUID] = actualHost - logger.Info("verified VM allocation", + logger.V(1).Info("verified VM allocation", "vm", vmUUID, - "reservation", res.Name, "actualHost", actualHost, "expectedHost", res.Status.Host) } else { @@ -381,8 +386,7 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte return fmt.Errorf("failed to patch reservation status: %w", err) } - logger.Info("reconciled allocations", - "reservation", res.Name, + logger.V(1).Info("reconciled allocations", "specAllocations", len(res.Spec.CommittedResourceReservation.Allocations), "statusAllocations", len(newStatusAllocations)) diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index 773f74122..e09d44d4e 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -221,12 +221,9 @@ func (m *ReservationManager) syncReservationMetadata( (state.StartTime != nil && (reservation.Spec.StartTime == nil || !reservation.Spec.StartTime.Time.Equal(*state.StartTime))) || (state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) { // Apply patch - logger.Info("syncing reservation metadata", - "reservation", reservation, - "desired commitmentUUID", state.CommitmentUUID, - "desired availabilityZone", state.AvailabilityZone, - "desired startTime", state.StartTime, - "desired endTime", state.EndTime) + logger.V(1).Info("syncing reservation metadata", + "reservation", reservation.Name, + "commitmentUUID", state.CommitmentUUID) patch := client.MergeFrom(reservation.DeepCopy()) @@ -322,6 +319,9 @@ func (m *ReservationManager) newReservation( Labels: map[string]string{ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, }, + Annotations: map[string]string{ + v1alpha1.AnnotationCreatorRequestID: state.CreatorRequestID, + }, }, Spec: spec, } diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go index dbc3ad72d..c8e75ff83 100644 --- a/internal/scheduling/reservations/commitments/state.go +++ b/internal/scheduling/reservations/commitments/state.go @@ -46,6 +46,8 @@ type CommitmentState struct { StartTime *time.Time // EndTime is when the commitment expires EndTime *time.Time + // CreatorRequestID is the request ID that triggered this state change (for traceability) + CreatorRequestID string } // FromCommitment converts Limes commitment to CommitmentState. From 15f4f407c29885a8120e058d43b82ec8e067ad8b Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 14:14:26 +0100 Subject: [PATCH 3/6] logs --- .../commitments/api_change_commitments.go | 57 ++++++++++--------- .../commitments/reservation_manager.go | 40 +++++-------- 2 files changed, 44 insertions(+), 53 deletions(-) diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 818692e60..6da3ca024 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -319,6 +319,7 @@ ProcessLoop: } // watchReservationsUntilReady polls until all reservations reach Ready=True or timeout. +// Returns failed reservations and any errors encountered. func watchReservationsUntilReady( ctx context.Context, logger logr.Logger, @@ -333,19 +334,32 @@ func watchReservationsUntilReady( } deadline := time.Now().Add(timeout) + startTime := time.Now() + totalReservations := len(reservations) reservationsToWatch := make([]v1alpha1.Reservation, len(reservations)) copy(reservationsToWatch, reservations) + // Track successful reservations for summary + var successfulReservations []string + pollCount := 0 + for { + pollCount++ var stillWaiting []v1alpha1.Reservation if time.Now().After(deadline) { errors = append(errors, fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout)) + // Log summary on timeout + logger.Info("reservation watch completed (timeout)", + "total", totalReservations, + "ready", len(successfulReservations), + "failed", len(failedReservations), + "timedOut", len(reservationsToWatch), + "duration", time.Since(startTime).Round(time.Millisecond), + "polls", pollCount) return failedReservations, errors } - allAreReady := true - for _, res := range reservationsToWatch { // Fetch current state var current v1alpha1.Reservation @@ -355,9 +369,7 @@ func watchReservationsUntilReady( } if err := k8sClient.Get(ctx, nn, ¤t); err != nil { - allAreReady = false - // Reservation is still in process of being created, or there is a transient error, continue waiting for it - logger.V(1).Info("transient error getting reservation, will retry", "reservation", res.Name, "error", err) + // Reservation is still in process of being created, or there is a transient error stillWaiting = append(stillWaiting, res) continue } @@ -370,7 +382,6 @@ func watchReservationsUntilReady( if readyCond == nil { // Condition not set yet, keep waiting - allAreReady = false stillWaiting = append(stillWaiting, res) continue } @@ -378,43 +389,33 @@ func watchReservationsUntilReady( switch readyCond.Status { case metav1.ConditionTrue: // Only consider truly ready if Status.Host is populated - // This handles the two-phase reconcile pattern where Ready=True - // is set atomically with Status.Host in the controller if current.Spec.TargetHost == "" || current.Status.Host == "" { - allAreReady = false stillWaiting = append(stillWaiting, res) continue } - // Reservation is successfully scheduled - logger.Info("reservation ready", "reservation", current.Name, "host", current.Status.Host) + // Reservation is successfully scheduled - track for summary + successfulReservations = append(successfulReservations, current.Name) case metav1.ConditionFalse: - // Check the reason - NoHostsFound means no capacity - if readyCond.Reason == "NoHostsFound" || readyCond.Reason == "NoHostsAvailable" { - failedReservations = append(failedReservations, current) - logger.Info("insufficient capacity for reservation", "reservation", current.Name, "reason", readyCond.Reason, "message", readyCond.Message) - } else { - // Other failure reasons - logger.Info("reservation failed", "reservation", current.Name, "reason", readyCond.Reason, "message", readyCond.Message) - } + // Any failure reason counts as failed + failedReservations = append(failedReservations, current) case metav1.ConditionUnknown: - allAreReady = false stillWaiting = append(stillWaiting, res) } } - if allAreReady || len(stillWaiting) == 0 { - logger.Info("all reservations checked", - "failed", len(failedReservations)) + if len(stillWaiting) == 0 { + // All reservations have reached a terminal state - log summary + logger.Info("reservation watch completed", + "total", totalReservations, + "ready", len(successfulReservations), + "failed", len(failedReservations), + "duration", time.Since(startTime).Round(time.Millisecond), + "polls", pollCount) return failedReservations, errors } reservationsToWatch = stillWaiting - // Log progress - logger.V(1).Info("waiting for reservations to become ready", - "notReady", len(reservationsToWatch), - "total", len(reservations), - "timeRemaining", time.Until(deadline).Round(time.Second)) // Wait before next poll select { diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index e09d44d4e..40cfa08ef 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -82,18 +82,15 @@ func (m *ReservationManager) ApplyCommitmentState( deltaMemoryBytes -= memoryQuantity.Value() } - log.Info("applying commitment state", - "commitmentUUID", desiredState.CommitmentUUID, - "desiredMemoryBytes", desiredState.TotalMemoryBytes, - "deltaMemoryBytes", deltaMemoryBytes, - "existingSlots", len(existing), - ) + // Log only if there's actual work to do (delta != 0) + hasChanges := deltaMemoryBytes != 0 nextSlotIndex := GetNextSlotIndex(existing) // Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project) // They will be recreated with correct metadata in subsequent phases. var validReservations []v1alpha1.Reservation + var repairedCount int for _, res := range existing { if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName || res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID { @@ -104,6 +101,7 @@ func (m *ReservationManager) ApplyCommitmentState( "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, "expectedProjectID", desiredState.ProjectID, "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID) + repairedCount++ removedReservations = append(removedReservations, res) memValue := res.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() @@ -145,19 +143,13 @@ func (m *ReservationManager) ApplyCommitmentState( memValue := reservationToDelete.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() - log.Info("deleting reservation (capacity decrease)", - "commitmentUUID", desiredState.CommitmentUUID, - "deltaMemoryBytes", deltaMemoryBytes, - "name", reservationToDelete.Name, - "numAllocations", len(reservationToDelete.Spec.CommittedResourceReservation.Allocations), - "memoryBytes", memValue.Value()) - if err := m.Delete(ctx, reservationToDelete); err != nil { return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) } } // Phase 5 (CREATE): Create new reservations (capacity increased) + var createdCount int for deltaMemoryBytes > 0 { // Need to create new reservation slots, always prefer largest flavor within the group // TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio @@ -165,13 +157,7 @@ func (m *ReservationManager) ApplyCommitmentState( touchedReservations = append(touchedReservations, *reservation) memValue := reservation.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes -= memValue.Value() - - log.Info("creating reservation", - "commitmentUUID", desiredState.CommitmentUUID, - "deltaMemoryBytes", deltaMemoryBytes, - "flavorName", reservation.Spec.CommittedResourceReservation.ResourceName, - "name", reservation.Name, - "memoryBytes", memValue.Value()) + createdCount++ if err := m.Create(ctx, reservation); err != nil { if apierrors.IsAlreadyExists(err) { @@ -198,11 +184,15 @@ func (m *ReservationManager) ApplyCommitmentState( } } - log.Info("completed commitment state sync", - "commitmentUUID", desiredState.CommitmentUUID, - "totalReservations", len(existing), - "created", len(touchedReservations)-len(existing), - "deleted", len(removedReservations)) + // Only log if there were actual changes + if hasChanges || createdCount > 0 || len(removedReservations) > 0 || repairedCount > 0 { + log.Info("commitment state sync completed", + "commitmentUUID", desiredState.CommitmentUUID, + "created", createdCount, + "deleted", len(removedReservations), + "repaired", repairedCount, + "total", len(existing)+createdCount) + } return touchedReservations, removedReservations, nil } From 4021af51b519752d39984ef2d9f36f635edf520a Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 14:25:25 +0100 Subject: [PATCH 4/6] initialize cr prom metrics --- .../api_change_commitments_monitor.go | 19 ++++++++++++++- .../api_change_commitments_monitor_test.go | 24 ++++++++++++------- .../api_report_capacity_monitor.go | 14 ++++++++++- .../commitments/api_report_usage_monitor.go | 14 ++++++++++- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go index 30a09d49e..d8522008b 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go @@ -16,8 +16,10 @@ type ChangeCommitmentsAPIMonitor struct { } // NewChangeCommitmentsAPIMonitor creates a new monitor with Prometheus metrics. +// Metrics are pre-initialized with zero values for common HTTP status codes +// to ensure they appear in Prometheus before the first request. func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor { - return ChangeCommitmentsAPIMonitor{ + m := ChangeCommitmentsAPIMonitor{ requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "cortex_committed_resource_change_api_requests_total", Help: "Total number of committed resource change API requests by HTTP status code", @@ -35,6 +37,21 @@ func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor { Help: "Total number of commitment change requests that timed out while waiting for reservations to become ready", }), } + + // Pre-initialize metrics with zero values for common HTTP status codes. + // This ensures metrics exist in Prometheus before the first request, + // preventing "metric missing" warnings in alerting rules. + for _, statusCode := range []string{"200", "400", "409", "500", "503"} { + m.requestCounter.WithLabelValues(statusCode) + m.requestDuration.WithLabelValues(statusCode) + } + + // Pre-initialize commitment change result labels + for _, result := range []string{"accepted", "rejected"} { + m.commitmentChanges.WithLabelValues(result) + } + + return m } // Describe implements prometheus.Collector. diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go index 322d01aae..255f05bd8 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go @@ -102,11 +102,13 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { // Verify request counter has correct labels for _, family := range families { if *family.Name == "cortex_committed_resource_change_api_requests_total" { - if len(family.Metric) != 3 { - t.Errorf("Expected 3 request counter metrics, got %d", len(family.Metric)) + // At minimum we expect the 3 labels we added (200, 409, 503) + // Plus pre-initialized labels (400, 500) - so >= 5 total + if len(family.Metric) < 3 { + t.Errorf("Expected at least 3 request counter metrics, got %d", len(family.Metric)) } - // Check label names + // Check all metrics have the status_code label for _, metric := range family.Metric { labelNames := make(map[string]bool) for _, label := range metric.Label { @@ -120,11 +122,13 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { } if *family.Name == "cortex_committed_resource_change_api_request_duration_seconds" { - if len(family.Metric) != 1 { - t.Errorf("Expected 1 histogram metric, got %d", len(family.Metric)) + // At minimum we expect the label we used (200) + // Plus pre-initialized labels - so >= 1 total + if len(family.Metric) < 1 { + t.Errorf("Expected at least 1 histogram metric, got %d", len(family.Metric)) } - // Check label names + // Check all metrics have the status_code label for _, metric := range family.Metric { labelNames := make(map[string]bool) for _, label := range metric.Label { @@ -138,11 +142,13 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { } if *family.Name == "cortex_committed_resource_change_api_commitment_changes_total" { - if len(family.Metric) != 2 { - t.Errorf("Expected 2 commitment changes metrics, got %d", len(family.Metric)) + // At minimum we expect the 2 labels we added (success, rejected) + // Plus pre-initialized labels (accepted) - so >= 2 total + if len(family.Metric) < 2 { + t.Errorf("Expected at least 2 commitment changes metrics, got %d", len(family.Metric)) } - // Check label names + // Check all metrics have the result label for _, metric := range family.Metric { labelNames := make(map[string]bool) for _, label := range metric.Label { diff --git a/internal/scheduling/reservations/commitments/api_report_capacity_monitor.go b/internal/scheduling/reservations/commitments/api_report_capacity_monitor.go index d484b6f27..d78af6cc7 100644 --- a/internal/scheduling/reservations/commitments/api_report_capacity_monitor.go +++ b/internal/scheduling/reservations/commitments/api_report_capacity_monitor.go @@ -14,8 +14,10 @@ type ReportCapacityAPIMonitor struct { } // NewReportCapacityAPIMonitor creates a new monitor with Prometheus metrics. +// Metrics are pre-initialized with zero values for common HTTP status codes +// to ensure they appear in Prometheus before the first request. func NewReportCapacityAPIMonitor() ReportCapacityAPIMonitor { - return ReportCapacityAPIMonitor{ + m := ReportCapacityAPIMonitor{ requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "cortex_committed_resource_capacity_api_requests_total", Help: "Total number of committed resource capacity API requests by HTTP status code", @@ -26,6 +28,16 @@ func NewReportCapacityAPIMonitor() ReportCapacityAPIMonitor { Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10}, }, []string{"status_code"}), } + + // Pre-initialize metrics with zero values for common HTTP status codes. + // This ensures metrics exist in Prometheus before the first request, + // preventing "metric missing" warnings in alerting rules. + for _, statusCode := range []string{"200", "500", "503"} { + m.requestCounter.WithLabelValues(statusCode) + m.requestDuration.WithLabelValues(statusCode) + } + + return m } // Describe implements prometheus.Collector. diff --git a/internal/scheduling/reservations/commitments/api_report_usage_monitor.go b/internal/scheduling/reservations/commitments/api_report_usage_monitor.go index d3fc68018..bbcaaaf86 100644 --- a/internal/scheduling/reservations/commitments/api_report_usage_monitor.go +++ b/internal/scheduling/reservations/commitments/api_report_usage_monitor.go @@ -14,8 +14,10 @@ type ReportUsageAPIMonitor struct { } // NewReportUsageAPIMonitor creates a new monitor with Prometheus metrics. +// Metrics are pre-initialized with zero values for common HTTP status codes +// to ensure they appear in Prometheus before the first request. func NewReportUsageAPIMonitor() ReportUsageAPIMonitor { - return ReportUsageAPIMonitor{ + m := ReportUsageAPIMonitor{ requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "cortex_committed_resource_usage_api_requests_total", Help: "Total number of committed resource usage API requests by HTTP status code", @@ -26,6 +28,16 @@ func NewReportUsageAPIMonitor() ReportUsageAPIMonitor { Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10}, }, []string{"status_code"}), } + + // Pre-initialize metrics with zero values for common HTTP status codes. + // This ensures metrics exist in Prometheus before the first request, + // preventing "metric missing" warnings in alerting rules. + for _, statusCode := range []string{"200", "400", "404", "500", "503"} { + m.requestCounter.WithLabelValues(statusCode) + m.requestDuration.WithLabelValues(statusCode) + } + + return m } // Describe implements prometheus.Collector. From 8167f78d7e21014b1f14156dd2aef000eda2b9b7 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 14:38:19 +0100 Subject: [PATCH 5/6] cr off switch for capacity and usage reporting --- .../reservations/commitments/api_report_capacity.go | 8 ++++++++ .../reservations/commitments/api_report_usage.go | 8 ++++++++ .../scheduling/reservations/commitments/config.go | 12 ++++++++++++ 3 files changed, 28 insertions(+) diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go index 2f7618ced..836ae017f 100644 --- a/internal/scheduling/reservations/commitments/api_report_capacity.go +++ b/internal/scheduling/reservations/commitments/api_report_capacity.go @@ -20,6 +20,14 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) startTime := time.Now() statusCode := http.StatusOK + // Check if API is enabled + if !api.config.EnableReportCapacityAPI { + statusCode = http.StatusServiceUnavailable + http.Error(w, "report-capacity API is disabled", statusCode) + api.recordCapacityMetrics(statusCode, startTime) + return + } + ctx := WithNewGlobalRequestID(r.Context()) logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/commitments/report-capacity") diff --git a/internal/scheduling/reservations/commitments/api_report_usage.go b/internal/scheduling/reservations/commitments/api_report_usage.go index 86b514d87..a533d4e40 100644 --- a/internal/scheduling/reservations/commitments/api_report_usage.go +++ b/internal/scheduling/reservations/commitments/api_report_usage.go @@ -24,6 +24,14 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) { startTime := time.Now() statusCode := http.StatusOK + // Check if API is enabled + if !api.config.EnableReportUsageAPI { + statusCode = http.StatusServiceUnavailable + http.Error(w, "report-usage API is disabled", statusCode) + api.recordUsageMetrics(statusCode, startTime) + return + } + requestID := r.Header.Get("X-Request-ID") if requestID == "" { requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go index 0004cf19b..98a5d59ae 100644 --- a/internal/scheduling/reservations/commitments/config.go +++ b/internal/scheduling/reservations/commitments/config.go @@ -47,6 +47,16 @@ type Config struct { // When false, the endpoint will return HTTP 503 Service Unavailable. // The info endpoint remains available for health checks. EnableChangeCommitmentsAPI bool `json:"committedResourceEnableChangeCommitmentsAPI"` + + // EnableReportUsageAPI controls whether the report-usage API endpoint is active. + // When false, the endpoint will return HTTP 503 Service Unavailable. + // This can be used as an emergency switch if the usage reporting is causing issues. + EnableReportUsageAPI bool `json:"committedResourceEnableReportUsageAPI"` + + // EnableReportCapacityAPI controls whether the report-capacity API endpoint is active. + // When false, the endpoint will return HTTP 503 Service Unavailable. + // This can be used as an emergency switch if the capacity reporting is causing issues. + EnableReportCapacityAPI bool `json:"committedResourceEnableReportCapacityAPI"` } func DefaultConfig() Config { @@ -58,5 +68,7 @@ func DefaultConfig() Config { ChangeAPIWatchReservationsTimeout: 10 * time.Second, ChangeAPIWatchReservationsPollInterval: 500 * time.Millisecond, EnableChangeCommitmentsAPI: true, + EnableReportUsageAPI: true, + EnableReportCapacityAPI: true, } } From 716e622b3bebb2a4770ac099ff6495d3b31d6981 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 14:49:10 +0100 Subject: [PATCH 6/6] header with logging id --- .../commitments/api_change_commitments.go | 12 +++++++----- .../reservations/commitments/api_info.go | 11 ++++++++++- .../commitments/api_report_capacity.go | 11 ++++++++++- .../commitments/api_report_usage.go | 17 +++++++++++------ 4 files changed, 38 insertions(+), 13 deletions(-) diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 6da3ca024..953319dbe 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -49,6 +49,13 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque req := liquid.CommitmentChangeRequest{} statusCode := http.StatusOK + // Extract or generate request ID for tracing - always set in response header + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = uuid.New().String() + } + w.Header().Set("X-Request-ID", requestID) + // Check if API is enabled if !api.config.EnableChangeCommitmentsAPI { statusCode = http.StatusServiceUnavailable @@ -61,11 +68,6 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque api.changeMutex.Lock() defer api.changeMutex.Unlock() - // Extract or generate request ID for tracing - requestID := r.Header.Get("X-Request-ID") - if requestID == "" { - requestID = uuid.New().String() - } ctx := reservations.WithGlobalRequestID(context.Background(), "committed-resource-"+requestID) logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/change-commitments") diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go index 71a84feb4..c189c859a 100644 --- a/internal/scheduling/reservations/commitments/api_info.go +++ b/internal/scheduling/reservations/commitments/api_info.go @@ -12,6 +12,7 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/go-logr/logr" + "github.com/google/uuid" liquid "github.com/sapcc/go-api-declarations/liquid" ) @@ -19,7 +20,15 @@ import ( // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { - ctx := WithNewGlobalRequestID(r.Context()) + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = uuid.New().String() + } + // Set request ID in response header for client correlation + w.Header().Set("X-Request-ID", requestID) + + ctx := reservations.WithGlobalRequestID(r.Context(), "committed-resource-"+requestID) logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/info") // Only accept GET method diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go index 836ae017f..19b7fb24c 100644 --- a/internal/scheduling/reservations/commitments/api_report_capacity.go +++ b/internal/scheduling/reservations/commitments/api_report_capacity.go @@ -9,6 +9,8 @@ import ( "strconv" "time" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/google/uuid" "github.com/sapcc/go-api-declarations/liquid" ) @@ -20,6 +22,13 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) startTime := time.Now() statusCode := http.StatusOK + // Extract or generate request ID for tracing - always set in response header + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = uuid.New().String() + } + w.Header().Set("X-Request-ID", requestID) + // Check if API is enabled if !api.config.EnableReportCapacityAPI { statusCode = http.StatusServiceUnavailable @@ -28,7 +37,7 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) return } - ctx := WithNewGlobalRequestID(r.Context()) + ctx := reservations.WithGlobalRequestID(r.Context(), "committed-resource-"+requestID) logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/commitments/report-capacity") // Only accept POST method diff --git a/internal/scheduling/reservations/commitments/api_report_usage.go b/internal/scheduling/reservations/commitments/api_report_usage.go index a533d4e40..558758bab 100644 --- a/internal/scheduling/reservations/commitments/api_report_usage.go +++ b/internal/scheduling/reservations/commitments/api_report_usage.go @@ -11,6 +11,7 @@ import ( "strings" "time" + "github.com/google/uuid" "github.com/sapcc/go-api-declarations/liquid" ) @@ -24,20 +25,24 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) { startTime := time.Now() statusCode := http.StatusOK + // Extract or generate request ID for tracing - always set in response header + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = uuid.New().String() + } + w.Header().Set("X-Request-ID", requestID) + + log := baseLog.WithValues("requestID", requestID, "endpoint", "report-usage") + // Check if API is enabled if !api.config.EnableReportUsageAPI { statusCode = http.StatusServiceUnavailable + log.Info("report-usage API is disabled, rejecting request") http.Error(w, "report-usage API is disabled", statusCode) api.recordUsageMetrics(statusCode, startTime) return } - requestID := r.Header.Get("X-Request-ID") - if requestID == "" { - requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) - } - log := baseLog.WithValues("requestID", requestID, "endpoint", "report-usage") - if r.Method != http.MethodPost { statusCode = http.StatusMethodNotAllowed http.Error(w, "Method not allowed", statusCode)