Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions helm/bundles/cortex-nova/alerts/nova.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -483,3 +483,115 @@ groups:
The committed resource capacity API (Limes LIQUID integration) is experiencing
high latency (p95 > 5s). This may indicate slow database queries or knowledge
CRD retrieval. Limes scrapes may time out, affecting capacity reporting.

# Committed Resource Syncer Alerts
- alert: CortexNovaCommittedResourceSyncerNotRunning
expr: increase(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}[2h]) == 0
for: 5m
labels:
context: committed-resource-syncer
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource syncer not running"
description: >
The committed resource syncer has not run in the last 2 hours. This indicates
that the syncer may have stopped or is encountering errors. Check the syncer logs for errors.

- alert: CortexNovaCommittedResourceSyncerErrorsHigh
expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3
for: 5m
labels:
context: committed-resource-syncer
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource syncer experiencing errors"
description: >
The committed resource syncer has encountered multiple errors in the last hour.
This may indicate connectivity issues with Limes. Check the syncer logs for error details.

- alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh
expr: |
rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])
/ rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0.05
for: 15m
labels:
context: committed-resource-syncer
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource syncer unit mismatch rate >5%"
description: >
More than 5% of commitments are being skipped due to unit mismatches between
Limes and Cortex flavor groups. This happens when Limes has not yet been
updated to use the new unit format after a flavor group change. The affected
commitments will keep their existing reservations until Limes notices the update.
Check the logs if this error persists for a longer time.

- alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh
expr: |
rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])
/ rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0
for: 15m
labels:
context: committed-resource-syncer
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource syncer unknown flavor group rate >0%"
description: >
Some commitments reference flavor groups that don't exist in
Cortex Knowledge (anymore). This may indicate that flavor group configuration is
out of sync between Limes and Cortex, or that Knowledge extraction is failing.
Check the flavor group Knowledge CRD and history to see what was changed.

- alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh
expr: |
(
rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) +
rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) +
rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0.01
for: 15m
labels:
context: committed-resource-syncer
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource syncer local change rate >1%"
description: >
More than 1% of synced commitments require reservation changes
(creates, deletes, or repairs). This is higher than expected for steady-state
operation and may indicate data inconsistencies, external modifications to
reservations, or issues with the CRDs. Check Cortex logs for details.

- alert: CortexNovaCommittedResourceSyncerRepairRateHigh
expr: |
rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
/ rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
for: 15m
labels:
context: committed-resource-syncer
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource syncer repair rate >0%"
description: >
Some commitments have reservations that needed repair
(wrong metadata like project ID or flavor group). This may indicate data
corruption, bugs in reservation creation, or external modifications.
Reservations are automatically repaired, but the root cause should be
investigated if this alert persists.
Original file line number Diff line number Diff line change
Expand Up @@ -253,15 +253,15 @@ ProcessLoop:

logger.V(1).Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldMemory", stateBefore.TotalMemoryBytes, "desiredMemory", stateDesired.TotalMemoryBytes)

touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi")
applyResult, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi")
if err != nil {
failedCommitments[string(commitment.UUID)] = "failed to apply commitment state"
logger.Info("failed to apply commitment state for commitment", "commitmentUUID", commitment.UUID, "error", err)
requireRollback = true
break ProcessLoop
}
logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations))
reservationsToWatch = append(reservationsToWatch, touchedReservations...)
logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(applyResult.TouchedReservations), "deletedReservations", len(applyResult.RemovedReservations))
reservationsToWatch = append(reservationsToWatch, applyResult.TouchedReservations...)
}
}
}
Expand Down Expand Up @@ -305,7 +305,7 @@ ProcessLoop:
for commitmentUUID, state := range statesBefore {
// Rollback to statesBefore for this commitment
logger.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state)
_, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback")
_, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback")
if err != nil {
logger.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err)
// continue with best effort rollback for other projects
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,20 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

// ApplyResult contains the result of applying a commitment state via
// ReservationManager.ApplyCommitmentState: per-category counters intended for
// metrics, plus the reservation objects themselves for caller-side tracking.
//
// The counters and slices are updated before the corresponding Create/Delete
// call is issued, so when ApplyCommitmentState returns a non-nil result
// together with an error, the last counted/appended entry may refer to the
// operation that failed.
type ApplyResult struct {
	// Created is the number of reservations created in the CREATE phase
	// (new slots added until the remaining memory delta is covered).
	Created int
	// Deleted is the number of reservations deleted by the slot-deletion loop.
	// It does not include reservations deleted because of inconsistent
	// flavor group/project metadata; those are counted in Repaired.
	Deleted int
	// Repaired is the number of reservations repaired: either deleted because
	// their flavor group or project ID did not match the desired state (they are
	// recreated by later phases), or updated in place by the metadata sync.
	Repaired int
	// TouchedReservations are reservations that were created or updated
	// (newly created slots and reservations changed by the metadata sync).
	TouchedReservations []v1alpha1.Reservation
	// RemovedReservations are reservations that were deleted, including those
	// deleted as part of a repair. Its length can therefore exceed Deleted.
	RemovedReservations []v1alpha1.Reservation
}

// ReservationManager handles CRUD operations for Reservation CRDs.
type ReservationManager struct {
client.Client
Expand All @@ -42,14 +56,16 @@ func NewReservationManager(k8sClient client.Client) *ReservationManager {
// - Deleting unused/excess slots when capacity decreases
// - Syncing reservation metadata for all remaining slots
//
// Returns touched reservations (created/updated) and removed reservations for caller tracking.
// Returns ApplyResult containing touched/removed reservations and counts for metrics.
func (m *ReservationManager) ApplyCommitmentState(
ctx context.Context,
log logr.Logger,
desiredState *CommitmentState,
flavorGroups map[string]compute.FlavorGroupFeature,
creator string,
) (touchedReservations, removedReservations []v1alpha1.Reservation, err error) {
) (*ApplyResult, error) {

result := &ApplyResult{}

log = log.WithName("ReservationManager")

Expand All @@ -58,7 +74,7 @@ func (m *ReservationManager) ApplyCommitmentState(
if err := m.List(ctx, &allReservations, client.MatchingLabels{
v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
}); err != nil {
return nil, nil, fmt.Errorf("failed to list reservations: %w", err)
return nil, fmt.Errorf("failed to list reservations: %w", err)
}

// Filter by name prefix to find reservations for this commitment
Expand All @@ -74,7 +90,7 @@ func (m *ReservationManager) ApplyCommitmentState(
flavorGroup, exists := flavorGroups[desiredState.FlavorGroupName]

if !exists {
return nil, nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName)
return nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName)
}
deltaMemoryBytes := desiredState.TotalMemoryBytes
for _, res := range existing {
Expand All @@ -90,7 +106,6 @@ func (m *ReservationManager) ApplyCommitmentState(
// Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project)
// They will be recreated with correct metadata in subsequent phases.
var validReservations []v1alpha1.Reservation
var repairedCount int
for _, res := range existing {
if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName ||
res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID {
Expand All @@ -101,13 +116,13 @@ func (m *ReservationManager) ApplyCommitmentState(
"actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup,
"expectedProjectID", desiredState.ProjectID,
"actualProjectID", res.Spec.CommittedResourceReservation.ProjectID)
repairedCount++
removedReservations = append(removedReservations, res)
result.Repaired++
result.RemovedReservations = append(result.RemovedReservations, res)
memValue := res.Spec.Resources[hv1.ResourceMemory]
deltaMemoryBytes += memValue.Value()

if err := m.Delete(ctx, &res); err != nil {
return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err)
return result, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err)
}
} else {
validReservations = append(validReservations, res)
Expand Down Expand Up @@ -139,33 +154,33 @@ func (m *ReservationManager) ApplyCommitmentState(
reservationToDelete = &existing[len(existing)-1]
existing = existing[:len(existing)-1] // remove from existing list
}
removedReservations = append(removedReservations, *reservationToDelete)
result.RemovedReservations = append(result.RemovedReservations, *reservationToDelete)
result.Deleted++
memValue := reservationToDelete.Spec.Resources[hv1.ResourceMemory]
deltaMemoryBytes += memValue.Value()

if err := m.Delete(ctx, reservationToDelete); err != nil {
return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err)
return result, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err)
}
}

// Phase 5 (CREATE): Create new reservations (capacity increased)
var createdCount int
for deltaMemoryBytes > 0 {
// Need to create new reservation slots, always prefer largest flavor within the group
// TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio
reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator)
touchedReservations = append(touchedReservations, *reservation)
result.TouchedReservations = append(result.TouchedReservations, *reservation)
memValue := reservation.Spec.Resources[hv1.ResourceMemory]
deltaMemoryBytes -= memValue.Value()
createdCount++
result.Created++

if err := m.Create(ctx, reservation); err != nil {
if apierrors.IsAlreadyExists(err) {
return touchedReservations, removedReservations, fmt.Errorf(
return result, fmt.Errorf(
"reservation %s already exists (collision detected): %w",
reservation.Name, err)
}
return touchedReservations, removedReservations, fmt.Errorf(
return result, fmt.Errorf(
"failed to create reservation slot %d: %w",
nextSlotIndex, err)
}
Expand All @@ -177,24 +192,25 @@ func (m *ReservationManager) ApplyCommitmentState(
for i := range existing {
updated, err := m.syncReservationMetadata(ctx, log, &existing[i], desiredState)
if err != nil {
return touchedReservations, removedReservations, err
return result, err
}
if updated != nil {
touchedReservations = append(touchedReservations, *updated)
result.TouchedReservations = append(result.TouchedReservations, *updated)
result.Repaired++
}
}

// Only log if there were actual changes
if hasChanges || createdCount > 0 || len(removedReservations) > 0 || repairedCount > 0 {
if hasChanges || result.Created > 0 || len(result.RemovedReservations) > 0 || result.Repaired > 0 {
log.Info("commitment state sync completed",
"commitmentUUID", desiredState.CommitmentUUID,
"created", createdCount,
"deleted", len(removedReservations),
"repaired", repairedCount,
"total", len(existing)+createdCount)
"created", result.Created,
"deleted", result.Deleted,
"repaired", result.Repaired,
"total", len(existing)+result.Created)
}

return touchedReservations, removedReservations, nil
return result, nil
}

// syncReservationMetadata updates reservation metadata if it differs from desired state.
Expand Down
Loading
Loading