From 57c3177631b636612d276a7679a84d93b7d1edd9 Mon Sep 17 00:00:00 2001 From: mblos Date: Mon, 23 Mar 2026 15:52:38 +0100 Subject: [PATCH 1/7] cr metrics added --- cmd/main.go | 2 +- .../reservations/commitments/api.go | 14 +- .../commitments/api_change_commitments.go | 37 +++- .../api_change_commitments_metrics.go | 44 ++++ .../api_change_commitments_monitor.go | 47 ++++ .../api_change_commitments_monitor_test.go | 203 ++++++++++++++++++ .../api_change_commitments_test.go | 4 +- 7 files changed, 336 insertions(+), 15 deletions(-) create mode 100644 internal/scheduling/reservations/commitments/api_change_commitments_metrics.go create mode 100644 internal/scheduling/reservations/commitments/api_change_commitments_monitor.go create mode 100644 internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go diff --git a/cmd/main.go b/cmd/main.go index 1565c15cd..d152b21c6 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -326,7 +326,7 @@ func main() { // Initialize commitments API for LIQUID interface commitmentsAPI := commitments.NewAPI(multiclusterClient) - commitmentsAPI.Init(mux) + commitmentsAPI.Init(mux, metrics.Registry) // Detector pipeline controller setup. novaClient := nova.NewNovaClient() diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go index 9d8fd5944..a97b00580 100644 --- a/internal/scheduling/reservations/commitments/api.go +++ b/internal/scheduling/reservations/commitments/api.go @@ -7,14 +7,16 @@ import ( "net/http" "sync" + "github.com/prometheus/client_golang/prometheus" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) // HTTPAPI implements Limes LIQUID commitment validation endpoints. type HTTPAPI struct { - client client.Client - config Config + client client.Client + config Config + monitor ChangeCommitmentsAPIMonitor // Mutex to serialize change-commitments requests changeMutex sync.Mutex } @@ -25,12 +27,14 @@ func NewAPI(client client.Client) *HTTPAPI { func NewAPIWithConfig(client client.Client, config Config) *HTTPAPI { return &HTTPAPI{ - client: client, - config: config, + client: client, + config: config, + monitor: NewChangeCommitmentsAPIMonitor(), } } -func (api *HTTPAPI) Init(mux *http.ServeMux) { +func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) { + registry.MustRegister(&api.monitor) mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments) // mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity) mux.HandleFunc("/v1/commitments/info", api.HandleInfo) diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 92398c974..82b8846c1 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -46,9 +46,17 @@ func sortedKeys[K ~string, V any](m map[K]V) []K { // This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle. // A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back. func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) { + startTime := time.Now() + // Initialize + resp := liquid.CommitmentChangeResponse{} + req := liquid.CommitmentChangeRequest{} + statusCode := http.StatusOK + // Check if API is enabled if !api.config.EnableChangeCommitmentsAPI { - http.Error(w, "change-commitments API is disabled", http.StatusServiceUnavailable) + statusCode = http.StatusServiceUnavailable + http.Error(w, "change-commitments API is disabled", statusCode) + api.recordMetrics(req, resp, statusCode, startTime) return } @@ -66,26 +74,27 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque // Only accept POST method if r.Method != http.MethodPost { - http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + statusCode = http.StatusMethodNotAllowed + http.Error(w, "Method not allowed", statusCode) + api.recordMetrics(req, resp, statusCode, startTime) return } // Parse request body - var req liquid.CommitmentChangeRequest if err := json.NewDecoder(r.Body).Decode(&req); err != nil { logger.Error(err, "invalid request body") - http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest) + statusCode = http.StatusBadRequest + http.Error(w, "Invalid request body: "+err.Error(), statusCode) + api.recordMetrics(req, resp, statusCode, startTime) return } logger.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ) - // Initialize response - resp := liquid.CommitmentChangeResponse{} - // Check for dry run -> early reject, not supported yet if req.DryRun { resp.RejectionReason = "Dry run not supported yet" + api.recordMetrics(req, resp, statusCode, startTime) logger.Info("rejecting dry run request") w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) @@ -97,14 +106,26 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque // Process commitment changes // For now, we'll implement a simplified path that checks capacity for immediate start CRs + if err := api.processCommitmentChanges(ctx, w, logger, req, &resp); err != nil { // Error already written to response by processCommitmentChanges + // Determine status code from error context (409 or 503) + if strings.Contains(err.Error(), "version mismatch") { + statusCode = http.StatusConflict + } else if strings.Contains(err.Error(), "caches not ready") { + statusCode = http.StatusServiceUnavailable + } + // Record metrics for error cases + api.recordMetrics(req, resp, statusCode, startTime) return } + // Record metrics + api.recordMetrics(req, resp, statusCode, startTime) + // Return response w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusOK) + w.WriteHeader(statusCode) if err := json.NewEncoder(w).Encode(resp); err != nil { return } diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_metrics.go b/internal/scheduling/reservations/commitments/api_change_commitments_metrics.go new file mode 100644 index 000000000..edb09b5da --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_change_commitments_metrics.go @@ -0,0 +1,44 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "strconv" + "time" + + "github.com/sapcc/go-api-declarations/liquid" +) + +// recordMetrics records Prometheus metrics for a change commitments request. +func (api *HTTPAPI) recordMetrics(req liquid.CommitmentChangeRequest, resp liquid.CommitmentChangeResponse, statusCode int, startTime time.Time) { + duration := time.Since(startTime).Seconds() + statusCodeStr := strconv.Itoa(statusCode) + + // Record request counter and duration + api.monitor.requestCounter.WithLabelValues(statusCodeStr).Inc() + api.monitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration) + + // Count total commitment changes in the request + commitmentCount := countCommitments(req) + + // Determine result based on response + result := "success" + if resp.RejectionReason != "" { + result = "rejected" + } + + // Record commitment changes counter + api.monitor.commitmentChanges.WithLabelValues(result).Add(float64(commitmentCount)) +} + +// countCommitments counts the total number of commitments in a request. +func countCommitments(req liquid.CommitmentChangeRequest) int { + count := 0 + for _, projectChanges := range req.ByProject { + for _, resourceChanges := range projectChanges.ByResource { + count += len(resourceChanges.Commitments) + } + } + return count +} diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go new file mode 100644 index 000000000..5bfabf175 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go @@ -0,0 +1,47 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// ChangeCommitmentsAPIMonitor provides metrics for the CR change API. +type ChangeCommitmentsAPIMonitor struct { + requestCounter *prometheus.CounterVec + requestDuration *prometheus.HistogramVec + commitmentChanges *prometheus.CounterVec +} + +// NewChangeCommitmentsAPIMonitor creates a new monitor with Prometheus metrics. +func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor { + return ChangeCommitmentsAPIMonitor{ + requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_cr_change_api_requests_total", + Help: "Total number of CR change API requests by HTTP status code", + }, []string{"status_code"}), + requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "cortex_cr_change_api_request_duration_seconds", + Help: "Duration of CR change API requests in seconds by HTTP status code", + }, []string{"status_code"}), + commitmentChanges: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_cr_change_api_commitment_changes_total", + Help: "Total number of commitment changes processed by result", + }, []string{"result"}), + } +} + +// Describe implements prometheus.Collector. +func (m *ChangeCommitmentsAPIMonitor) Describe(ch chan<- *prometheus.Desc) { + m.requestCounter.Describe(ch) + m.requestDuration.Describe(ch) + m.commitmentChanges.Describe(ch) +} + +// Collect implements prometheus.Collector. +func (m *ChangeCommitmentsAPIMonitor) Collect(ch chan<- prometheus.Metric) { + m.requestCounter.Collect(ch) + m.requestDuration.Collect(ch) + m.commitmentChanges.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go new file mode 100644 index 000000000..bf3acfb40 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go @@ -0,0 +1,203 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "encoding/json" + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/sapcc/go-api-declarations/liquid" +) + +func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) { + registry := prometheus.NewRegistry() + monitor := NewChangeCommitmentsAPIMonitor() + + if err := registry.Register(&monitor); err != nil { + t.Fatalf("Failed to register monitor: %v", err) + } + + // Observe metrics before gathering (Prometheus metrics with labels only appear after being used) + monitor.requestCounter.WithLabelValues("200").Inc() + monitor.requestDuration.WithLabelValues("200").Observe(0.1) + monitor.commitmentChanges.WithLabelValues("success").Inc() + + // Verify metrics can be gathered + families, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + // Check that all three metrics are present + foundRequestCounter := false + foundRequestDuration := false + foundCommitmentChanges := false + + for _, family := range families { + switch *family.Name { + case "cortex_cr_change_api_requests_total": + foundRequestCounter = true + if *family.Type != dto.MetricType_COUNTER { + t.Errorf("Expected counter metric type, got %v", *family.Type) + } + case "cortex_cr_change_api_request_duration_seconds": + foundRequestDuration = true + if *family.Type != dto.MetricType_HISTOGRAM { + t.Errorf("Expected histogram metric type, got %v", *family.Type) + } + case "cortex_cr_change_api_commitment_changes_total": + foundCommitmentChanges = true + if *family.Type != dto.MetricType_COUNTER { + t.Errorf("Expected counter metric type, got %v", *family.Type) + } + } + } + + if !foundRequestCounter { + t.Error("Request counter metric not found in registry") + } + if !foundRequestDuration { + t.Error("Request duration histogram not found in registry") + } + if !foundCommitmentChanges { + t.Error("Commitment changes counter not found in registry") + } +} + +func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { + registry := prometheus.NewRegistry() + monitor := NewChangeCommitmentsAPIMonitor() + + if err := registry.Register(&monitor); err != nil { + t.Fatalf("Failed to register monitor: %v", err) + } + + // Record some test metrics + monitor.requestCounter.WithLabelValues("200").Inc() + monitor.requestCounter.WithLabelValues("409").Inc() + monitor.requestCounter.WithLabelValues("503").Inc() + monitor.requestDuration.WithLabelValues("200").Observe(1.5) + monitor.commitmentChanges.WithLabelValues("success").Add(5) + monitor.commitmentChanges.WithLabelValues("rejected").Add(2) + + // Gather metrics + families, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + // Verify request counter has correct labels + for _, family := range families { + if *family.Name == "cortex_cr_change_api_requests_total" { + if len(family.Metric) != 3 { + t.Errorf("Expected 3 request counter metrics, got %d", len(family.Metric)) + } + + // Check label names + for _, metric := range family.Metric { + labelNames := make(map[string]bool) + for _, label := range metric.Label { + labelNames[*label.Name] = true + } + + if !labelNames["status_code"] { + t.Error("Missing 'status_code' label in request counter") + } + } + } + + if *family.Name == "cortex_cr_change_api_request_duration_seconds" { + if len(family.Metric) != 1 { + t.Errorf("Expected 1 histogram metric, got %d", len(family.Metric)) + } + + // Check label names + for _, metric := range family.Metric { + labelNames := make(map[string]bool) + for _, label := range metric.Label { + labelNames[*label.Name] = true + } + + if !labelNames["status_code"] { + t.Error("Missing 'status_code' label in histogram") + } + } + } + + if *family.Name == "cortex_cr_change_api_commitment_changes_total" { + if len(family.Metric) != 2 { + t.Errorf("Expected 2 commitment changes metrics, got %d", len(family.Metric)) + } + + // Check label names + for _, metric := range family.Metric { + labelNames := make(map[string]bool) + for _, label := range metric.Label { + labelNames[*label.Name] = true + } + + if !labelNames["result"] { + t.Error("Missing 'result' label in commitment changes counter") + } + } + } + } +} + +func TestCountCommitments(t *testing.T) { + testCases := []struct { + name string + request CommitmentChangeRequest + expected int + }{ + { + name: "Single commitment", + request: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2), + ), + expected: 1, + }, + { + name: "Multiple commitments same project", + request: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2), + createCommitment("ram_hana_2", "project-A", "uuid-2", "confirmed", 2), + ), + expected: 2, + }, + { + name: "Multiple commitments multiple projects", + request: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2), + createCommitment("ram_hana_1", "project-B", "uuid-2", "confirmed", 3), + createCommitment("ram_gp_1", "project-C", "uuid-3", "confirmed", 1), + ), + expected: 3, + }, + { + name: "Empty request", + request: newCommitmentRequest("az-a", false, 1234), + expected: 0, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Convert test request to liquid request + reqJSON := buildRequestJSON(tc.request) + var req liquid.CommitmentChangeRequest + if err := json.Unmarshal([]byte(reqJSON), &req); err != nil { + t.Fatalf("Failed to parse request: %v", err) + } + + result := countCommitments(req) + + if result != tc.expected { + t.Errorf("Expected %d commitments, got %d", tc.expected, result) + } + }) + } +} diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go index 58f8cf997..4d68227a1 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -23,6 +23,7 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/prometheus/client_golang/prometheus" "github.com/sapcc/go-api-declarations/liquid" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -969,7 +970,8 @@ func newCommitmentTestEnv( api = NewAPI(wrappedClient) } mux := http.NewServeMux() - api.Init(mux) + registry := prometheus.NewRegistry() + api.Init(mux, registry) httpServer := httptest.NewServer(mux) env.HTTPServer = httpServer From 8a8bfb93ed7c9ef41dfd67b26f4a29bc4fa5b833 Mon Sep 17 00:00:00 2001 From: mblos Date: Mon, 23 Mar 2026 16:00:28 +0100 Subject: [PATCH 2/7] unify logging --- .../reservations/commitments/api.go | 3 -- .../commitments/api_change_commitments.go | 7 ++-- .../reservations/commitments/api_info.go | 23 +++++-------- .../commitments/api_report_capacity.go | 20 ++++------- .../reservations/commitments/client.go | 23 ++++++------- .../reservations/commitments/context.go | 18 +++++----- .../reservations/commitments/controller.go | 11 +++--- .../reservations/commitments/state.go | 4 +-- .../reservations/commitments/syncer.go | 34 +++++++++---------- 9 files changed, 63 insertions(+), 80 deletions(-) diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go index a97b00580..49e1ee261 100644 --- a/internal/scheduling/reservations/commitments/api.go +++ b/internal/scheduling/reservations/commitments/api.go @@ -8,7 +8,6 @@ import ( "sync" "github.com/prometheus/client_golang/prometheus" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -39,5 +38,3 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) { // mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity) mux.HandleFunc("/v1/commitments/info", api.HandleInfo) } - -var commitmentApiLog = ctrl.Log.WithName("commitment_api") diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 82b8846c1..016e42731 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -21,12 +21,9 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) -var apiLog = ctrl.Log.WithName("commitment-reservation-api") - // sortedKeys returns map keys sorted alphabetically for deterministic iteration. func sortedKeys[K ~string, V any](m map[K]V) []K { keys := make([]K, 0, len(m)) @@ -69,8 +66,8 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque if requestID == "" { requestID = uuid.New().String() } - ctx := reservations.WithGlobalRequestID(context.Background(), requestID) - logger := APILoggerFromContext(ctx).WithValues("endpoint", "/v1/change-commitments") + ctx := reservations.WithGlobalRequestID(context.Background(), "committed-resource-"+requestID) + logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/change-commitments") // Only accept POST method if r.Method != http.MethodPost { diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go index db02dd708..140dc54d4 100644 --- a/internal/scheduling/reservations/commitments/api_info.go +++ b/internal/scheduling/reservations/commitments/api_info.go @@ -9,7 +9,6 @@ import ( "fmt" "net/http" "strings" - "time" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/go-logr/logr" @@ -20,12 +19,8 @@ import ( // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { - // Extract or generate request ID for tracing - requestID := r.Header.Get("X-Request-ID") - if requestID == "" { - requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) - } - log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/info") + ctx := WithNewGlobalRequestID(r.Context()) + logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/info") // Only accept GET method if r.Method != http.MethodGet { @@ -33,13 +28,13 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { return } - log.V(1).Info("processing info request") + logger.V(1).Info("processing info request") // Build info response - info, err := api.buildServiceInfo(r.Context(), log) + info, err := api.buildServiceInfo(ctx, logger) if err != nil { // Use Info level for expected conditions like knowledge not being ready yet - log.Info("service info not available yet", "error", err.Error()) + logger.Info("service info not available yet", "error", err.Error()) http.Error(w, "Service temporarily unavailable: "+err.Error(), http.StatusServiceUnavailable) return @@ -49,13 +44,13 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) if err := json.NewEncoder(w).Encode(info); err != nil { - log.Error(err, "failed to encode service info") + logger.Error(err, "failed to encode service info") return } } // buildServiceInfo constructs the ServiceInfo response with metadata for all flavor groups. -func (api *HTTPAPI) buildServiceInfo(ctx context.Context, log logr.Logger) (liquid.ServiceInfo, error) { +func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (liquid.ServiceInfo, error) { // Get all flavor groups from Knowledge CRDs knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) @@ -92,7 +87,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, log logr.Logger) (liqu HandlesCommitments: true, // We handle commitment changes via /v1/change-commitments } - log.V(1).Info("registered flavor group resource", + logger.V(1).Info("registered flavor group resource", "resourceName", resourceName, "flavorGroup", groupName, "displayName", displayName, @@ -106,7 +101,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, log logr.Logger) (liqu version = knowledgeCRD.Status.LastContentChange.Unix() } - log.Info("built service info", + logger.Info("built service info", "resourceCount", len(resources), "version", version) diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go index 0ec1f5e7d..f526dbd2c 100644 --- a/internal/scheduling/reservations/commitments/api_report_capacity.go +++ b/internal/scheduling/reservations/commitments/api_report_capacity.go @@ -5,9 +5,7 @@ package commitments import ( "encoding/json" - "fmt" "net/http" - "time" "github.com/sapcc/go-api-declarations/liquid" ) @@ -17,12 +15,8 @@ import ( // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // Reports available capacity across all flavor group resources. Note, unit is specified in the Info API response with multiple of the smallest memory resource unit within a flavor group. func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) { - // Extract or generate request ID for tracing - requestID := r.Header.Get("X-Request-ID") - if requestID == "" { - requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) - } - log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/report-capacity") + ctx := WithNewGlobalRequestID(r.Context()) + logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/report-capacity") // Only accept POST method if r.Method != http.MethodPost { @@ -30,7 +24,7 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) return } - log.V(1).Info("processing report capacity request") + logger.V(1).Info("processing report capacity request") // Parse request body (may be empty or contain ServiceCapacityRequest) var req liquid.ServiceCapacityRequest @@ -41,21 +35,21 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) // Calculate capacity calculator := NewCapacityCalculator(api.client) - report, err := calculator.CalculateCapacity(r.Context()) + report, err := calculator.CalculateCapacity(ctx) if err != nil { - log.Error(err, "failed to calculate capacity") + logger.Error(err, "failed to calculate capacity") http.Error(w, "Failed to calculate capacity: "+err.Error(), http.StatusInternalServerError) return } - log.Info("calculated capacity report", "resourceCount", len(report.Resources)) + logger.Info("calculated capacity report", "resourceCount", len(report.Resources)) // Return response w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) if err := json.NewEncoder(w).Encode(report); err != nil { - log.Error(err, "failed to encode capacity report") + logger.Error(err, "failed to encode capacity report") return } } diff --git a/internal/scheduling/reservations/commitments/client.go b/internal/scheduling/reservations/commitments/client.go index 2e5585c99..b31feb7c0 100644 --- a/internal/scheduling/reservations/commitments/client.go +++ b/internal/scheduling/reservations/commitments/client.go @@ -17,7 +17,6 @@ import ( "github.com/gophercloud/gophercloud/v2/openstack/identity/v3/projects" "github.com/sapcc/go-bits/jobloop" "github.com/sapcc/go-bits/must" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -48,7 +47,7 @@ func NewCommitmentsClient() CommitmentsClient { } func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf SyncerConfig) error { - log := ctrl.Log.WithName("CommitmentClient") + logger := LoggerFromContext(ctx).WithValues("component", "client") var authenticatedHTTP = http.DefaultClient if conf.SSOSecretRef != nil { @@ -72,7 +71,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "identity", Availability: "public", })) - log.Info("using identity endpoint", "url", url) + logger.Info("using identity endpoint", "url", url) c.keystone = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -84,7 +83,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "compute", Availability: "public", })) - log.Info("using nova endpoint", "url", url) + logger.Info("using nova endpoint", "url", url) c.nova = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -97,7 +96,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "resources", Availability: "public", })) - log.Info("using limes endpoint", "url", url) + logger.Info("using limes endpoint", "url", url) c.limes = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -107,9 +106,9 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf } func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error) { - log := ctrl.Log.WithName("CommitmentClient") + logger := LoggerFromContext(ctx).WithValues("component", "client") - log.V(1).Info("fetching projects from keystone") + logger.V(1).Info("fetching projects from keystone") allPages, err := projects.List(c.keystone, nil).AllPages(ctx) if err != nil { return nil, err @@ -120,15 +119,15 @@ func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error) if err := allPages.(projects.ProjectPage).ExtractInto(data); err != nil { return nil, err } - log.V(1).Info("fetched projects from keystone", "count", len(data.Projects)) + logger.V(1).Info("fetched projects from keystone", "count", len(data.Projects)) return data.Projects, nil } // ListCommitmentsByID fetches commitments for all projects in parallel. func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects ...Project) (map[string]Commitment, error) { - log := ctrl.Log.WithName("CommitmentClient") + logger := LoggerFromContext(ctx).WithValues("component", "client") - log.V(1).Info("fetching commitments from limes", "projects", len(projects)) + logger.V(1).Info("fetching commitments from limes", "projects", len(projects)) commitmentsMutex := gosync.Mutex{} commitments := make(map[string]Commitment) var wg gosync.WaitGroup @@ -161,11 +160,11 @@ func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects .. // Return the first error encountered, if any. for err := range errChan { if err != nil { - log.Error(err, "failed to resolve commitments") + logger.Error(err, "failed to resolve commitments") return nil, err } } - log.V(1).Info("resolved commitments from limes", "count", len(commitments)) + logger.V(1).Info("resolved commitments from limes", "count", len(commitments)) return commitments, nil } diff --git a/internal/scheduling/reservations/commitments/context.go b/internal/scheduling/reservations/commitments/context.go index a461ac1c2..64257fcae 100644 --- a/internal/scheduling/reservations/commitments/context.go +++ b/internal/scheduling/reservations/commitments/context.go @@ -9,21 +9,23 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/go-logr/logr" "github.com/google/uuid" + ctrl "sigs.k8s.io/controller-runtime" ) +// baseLog is the base logger for all committed-resource operations. +// Use LoggerFromContext to get a logger with request tracking values. +var baseLog = ctrl.Log.WithName("committed-resource") + +// WithNewGlobalRequestID creates a new context with a committed-resource-prefixed global request ID. func WithNewGlobalRequestID(ctx context.Context) context.Context { return reservations.WithGlobalRequestID(ctx, "committed-resource-"+uuid.New().String()) } +// LoggerFromContext returns a logger with greq and req values from the context. +// This creates a child logger with the request tracking values pre-attached, +// so you don't need to repeat them in every log call. func LoggerFromContext(ctx context.Context) logr.Logger { - return commitmentLog.WithValues( - "greq", reservations.GlobalRequestIDFromContext(ctx), - "req", reservations.RequestIDFromContext(ctx), - ) -} - -func APILoggerFromContext(ctx context.Context) logr.Logger { - return apiLog.WithValues( + return baseLog.WithValues( "greq", reservations.GlobalRequestIDFromContext(ctx), "req", reservations.RequestIDFromContext(ctx), ) diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index 6a30b6b6c..f68e35cde 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -31,8 +31,6 @@ import ( "github.com/go-logr/logr" ) -var commitmentLog = ctrl.Log.WithName("commitment-reservation-controller") - // CommitmentReservationController reconciles commitment Reservation objects type CommitmentReservationController struct { // Client for the kubernetes API. @@ -51,7 +49,8 @@ type CommitmentReservationController struct { // move the current state of the cluster closer to the desired state. // Note: This controller only handles commitment reservations, as filtered by the predicate. func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - logger := LoggerFromContext(ctx).WithValues("reservation", req.Name, "namespace", req.Namespace) + ctx = WithNewGlobalRequestID(ctx) + logger := LoggerFromContext(ctx).WithValues("component", "controller", "reservation", req.Name, "namespace", req.Namespace) // Fetch the reservation object. var res v1alpha1.Reservation if err := r.Get(ctx, req.NamespacedName, &res); err != nil { @@ -323,7 +322,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr // reconcileAllocations verifies all allocations in Spec against actual Nova VM state. // It updates Status.Allocations based on the actual host location of each VM. func (r *CommitmentReservationController) reconcileAllocations(ctx context.Context, res *v1alpha1.Reservation) error { - logger := LoggerFromContext(ctx) + logger := LoggerFromContext(ctx).WithValues("component", "controller") // Skip if no CommittedResourceReservation if res.Spec.CommittedResourceReservation == nil { @@ -440,7 +439,7 @@ func (r *CommitmentReservationController) listServersByProjectID(ctx context.Con return nil, errors.New("database connection not initialized") } - log := logf.FromContext(ctx) + logger := LoggerFromContext(ctx).WithValues("component", "controller") // Query servers from the database cache. var servers []nova.Server @@ -451,7 +450,7 @@ func (r *CommitmentReservationController) listServersByProjectID(ctx context.Con return nil, fmt.Errorf("failed to query servers from database: %w", err) } - log.V(1).Info("queried servers from database", + logger.V(1).Info("queried servers from database", "projectID", projectID, "serverCount", len(servers)) diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go index bfa6fdb28..11bbc4f1d 100644 --- a/internal/scheduling/reservations/commitments/state.go +++ b/internal/scheduling/reservations/commitments/state.go @@ -199,8 +199,8 @@ func FromReservations(reservations []v1alpha1.Reservation) (*CommitmentState, er } // check flavor group consistency, ignore if not matching to repair corrupted state in k8s if res.Spec.CommittedResourceReservation.ResourceGroup != state.FlavorGroupName { - // log message - commitmentLog.Error(errors.New("inconsistent flavor group in reservation"), + // log message using baseLog since this is a static function without context + baseLog.Error(errors.New("inconsistent flavor group in reservation"), "reservation belongs to same commitment but has different flavor group - ignoring reservation for capacity calculation", "reservationName", res.Name, "expectedFlavorGroup", state.FlavorGroupName, diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index 2f302479a..b7db11351 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -14,7 +14,6 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -132,35 +131,36 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo func (s *Syncer) SyncReservations(ctx context.Context) error { // TODO handle concurrency with change API: consider creation time of reservations and status ready - // Create logger with run ID for this sync execution + // Create context with request ID for this sync execution runID := fmt.Sprintf("sync-%d", time.Now().Unix()) - log := ctrl.Log.WithName("CommitmentSyncer").WithValues("runID", runID) + ctx = WithNewGlobalRequestID(ctx) + logger := LoggerFromContext(ctx).WithValues("component", "syncer", "runID", runID) - log.Info("starting commitment sync with sync interval", "interval", DefaultSyncerConfig().SyncInterval) + logger.Info("starting commitment sync with sync interval", "interval", DefaultSyncerConfig().SyncInterval) // Check if flavor group knowledge is ready knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client} knowledgeCRD, err := knowledge.Get(ctx) if err != nil { - log.Error(err, "failed to check flavor group knowledge readiness") + logger.Error(err, "failed to check flavor group knowledge readiness") return err } if knowledgeCRD == nil { - log.Info("skipping commitment sync - flavor group knowledge not ready yet") + logger.Info("skipping commitment sync - flavor group knowledge not ready yet") return nil } // Get flavor groups using the CRD we already fetched flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, knowledgeCRD) if err != nil { - log.Error(err, "failed to get flavor groups from knowledge") + logger.Error(err, "failed to get flavor groups from knowledge") return err } // Get all commitments as states - commitmentStates, err := s.getCommitmentStates(ctx, log, flavorGroups) + commitmentStates, err := s.getCommitmentStates(ctx, logger, flavorGroups) if err != nil { - log.Error(err, "failed to get compute commitments") + logger.Error(err, "failed to get compute commitments") return err } @@ -169,15 +169,15 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { // Apply each commitment state using the manager for _, state := range commitmentStates { - log.Info("applying commitment state", + logger.Info("applying commitment state", "commitmentUUID", state.CommitmentUUID, "projectID", state.ProjectID, "flavorGroup", state.FlavorGroupName, "totalMemoryBytes", state.TotalMemoryBytes) - _, _, err := manager.ApplyCommitmentState(ctx, log, state, flavorGroups, CreatorValue) + _, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue) if err != nil { - log.Error(err, "failed to apply commitment state", + logger.Error(err, "failed to apply commitment state", "commitmentUUID", state.CommitmentUUID) // Continue with other commitments even if one fails continue @@ -190,7 +190,7 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { if err := s.List(ctx, &existingReservations, client.MatchingLabels{ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, }); err != nil { - log.Error(err, "failed to list existing committed resource reservations") + logger.Error(err, "failed to list existing committed resource reservations") return err } @@ -205,22 +205,22 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { // Extract commitment UUID from reservation name commitmentUUID := extractCommitmentUUID(existing.Name) if commitmentUUID == "" { - log.Info("skipping reservation with unparseable name", "name", existing.Name) + logger.Info("skipping reservation with unparseable name", "name", existing.Name) continue } if !activeCommitments[commitmentUUID] { // This commitment no longer exists, delete the reservation if err := s.Delete(ctx, &existing); err != nil { - log.Error(err, "failed to delete reservation", "name", existing.Name) + logger.Error(err, "failed to delete reservation", "name", existing.Name) return err } - log.Info("deleted reservation for expired commitment", + logger.Info("deleted reservation for expired commitment", "name", existing.Name, "commitmentUUID", commitmentUUID) } } - log.Info("synced reservations", "commitmentCount", len(commitmentStates)) + logger.Info("synced reservations", "commitmentCount", len(commitmentStates)) return nil } From 10380c971f80e5a0fcf011edf24d1cb7020e79f6 Mon Sep 17 00:00:00 2001 From: mblos Date: Mon, 23 Mar 2026 16:08:55 +0100 Subject: [PATCH 3/7] metrics --- .../cortex-nova/alerts/nova.alerts.yaml | 52 ++++++++++++++++++- .../api_change_commitments_monitor.go | 10 ++-- .../api_change_commitments_monitor_test.go | 12 ++--- 3 files changed, 62 insertions(+), 12 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 65de5c626..8a757d9fc 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -274,4 +274,54 @@ groups: description: > This may indicate issues with the pipeline configuration. It is recommended to investigate the - pipeline status and logs for more details. \ No newline at end of file + pipeline status and logs for more details. + + # Committed Resource (Limes Integration) Alerts + - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 500 errors too high" + description: > + The committed resource change API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates that Cortex is having issues + processing commitment changes from Limes. + + - alert: CortexNovaCommittedResourceLatencyTooHigh + expr: histogram_quantile(0.95, rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) > 30 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API latency too high" + description: > + The committed resource change API (Limes LIQUID integration) is experiencing + high latency (p95 > 30s). + + - alert: CortexNovaCommittedResourceRejectionRateTooHigh + expr: | + rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[1h]) + / rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[1h]) > 0.5 + for: 15m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource rejection rate too high" + description: > + More than 50% of commitment change requests are being rejected. + This may indicate insufficient capacity to fulfill new commitments + or issues with the commitment scheduling logic. diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go index 5bfabf175..873e57eb0 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go @@ -18,15 +18,15 @@ type ChangeCommitmentsAPIMonitor struct { func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor { return ChangeCommitmentsAPIMonitor{ requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_cr_change_api_requests_total", - Help: "Total number of CR change API requests by HTTP status code", + Name: "cortex_committed_resource_change_api_requests_total", + Help: "Total number of committed resource change API requests by HTTP status code", }, []string{"status_code"}), requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_cr_change_api_request_duration_seconds", - Help: "Duration of CR change API requests in seconds by HTTP status code", + Name: "cortex_committed_resource_change_api_request_duration_seconds", + Help: "Duration of committed resource change API requests in seconds by HTTP status code", }, []string{"status_code"}), commitmentChanges: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_cr_change_api_commitment_changes_total", + Name: "cortex_committed_resource_change_api_commitment_changes_total", Help: "Total number of commitment changes processed by result", }, []string{"result"}), } diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go index bf3acfb40..7d8b080b7 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go @@ -38,17 +38,17 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) { for _, family := range families { switch *family.Name { - case "cortex_cr_change_api_requests_total": + case "cortex_committed_resource_change_api_requests_total": foundRequestCounter = true if *family.Type != dto.MetricType_COUNTER { t.Errorf("Expected counter metric type, got %v", *family.Type) } - case "cortex_cr_change_api_request_duration_seconds": + case "cortex_committed_resource_change_api_request_duration_seconds": foundRequestDuration = true if *family.Type != dto.MetricType_HISTOGRAM { t.Errorf("Expected histogram metric type, got %v", *family.Type) } - case "cortex_cr_change_api_commitment_changes_total": + case "cortex_committed_resource_change_api_commitment_changes_total": foundCommitmentChanges = true if *family.Type != dto.MetricType_COUNTER { t.Errorf("Expected counter metric type, got %v", *family.Type) @@ -91,7 +91,7 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { // Verify request counter has correct labels for _, family := range families { - if *family.Name == "cortex_cr_change_api_requests_total" { + if *family.Name == "cortex_committed_resource_change_api_requests_total" { if len(family.Metric) != 3 { t.Errorf("Expected 3 request counter metrics, got %d", len(family.Metric)) } @@ -109,7 +109,7 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { } } - if *family.Name == "cortex_cr_change_api_request_duration_seconds" { + if *family.Name == "cortex_committed_resource_change_api_request_duration_seconds" { if len(family.Metric) != 1 { t.Errorf("Expected 1 histogram metric, got %d", len(family.Metric)) } @@ -127,7 +127,7 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { } } - if *family.Name == "cortex_cr_change_api_commitment_changes_total" { + if *family.Name == "cortex_committed_resource_change_api_commitment_changes_total" { if len(family.Metric) != 2 { t.Errorf("Expected 2 commitment changes metrics, got %d", len(family.Metric)) } From 2064b9184614cc458b0562e3e21f0450393f4031 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 09:01:03 +0100 Subject: [PATCH 4/7] fix --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 8a757d9fc..538b9279c 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -311,7 +311,7 @@ groups: - alert: CortexNovaCommittedResourceRejectionRateTooHigh expr: | rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[1h]) - / rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[1h]) > 0.5 + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[1h])) > 0.5 for: 15m labels: context: committed-resource-api From 19b182ff6957e52ea68dd190d7e467ec6e9a21f8 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 09:09:47 +0100 Subject: [PATCH 5/7] timeout alert --- .../cortex-nova/alerts/nova.alerts.yaml | 53 +++++++++++++++++-- .../commitments/api_change_commitments.go | 1 + .../api_change_commitments_monitor.go | 7 +++ .../api_change_commitments_monitor_test.go | 12 ++++- 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 538b9279c..c8db56b8c 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -277,6 +277,24 @@ groups: pipeline status and logs for more details. # Committed Resource (Limes Integration) Alerts + - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 400 errors too high" + description: > + The committed resource change API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may happen when Limes sends a request with + an outdated info version (409), the API is temporarily unavailable, + or the request format is invalid. Limes will typically retry these + requests, so no immediate action is needed unless the errors persist. + - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 for: 5m @@ -290,8 +308,10 @@ groups: summary: "Committed Resource change API HTTP 500 errors too high" description: > The committed resource change API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates that Cortex is having issues - processing commitment changes from Limes. + with HTTP 5xx errors. This is not expected and indicates that Cortex + is having an internal problem processing commitment changes. Limes will + continue to retry, but new commitments may not be fulfilled until the + issue is resolved. - alert: CortexNovaCommittedResourceLatencyTooHigh expr: histogram_quantile(0.95, rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) > 30 @@ -306,7 +326,9 @@ groups: summary: "Committed Resource change API latency too high" description: > The committed resource change API (Limes LIQUID integration) is experiencing - high latency (p95 > 30s). + high latency (p95 > 30s). This may indicate that the scheduling pipeline + is under heavy load or that reservation scheduling is taking longer than + expected. Limes requests may time out, causing commitment changes to fail. - alert: CortexNovaCommittedResourceRejectionRateTooHigh expr: | @@ -323,5 +345,26 @@ groups: summary: "Committed Resource rejection rate too high" description: > More than 50% of commitment change requests are being rejected. - This may indicate insufficient capacity to fulfill new commitments - or issues with the commitment scheduling logic. + This may indicate insufficient capacity in the datacenter to fulfill + new commitments, or issues with the commitment scheduling logic. + Rejected commitments are rolled back, so Limes will see them as failed + and may retry or report the failure to users. + + - alert: CortexNovaCommittedResourceTimeoutsTooHigh + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API timeouts too high" + description: > + The committed resource change API (Limes LIQUID integration) timed out + while waiting for reservations to become ready. This indicates that the + scheduling pipeline is overloaded or reservations are taking too long + to be scheduled. Affected commitment changes are rolled back and Limes + will see them as failed. Consider investigating the scheduler performance + or increasing the timeout configuration. diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 016e42731..2488d5843 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -272,6 +272,7 @@ ProcessLoop: } if len(failedReservations) == 0 { resp.RejectionReason += "timeout reached while processing commitment changes" + api.monitor.timeouts.Inc() } requireRollback = true } diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go index 873e57eb0..30a09d49e 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go @@ -12,6 +12,7 @@ type ChangeCommitmentsAPIMonitor struct { requestCounter *prometheus.CounterVec requestDuration *prometheus.HistogramVec commitmentChanges *prometheus.CounterVec + timeouts prometheus.Counter } // NewChangeCommitmentsAPIMonitor creates a new monitor with Prometheus metrics. @@ -29,6 +30,10 @@ func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor { Name: "cortex_committed_resource_change_api_commitment_changes_total", Help: "Total number of commitment changes processed by result", }, []string{"result"}), + timeouts: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_change_api_timeouts_total", + Help: "Total number of commitment change requests that timed out while waiting for reservations to become ready", + }), } } @@ -37,6 +42,7 @@ func (m *ChangeCommitmentsAPIMonitor) Describe(ch chan<- *prometheus.Desc) { m.requestCounter.Describe(ch) m.requestDuration.Describe(ch) m.commitmentChanges.Describe(ch) + m.timeouts.Describe(ch) } // Collect implements prometheus.Collector. @@ -44,4 +50,5 @@ func (m *ChangeCommitmentsAPIMonitor) Collect(ch chan<- prometheus.Metric) { m.requestCounter.Collect(ch) m.requestDuration.Collect(ch) m.commitmentChanges.Collect(ch) + m.timeouts.Collect(ch) } diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go index 7d8b080b7..322d01aae 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go @@ -24,6 +24,7 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) { monitor.requestCounter.WithLabelValues("200").Inc() monitor.requestDuration.WithLabelValues("200").Observe(0.1) monitor.commitmentChanges.WithLabelValues("success").Inc() + monitor.timeouts.Inc() // Verify metrics can be gathered families, err := registry.Gather() @@ -31,10 +32,11 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) { t.Fatalf("Failed to gather metrics: %v", err) } - // Check that all three metrics are present + // Check that all metrics are present foundRequestCounter := false foundRequestDuration := false foundCommitmentChanges := false + foundTimeouts := false for _, family := range families { switch *family.Name { @@ -53,6 +55,11 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) { if *family.Type != dto.MetricType_COUNTER { t.Errorf("Expected counter metric type, got %v", *family.Type) } + case "cortex_committed_resource_change_api_timeouts_total": + foundTimeouts = true + if *family.Type != dto.MetricType_COUNTER { + t.Errorf("Expected counter metric type, got %v", *family.Type) + } } } @@ -65,6 +72,9 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) { if !foundCommitmentChanges { t.Error("Commitment changes counter not found in registry") } + if !foundTimeouts { + t.Error("Timeouts counter not found in registry") + } } func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { From 37da104062fcd1f724f0a4578e1f02c44d62d628 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 09:21:10 +0100 Subject: [PATCH 6/7] 5m instead of 1h --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index c8db56b8c..864651fba 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -332,9 +332,9 @@ groups: - alert: CortexNovaCommittedResourceRejectionRateTooHigh expr: | - rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[1h]) - / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[1h])) > 0.5 - for: 15m + rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m]) + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5 + for: 5m labels: context: committed-resource-api dashboard: cortex/cortex @@ -351,7 +351,7 @@ groups: and may retry or report the failure to users. - alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[1h]) > 0 + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0 for: 5m labels: context: committed-resource-api From a0f46276b823bd0002918f2a196fb69ef801adb6 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 24 Mar 2026 09:36:45 +0100 Subject: [PATCH 7/7] . --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 864651fba..675c5a27a 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -314,7 +314,7 @@ groups: issue is resolved. - alert: CortexNovaCommittedResourceLatencyTooHigh - expr: histogram_quantile(0.95, rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) > 30 + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30 for: 5m labels: context: committed-resource-api