cobaltcore-dev · mblos · Mar 24, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
@@ -326,7 +326,7 @@ func main() {
 
 		// Initialize commitments API for LIQUID interface
 		commitmentsAPI := commitments.NewAPI(multiclusterClient)
-		commitmentsAPI.Init(mux)
+		commitmentsAPI.Init(mux, metrics.Registry)
 
 		// Detector pipeline controller setup.
 		novaClient := nova.NewNovaClient()

@@ -274,4 +274,97 @@ groups:
       description: >
         This may indicate issues with the pipeline
         configuration. It is recommended to investigate the
-        pipeline status and logs for more details.
+        pipeline status and logs for more details.
+
+  # Committed Resource (Limes Integration) Alerts
+  - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh
+    expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource change API HTTP 400 errors too high"
+      description: >
+        The committed resource change API (Limes LIQUID integration) is responding
+        with HTTP 4xx errors. This may happen when Limes sends a request with
+        an outdated info version (409), the API is temporarily unavailable,
+        or the request format is invalid. Limes will typically retry these
+        requests, so no immediate action is needed unless the errors persist.
+
+  - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh
+    expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource change API HTTP 500 errors too high"
+      description: >
+        The committed resource change API (Limes LIQUID integration) is responding
+        with HTTP 5xx errors. This is not expected and indicates that Cortex
+        is having an internal problem processing commitment changes. Limes will
+        continue to retry, but new commitments may not be fulfilled until the
+        issue is resolved.
+
+  - alert: CortexNovaCommittedResourceLatencyTooHigh
+    expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource change API latency too high"
+      description: >
+        The committed resource change API (Limes LIQUID integration) is experiencing
+        high latency (p95 > 30s). This may indicate that the scheduling pipeline
+        is under heavy load or that reservation scheduling is taking longer than
+        expected. Limes requests may time out, causing commitment changes to fail.
+
+  - alert: CortexNovaCommittedResourceRejectionRateTooHigh
+    expr: |
+      rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])
+      / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource rejection rate too high"
+      description: >
+        More than 50% of commitment change requests are being rejected.
+        This may indicate insufficient capacity in the datacenter to fulfill
+        new commitments, or issues with the commitment scheduling logic.
+        Rejected commitments are rolled back, so Limes will see them as failed
+        and may retry or report the failure to users.
+
+  - alert: CortexNovaCommittedResourceTimeoutsTooHigh
+    expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource change API timeouts too high"
+      description: >
+        The committed resource change API (Limes LIQUID integration) timed out
+        while waiting for reservations to become ready. This indicates that the
+        scheduling pipeline is overloaded or reservations are taking too long
+        to be scheduled. Affected commitment changes are rolled back and Limes
+        will see them as failed. Consider investigating the scheduler performance
+        or increasing the timeout configuration.
@@ -7,14 +7,15 @@ import (
 	"net/http"
 	"sync"
 
-	ctrl "sigs.k8s.io/controller-runtime"
+	"github.com/prometheus/client_golang/prometheus"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
 // HTTPAPI implements Limes LIQUID commitment validation endpoints.
 type HTTPAPI struct {
-	client client.Client
-	config Config
+	client  client.Client
+	config  Config
+	monitor ChangeCommitmentsAPIMonitor
 	// Mutex to serialize change-commitments requests
 	changeMutex sync.Mutex
 }
@@ -25,15 +26,15 @@ func NewAPI(client client.Client) *HTTPAPI {
 
 func NewAPIWithConfig(client client.Client, config Config) *HTTPAPI {
 	return &HTTPAPI{
-		client: client,
-		config: config,
+		client:  client,
+		config:  config,
+		monitor: NewChangeCommitmentsAPIMonitor(),
 	}
 }
 
-func (api *HTTPAPI) Init(mux *http.ServeMux) {
+func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) {
+	registry.MustRegister(&api.monitor)
 	mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
 	// mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity)
 	mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
 }
-
-var commitmentApiLog = ctrl.Log.WithName("commitment_api")
@@ -21,12 +21,9 @@ import (
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
-	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-var apiLog = ctrl.Log.WithName("commitment-reservation-api")
-
 // sortedKeys returns map keys sorted alphabetically for deterministic iteration.
 func sortedKeys[K ~string, V any](m map[K]V) []K {
 	keys := make([]K, 0, len(m))
@@ -46,9 +43,17 @@ func sortedKeys[K ~string, V any](m map[K]V) []K {
 // This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle.
 // A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back.
 func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) {
+	startTime := time.Now()
+	// Initialize
+	resp := liquid.CommitmentChangeResponse{}
+	req := liquid.CommitmentChangeRequest{}
+	statusCode := http.StatusOK
+
 	// Check if API is enabled
 	if !api.config.EnableChangeCommitmentsAPI {
-		http.Error(w, "change-commitments API is disabled", http.StatusServiceUnavailable)
+		statusCode = http.StatusServiceUnavailable
+		http.Error(w, "change-commitments API is disabled", statusCode)
+		api.recordMetrics(req, resp, statusCode, startTime)
 		return
 	}
 
@@ -61,31 +66,32 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
 	if requestID == "" {
 		requestID = uuid.New().String()
 	}
-	ctx := reservations.WithGlobalRequestID(context.Background(), requestID)
-	logger := APILoggerFromContext(ctx).WithValues("endpoint", "/v1/change-commitments")
+	ctx := reservations.WithGlobalRequestID(context.Background(), "committed-resource-"+requestID)
+	logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/change-commitments")
 
 	// Only accept POST method
 	if r.Method != http.MethodPost {
-		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		statusCode = http.StatusMethodNotAllowed
+		http.Error(w, "Method not allowed", statusCode)
+		api.recordMetrics(req, resp, statusCode, startTime)
 		return
 	}
 
 	// Parse request body
-	var req liquid.CommitmentChangeRequest
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
 		logger.Error(err, "invalid request body")
-		http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest)
+		statusCode = http.StatusBadRequest
+		http.Error(w, "Invalid request body: "+err.Error(), statusCode)
+		api.recordMetrics(req, resp, statusCode, startTime)
 		return
 	}
 
 	logger.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ)
 
-	// Initialize response
-	resp := liquid.CommitmentChangeResponse{}
-
 	// Check for dry run -> early reject, not supported yet
 	if req.DryRun {
 		resp.RejectionReason = "Dry run not supported yet"
+		api.recordMetrics(req, resp, statusCode, startTime)
 		logger.Info("rejecting dry run request")
 		w.Header().Set("Content-Type", "application/json")
 		w.WriteHeader(http.StatusOK)
@@ -97,14 +103,26 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
 
 	// Process commitment changes
 	// For now, we'll implement a simplified path that checks capacity for immediate start CRs
+
 	if err := api.processCommitmentChanges(ctx, w, logger, req, &resp); err != nil {
 		// Error already written to response by processCommitmentChanges
+		// Determine status code from error context (409 or 503)
+		if strings.Contains(err.Error(), "version mismatch") {
+			statusCode = http.StatusConflict
+		} else if strings.Contains(err.Error(), "caches not ready") {
+			statusCode = http.StatusServiceUnavailable
+		}
+		// Record metrics for error cases
+		api.recordMetrics(req, resp, statusCode, startTime)
 		return
 	}
 
+	// Record metrics
+	api.recordMetrics(req, resp, statusCode, startTime)
+
 	// Return response
 	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
+	w.WriteHeader(statusCode)
 	if err := json.NewEncoder(w).Encode(resp); err != nil {
 		return
 	}
@@ -254,6 +272,7 @@ ProcessLoop:
 			}
 			if len(failedReservations) == 0 {
 				resp.RejectionReason += "timeout reached while processing commitment changes"
+				api.monitor.timeouts.Inc()
 			}
 			requireRollback = true
 		}

@@ -0,0 +1,44 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+	"strconv"
+	"time"
+
+	"github.com/sapcc/go-api-declarations/liquid"
+)
+
+// recordMetrics records Prometheus metrics for a change commitments request.
+func (api *HTTPAPI) recordMetrics(req liquid.CommitmentChangeRequest, resp liquid.CommitmentChangeResponse, statusCode int, startTime time.Time) {
+	duration := time.Since(startTime).Seconds()
+	statusCodeStr := strconv.Itoa(statusCode)
+
+	// Record request counter and duration
+	api.monitor.requestCounter.WithLabelValues(statusCodeStr).Inc()
+	api.monitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration)
+
+	// Count total commitment changes in the request
+	commitmentCount := countCommitments(req)
+
+	// Determine result based on response
+	result := "success"
+	if resp.RejectionReason != "" {
+		result = "rejected"
+	}
+
+	// Record commitment changes counter
+	api.monitor.commitmentChanges.WithLabelValues(result).Add(float64(commitmentCount))
+}
+
+// countCommitments counts the total number of commitments in a request.
+func countCommitments(req liquid.CommitmentChangeRequest) int {
+	count := 0
+	for _, projectChanges := range req.ByProject {
+		for _, resourceChanges := range projectChanges.ByResource {
+			count += len(resourceChanges.Commitments)
+		}
+	}
+	return count
+}
@@ -0,0 +1,54 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// ChangeCommitmentsAPIMonitor provides metrics for the CR change API.
+type ChangeCommitmentsAPIMonitor struct {
+	requestCounter    *prometheus.CounterVec
+	requestDuration   *prometheus.HistogramVec
+	commitmentChanges *prometheus.CounterVec
+	timeouts          prometheus.Counter
+}
+
+// NewChangeCommitmentsAPIMonitor creates a new monitor with Prometheus metrics.
+func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor {
+	return ChangeCommitmentsAPIMonitor{
+		requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "cortex_committed_resource_change_api_requests_total",
+			Help: "Total number of committed resource change API requests by HTTP status code",
+		}, []string{"status_code"}),
+		requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Name: "cortex_committed_resource_change_api_request_duration_seconds",
+			Help: "Duration of committed resource change API requests in seconds by HTTP status code",
+		}, []string{"status_code"}),
+		commitmentChanges: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "cortex_committed_resource_change_api_commitment_changes_total",
+			Help: "Total number of commitment changes processed by result",
+		}, []string{"result"}),
+		timeouts: prometheus.NewCounter(prometheus.CounterOpts{
+			Name: "cortex_committed_resource_change_api_timeouts_total",
+			Help: "Total number of commitment change requests that timed out while waiting for reservations to become ready",
+		}),
+	}
+}
+
+// Describe implements prometheus.Collector.
+func (m *ChangeCommitmentsAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
+	m.requestCounter.Describe(ch)
+	m.requestDuration.Describe(ch)
+	m.commitmentChanges.Describe(ch)
+	m.timeouts.Describe(ch)
+}
+
+// Collect implements prometheus.Collector.
+func (m *ChangeCommitmentsAPIMonitor) Collect(ch chan<- prometheus.Metric) {
+	m.requestCounter.Collect(ch)
+	m.requestDuration.Collect(ch)
+	m.commitmentChanges.Collect(ch)
+	m.timeouts.Collect(ch)
+}