From 57c3177631b636612d276a7679a84d93b7d1edd9 Mon Sep 17 00:00:00 2001
From: mblos <marcel.gute@sap.com>
Date: Mon, 23 Mar 2026 15:52:38 +0100
Subject: [PATCH 1/7] cr metrics added

---
 cmd/main.go                                   |   2 +-
 .../reservations/commitments/api.go           |  14 +-
 .../commitments/api_change_commitments.go     |  37 +++-
 .../api_change_commitments_metrics.go         |  44 ++++
 .../api_change_commitments_monitor.go         |  47 ++++
 .../api_change_commitments_monitor_test.go    | 203 ++++++++++++++++++
 .../api_change_commitments_test.go            |   4 +-
 7 files changed, 336 insertions(+), 15 deletions(-)
 create mode 100644 internal/scheduling/reservations/commitments/api_change_commitments_metrics.go
 create mode 100644 internal/scheduling/reservations/commitments/api_change_commitments_monitor.go
 create mode 100644 internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go

diff --git a/cmd/main.go b/cmd/main.go
index 1565c15cd..d152b21c6 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -326,7 +326,7 @@ func main() {
 
 		// Initialize commitments API for LIQUID interface
 		commitmentsAPI := commitments.NewAPI(multiclusterClient)
-		commitmentsAPI.Init(mux)
+		commitmentsAPI.Init(mux, metrics.Registry)
 
 		// Detector pipeline controller setup.
 		novaClient := nova.NewNovaClient()
diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go
index 9d8fd5944..a97b00580 100644
--- a/internal/scheduling/reservations/commitments/api.go
+++ b/internal/scheduling/reservations/commitments/api.go
@@ -7,14 +7,16 @@ import (
 	"net/http"
 	"sync"
 
+	"github.com/prometheus/client_golang/prometheus"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
 // HTTPAPI implements Limes LIQUID commitment validation endpoints.
 type HTTPAPI struct {
-	client client.Client
-	config Config
+	client  client.Client
+	config  Config
+	monitor ChangeCommitmentsAPIMonitor
 	// Mutex to serialize change-commitments requests
 	changeMutex sync.Mutex
 }
@@ -25,12 +27,14 @@ func NewAPI(client client.Client) *HTTPAPI {
 
 func NewAPIWithConfig(client client.Client, config Config) *HTTPAPI {
 	return &HTTPAPI{
-		client: client,
-		config: config,
+		client:  client,
+		config:  config,
+		monitor: NewChangeCommitmentsAPIMonitor(),
 	}
 }
 
-func (api *HTTPAPI) Init(mux *http.ServeMux) {
+func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) {
+	registry.MustRegister(&api.monitor)
 	mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
 	// mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity)
 	mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go
index 92398c974..82b8846c1 100644
--- a/internal/scheduling/reservations/commitments/api_change_commitments.go
+++ b/internal/scheduling/reservations/commitments/api_change_commitments.go
@@ -46,9 +46,17 @@ func sortedKeys[K ~string, V any](m map[K]V) []K {
 // This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle.
 // A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back.
 func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) {
+	startTime := time.Now()
+	// Initialize
+	resp := liquid.CommitmentChangeResponse{}
+	req := liquid.CommitmentChangeRequest{}
+	statusCode := http.StatusOK
+
 	// Check if API is enabled
 	if !api.config.EnableChangeCommitmentsAPI {
-		http.Error(w, "change-commitments API is disabled", http.StatusServiceUnavailable)
+		statusCode = http.StatusServiceUnavailable
+		http.Error(w, "change-commitments API is disabled", statusCode)
+		api.recordMetrics(req, resp, statusCode, startTime)
 		return
 	}
 
@@ -66,26 +74,27 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
 
 	// Only accept POST method
 	if r.Method != http.MethodPost {
-		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		statusCode = http.StatusMethodNotAllowed
+		http.Error(w, "Method not allowed", statusCode)
+		api.recordMetrics(req, resp, statusCode, startTime)
 		return
 	}
 
 	// Parse request body
-	var req liquid.CommitmentChangeRequest
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
 		logger.Error(err, "invalid request body")
-		http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest)
+		statusCode = http.StatusBadRequest
+		http.Error(w, "Invalid request body: "+err.Error(), statusCode)
+		api.recordMetrics(req, resp, statusCode, startTime)
 		return
 	}
 
 	logger.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ)
 
-	// Initialize response
-	resp := liquid.CommitmentChangeResponse{}
-
 	// Check for dry run -> early reject, not supported yet
 	if req.DryRun {
 		resp.RejectionReason = "Dry run not supported yet"
+		api.recordMetrics(req, resp, statusCode, startTime)
 		logger.Info("rejecting dry run request")
 		w.Header().Set("Content-Type", "application/json")
 		w.WriteHeader(http.StatusOK)
@@ -97,14 +106,26 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
 
 	// Process commitment changes
 	// For now, we'll implement a simplified path that checks capacity for immediate start CRs
+
 	if err := api.processCommitmentChanges(ctx, w, logger, req, &resp); err != nil {
 		// Error already written to response by processCommitmentChanges
+		// Determine status code from error context (409 or 503)
+		if strings.Contains(err.Error(), "version mismatch") {
+			statusCode = http.StatusConflict
+		} else if strings.Contains(err.Error(), "caches not ready") {
+			statusCode = http.StatusServiceUnavailable
+		}
+		// Record metrics for error cases
+		api.recordMetrics(req, resp, statusCode, startTime)
 		return
 	}
 
+	// Record metrics
+	api.recordMetrics(req, resp, statusCode, startTime)
+
 	// Return response
 	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
+	w.WriteHeader(statusCode)
 	if err := json.NewEncoder(w).Encode(resp); err != nil {
 		return
 	}
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_metrics.go b/internal/scheduling/reservations/commitments/api_change_commitments_metrics.go
new file mode 100644
index 000000000..edb09b5da
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/api_change_commitments_metrics.go
@@ -0,0 +1,44 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+	"strconv"
+	"time"
+
+	"github.com/sapcc/go-api-declarations/liquid"
+)
+
+// recordMetrics records Prometheus metrics for a change commitments request.
+func (api *HTTPAPI) recordMetrics(req liquid.CommitmentChangeRequest, resp liquid.CommitmentChangeResponse, statusCode int, startTime time.Time) {
+	duration := time.Since(startTime).Seconds()
+	statusCodeStr := strconv.Itoa(statusCode)
+
+	// Record request counter and duration
+	api.monitor.requestCounter.WithLabelValues(statusCodeStr).Inc()
+	api.monitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration)
+
+	// Count total commitment changes in the request
+	commitmentCount := countCommitments(req)
+
+	// Determine result based on response
+	result := "success"
+	if resp.RejectionReason != "" {
+		result = "rejected"
+	}
+
+	// Record commitment changes counter
+	api.monitor.commitmentChanges.WithLabelValues(result).Add(float64(commitmentCount))
+}
+
+// countCommitments counts the total number of commitments in a request.
+func countCommitments(req liquid.CommitmentChangeRequest) int {
+	count := 0
+	for _, projectChanges := range req.ByProject {
+		for _, resourceChanges := range projectChanges.ByResource {
+			count += len(resourceChanges.Commitments)
+		}
+	}
+	return count
+}
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go
new file mode 100644
index 000000000..5bfabf175
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go
@@ -0,0 +1,47 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// ChangeCommitmentsAPIMonitor provides metrics for the CR change API.
+type ChangeCommitmentsAPIMonitor struct {
+	requestCounter    *prometheus.CounterVec
+	requestDuration   *prometheus.HistogramVec
+	commitmentChanges *prometheus.CounterVec
+}
+
+// NewChangeCommitmentsAPIMonitor creates a new monitor with Prometheus metrics.
+func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor {
+	return ChangeCommitmentsAPIMonitor{
+		requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "cortex_cr_change_api_requests_total",
+			Help: "Total number of CR change API requests by HTTP status code",
+		}, []string{"status_code"}),
+		requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Name: "cortex_cr_change_api_request_duration_seconds",
+			Help: "Duration of CR change API requests in seconds by HTTP status code",
+		}, []string{"status_code"}),
+		commitmentChanges: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "cortex_cr_change_api_commitment_changes_total",
+			Help: "Total number of commitment changes processed by result",
+		}, []string{"result"}),
+	}
+}
+
+// Describe implements prometheus.Collector.
+func (m *ChangeCommitmentsAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
+	m.requestCounter.Describe(ch)
+	m.requestDuration.Describe(ch)
+	m.commitmentChanges.Describe(ch)
+}
+
+// Collect implements prometheus.Collector.
+func (m *ChangeCommitmentsAPIMonitor) Collect(ch chan<- prometheus.Metric) {
+	m.requestCounter.Collect(ch)
+	m.requestDuration.Collect(ch)
+	m.commitmentChanges.Collect(ch)
+}
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go
new file mode 100644
index 000000000..bf3acfb40
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go
@@ -0,0 +1,203 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+	"encoding/json"
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	"github.com/sapcc/go-api-declarations/liquid"
+)
+
+func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) {
+	registry := prometheus.NewRegistry()
+	monitor := NewChangeCommitmentsAPIMonitor()
+
+	if err := registry.Register(&monitor); err != nil {
+		t.Fatalf("Failed to register monitor: %v", err)
+	}
+
+	// Observe metrics before gathering (Prometheus metrics with labels only appear after being used)
+	monitor.requestCounter.WithLabelValues("200").Inc()
+	monitor.requestDuration.WithLabelValues("200").Observe(0.1)
+	monitor.commitmentChanges.WithLabelValues("success").Inc()
+
+	// Verify metrics can be gathered
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("Failed to gather metrics: %v", err)
+	}
+
+	// Check that all three metrics are present
+	foundRequestCounter := false
+	foundRequestDuration := false
+	foundCommitmentChanges := false
+
+	for _, family := range families {
+		switch *family.Name {
+		case "cortex_cr_change_api_requests_total":
+			foundRequestCounter = true
+			if *family.Type != dto.MetricType_COUNTER {
+				t.Errorf("Expected counter metric type, got %v", *family.Type)
+			}
+		case "cortex_cr_change_api_request_duration_seconds":
+			foundRequestDuration = true
+			if *family.Type != dto.MetricType_HISTOGRAM {
+				t.Errorf("Expected histogram metric type, got %v", *family.Type)
+			}
+		case "cortex_cr_change_api_commitment_changes_total":
+			foundCommitmentChanges = true
+			if *family.Type != dto.MetricType_COUNTER {
+				t.Errorf("Expected counter metric type, got %v", *family.Type)
+			}
+		}
+	}
+
+	if !foundRequestCounter {
+		t.Error("Request counter metric not found in registry")
+	}
+	if !foundRequestDuration {
+		t.Error("Request duration histogram not found in registry")
+	}
+	if !foundCommitmentChanges {
+		t.Error("Commitment changes counter not found in registry")
+	}
+}
+
+func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) {
+	registry := prometheus.NewRegistry()
+	monitor := NewChangeCommitmentsAPIMonitor()
+
+	if err := registry.Register(&monitor); err != nil {
+		t.Fatalf("Failed to register monitor: %v", err)
+	}
+
+	// Record some test metrics
+	monitor.requestCounter.WithLabelValues("200").Inc()
+	monitor.requestCounter.WithLabelValues("409").Inc()
+	monitor.requestCounter.WithLabelValues("503").Inc()
+	monitor.requestDuration.WithLabelValues("200").Observe(1.5)
+	monitor.commitmentChanges.WithLabelValues("success").Add(5)
+	monitor.commitmentChanges.WithLabelValues("rejected").Add(2)
+
+	// Gather metrics
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("Failed to gather metrics: %v", err)
+	}
+
+	// Verify request counter has correct labels
+	for _, family := range families {
+		if *family.Name == "cortex_cr_change_api_requests_total" {
+			if len(family.Metric) != 3 {
+				t.Errorf("Expected 3 request counter metrics, got %d", len(family.Metric))
+			}
+
+			// Check label names
+			for _, metric := range family.Metric {
+				labelNames := make(map[string]bool)
+				for _, label := range metric.Label {
+					labelNames[*label.Name] = true
+				}
+
+				if !labelNames["status_code"] {
+					t.Error("Missing 'status_code' label in request counter")
+				}
+			}
+		}
+
+		if *family.Name == "cortex_cr_change_api_request_duration_seconds" {
+			if len(family.Metric) != 1 {
+				t.Errorf("Expected 1 histogram metric, got %d", len(family.Metric))
+			}
+
+			// Check label names
+			for _, metric := range family.Metric {
+				labelNames := make(map[string]bool)
+				for _, label := range metric.Label {
+					labelNames[*label.Name] = true
+				}
+
+				if !labelNames["status_code"] {
+					t.Error("Missing 'status_code' label in histogram")
+				}
+			}
+		}
+
+		if *family.Name == "cortex_cr_change_api_commitment_changes_total" {
+			if len(family.Metric) != 2 {
+				t.Errorf("Expected 2 commitment changes metrics, got %d", len(family.Metric))
+			}
+
+			// Check label names
+			for _, metric := range family.Metric {
+				labelNames := make(map[string]bool)
+				for _, label := range metric.Label {
+					labelNames[*label.Name] = true
+				}
+
+				if !labelNames["result"] {
+					t.Error("Missing 'result' label in commitment changes counter")
+				}
+			}
+		}
+	}
+}
+
+func TestCountCommitments(t *testing.T) {
+	testCases := []struct {
+		name     string
+		request  CommitmentChangeRequest
+		expected int
+	}{
+		{
+			name: "Single commitment",
+			request: newCommitmentRequest("az-a", false, 1234,
+				createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2),
+			),
+			expected: 1,
+		},
+		{
+			name: "Multiple commitments same project",
+			request: newCommitmentRequest("az-a", false, 1234,
+				createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2),
+				createCommitment("ram_hana_2", "project-A", "uuid-2", "confirmed", 2),
+			),
+			expected: 2,
+		},
+		{
+			name: "Multiple commitments multiple projects",
+			request: newCommitmentRequest("az-a", false, 1234,
+				createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2),
+				createCommitment("ram_hana_1", "project-B", "uuid-2", "confirmed", 3),
+				createCommitment("ram_gp_1", "project-C", "uuid-3", "confirmed", 1),
+			),
+			expected: 3,
+		},
+		{
+			name:     "Empty request",
+			request:  newCommitmentRequest("az-a", false, 1234),
+			expected: 0,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Convert test request to liquid request
+			reqJSON := buildRequestJSON(tc.request)
+			var req liquid.CommitmentChangeRequest
+			if err := json.Unmarshal([]byte(reqJSON), &req); err != nil {
+				t.Fatalf("Failed to parse request: %v", err)
+			}
+
+			result := countCommitments(req)
+
+			if result != tc.expected {
+				t.Errorf("Expected %d commitments, got %d", tc.expected, result)
+			}
+		})
+	}
+}
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go
index 58f8cf997..4d68227a1 100644
--- a/internal/scheduling/reservations/commitments/api_change_commitments_test.go
+++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go
@@ -23,6 +23,7 @@ import (
 	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
 	"github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
 	hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sapcc/go-api-declarations/liquid"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -969,7 +970,8 @@ func newCommitmentTestEnv(
 		api = NewAPI(wrappedClient)
 	}
 	mux := http.NewServeMux()
-	api.Init(mux)
+	registry := prometheus.NewRegistry()
+	api.Init(mux, registry)
 	httpServer := httptest.NewServer(mux)
 
 	env.HTTPServer = httpServer

From 8a8bfb93ed7c9ef41dfd67b26f4a29bc4fa5b833 Mon Sep 17 00:00:00 2001
From: mblos <marcel.gute@sap.com>
Date: Mon, 23 Mar 2026 16:00:28 +0100
Subject: [PATCH 2/7] unify logging

---
 .../reservations/commitments/api.go           |  3 --
 .../commitments/api_change_commitments.go     |  7 ++--
 .../reservations/commitments/api_info.go      | 23 +++++--------
 .../commitments/api_report_capacity.go        | 20 ++++-------
 .../reservations/commitments/client.go        | 23 ++++++-------
 .../reservations/commitments/context.go       | 18 +++++-----
 .../reservations/commitments/controller.go    | 11 +++---
 .../reservations/commitments/state.go         |  4 +--
 .../reservations/commitments/syncer.go        | 34 +++++++++----------
 9 files changed, 63 insertions(+), 80 deletions(-)

diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go
index a97b00580..49e1ee261 100644
--- a/internal/scheduling/reservations/commitments/api.go
+++ b/internal/scheduling/reservations/commitments/api.go
@@ -8,7 +8,6 @@ import (
 	"sync"
 
 	"github.com/prometheus/client_golang/prometheus"
-	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
@@ -39,5 +38,3 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) {
 	// mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity)
 	mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
 }
-
-var commitmentApiLog = ctrl.Log.WithName("commitment_api")
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go
index 82b8846c1..016e42731 100644
--- a/internal/scheduling/reservations/commitments/api_change_commitments.go
+++ b/internal/scheduling/reservations/commitments/api_change_commitments.go
@@ -21,12 +21,9 @@ import (
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
-	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-var apiLog = ctrl.Log.WithName("commitment-reservation-api")
-
 // sortedKeys returns map keys sorted alphabetically for deterministic iteration.
 func sortedKeys[K ~string, V any](m map[K]V) []K {
 	keys := make([]K, 0, len(m))
@@ -69,8 +66,8 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
 	if requestID == "" {
 		requestID = uuid.New().String()
 	}
-	ctx := reservations.WithGlobalRequestID(context.Background(), requestID)
-	logger := APILoggerFromContext(ctx).WithValues("endpoint", "/v1/change-commitments")
+	ctx := reservations.WithGlobalRequestID(context.Background(), "committed-resource-"+requestID)
+	logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/change-commitments")
 
 	// Only accept POST method
 	if r.Method != http.MethodPost {
diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go
index db02dd708..140dc54d4 100644
--- a/internal/scheduling/reservations/commitments/api_info.go
+++ b/internal/scheduling/reservations/commitments/api_info.go
@@ -9,7 +9,6 @@ import (
 	"fmt"
 	"net/http"
 	"strings"
-	"time"
 
 	"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
 	"github.com/go-logr/logr"
@@ -20,12 +19,8 @@ import (
 // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go
 // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid
 func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) {
-	// Extract or generate request ID for tracing
-	requestID := r.Header.Get("X-Request-ID")
-	if requestID == "" {
-		requestID = fmt.Sprintf("req-%d", time.Now().UnixNano())
-	}
-	log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/info")
+	ctx := WithNewGlobalRequestID(r.Context())
+	logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/info")
 
 	// Only accept GET method
 	if r.Method != http.MethodGet {
@@ -33,13 +28,13 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	log.V(1).Info("processing info request")
+	logger.V(1).Info("processing info request")
 
 	// Build info response
-	info, err := api.buildServiceInfo(r.Context(), log)
+	info, err := api.buildServiceInfo(ctx, logger)
 	if err != nil {
 		// Use Info level for expected conditions like knowledge not being ready yet
-		log.Info("service info not available yet", "error", err.Error())
+		logger.Info("service info not available yet", "error", err.Error())
 		http.Error(w, "Service temporarily unavailable: "+err.Error(),
 			http.StatusServiceUnavailable)
 		return
@@ -49,13 +44,13 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(http.StatusOK)
 	if err := json.NewEncoder(w).Encode(info); err != nil {
-		log.Error(err, "failed to encode service info")
+		logger.Error(err, "failed to encode service info")
 		return
 	}
 }
 
 // buildServiceInfo constructs the ServiceInfo response with metadata for all flavor groups.
-func (api *HTTPAPI) buildServiceInfo(ctx context.Context, log logr.Logger) (liquid.ServiceInfo, error) {
+func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (liquid.ServiceInfo, error) {
 	// Get all flavor groups from Knowledge CRDs
 	knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client}
 	flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil)
@@ -92,7 +87,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, log logr.Logger) (liqu
 			HandlesCommitments:  true,                   // We handle commitment changes via /v1/change-commitments
 		}
 
-		log.V(1).Info("registered flavor group resource",
+		logger.V(1).Info("registered flavor group resource",
 			"resourceName", resourceName,
 			"flavorGroup", groupName,
 			"displayName", displayName,
@@ -106,7 +101,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, log logr.Logger) (liqu
 		version = knowledgeCRD.Status.LastContentChange.Unix()
 	}
 
-	log.Info("built service info",
+	logger.Info("built service info",
 		"resourceCount", len(resources),
 		"version", version)
 
diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go
index 0ec1f5e7d..f526dbd2c 100644
--- a/internal/scheduling/reservations/commitments/api_report_capacity.go
+++ b/internal/scheduling/reservations/commitments/api_report_capacity.go
@@ -5,9 +5,7 @@ package commitments
 
 import (
 	"encoding/json"
-	"fmt"
 	"net/http"
-	"time"
 
 	"github.com/sapcc/go-api-declarations/liquid"
 )
@@ -17,12 +15,8 @@ import (
 // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid
 // Reports available capacity across all flavor group resources. Note, unit is specified in the Info API response with multiple of the smallest memory resource unit within a flavor group.
 func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) {
-	// Extract or generate request ID for tracing
-	requestID := r.Header.Get("X-Request-ID")
-	if requestID == "" {
-		requestID = fmt.Sprintf("req-%d", time.Now().UnixNano())
-	}
-	log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/report-capacity")
+	ctx := WithNewGlobalRequestID(r.Context())
+	logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/report-capacity")
 
 	// Only accept POST method
 	if r.Method != http.MethodPost {
@@ -30,7 +24,7 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request)
 		return
 	}
 
-	log.V(1).Info("processing report capacity request")
+	logger.V(1).Info("processing report capacity request")
 
 	// Parse request body (may be empty or contain ServiceCapacityRequest)
 	var req liquid.ServiceCapacityRequest
@@ -41,21 +35,21 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request)
 
 	// Calculate capacity
 	calculator := NewCapacityCalculator(api.client)
-	report, err := calculator.CalculateCapacity(r.Context())
+	report, err := calculator.CalculateCapacity(ctx)
 	if err != nil {
-		log.Error(err, "failed to calculate capacity")
+		logger.Error(err, "failed to calculate capacity")
 		http.Error(w, "Failed to calculate capacity: "+err.Error(),
 			http.StatusInternalServerError)
 		return
 	}
 
-	log.Info("calculated capacity report", "resourceCount", len(report.Resources))
+	logger.Info("calculated capacity report", "resourceCount", len(report.Resources))
 
 	// Return response
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(http.StatusOK)
 	if err := json.NewEncoder(w).Encode(report); err != nil {
-		log.Error(err, "failed to encode capacity report")
+		logger.Error(err, "failed to encode capacity report")
 		return
 	}
 }
diff --git a/internal/scheduling/reservations/commitments/client.go b/internal/scheduling/reservations/commitments/client.go
index 2e5585c99..b31feb7c0 100644
--- a/internal/scheduling/reservations/commitments/client.go
+++ b/internal/scheduling/reservations/commitments/client.go
@@ -17,7 +17,6 @@ import (
 	"github.com/gophercloud/gophercloud/v2/openstack/identity/v3/projects"
 	"github.com/sapcc/go-bits/jobloop"
 	"github.com/sapcc/go-bits/must"
-	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
@@ -48,7 +47,7 @@ func NewCommitmentsClient() CommitmentsClient {
 }
 
 func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf SyncerConfig) error {
-	log := ctrl.Log.WithName("CommitmentClient")
+	logger := LoggerFromContext(ctx).WithValues("component", "client")
 
 	var authenticatedHTTP = http.DefaultClient
 	if conf.SSOSecretRef != nil {
@@ -72,7 +71,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf
 		Type:         "identity",
 		Availability: "public",
 	}))
-	log.Info("using identity endpoint", "url", url)
+	logger.Info("using identity endpoint", "url", url)
 	c.keystone = &gophercloud.ServiceClient{
 		ProviderClient: c.provider,
 		Endpoint:       url,
@@ -84,7 +83,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf
 		Type:         "compute",
 		Availability: "public",
 	}))
-	log.Info("using nova endpoint", "url", url)
+	logger.Info("using nova endpoint", "url", url)
 	c.nova = &gophercloud.ServiceClient{
 		ProviderClient: c.provider,
 		Endpoint:       url,
@@ -97,7 +96,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf
 		Type:         "resources",
 		Availability: "public",
 	}))
-	log.Info("using limes endpoint", "url", url)
+	logger.Info("using limes endpoint", "url", url)
 	c.limes = &gophercloud.ServiceClient{
 		ProviderClient: c.provider,
 		Endpoint:       url,
@@ -107,9 +106,9 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf
 }
 
 func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error) {
-	log := ctrl.Log.WithName("CommitmentClient")
+	logger := LoggerFromContext(ctx).WithValues("component", "client")
 
-	log.V(1).Info("fetching projects from keystone")
+	logger.V(1).Info("fetching projects from keystone")
 	allPages, err := projects.List(c.keystone, nil).AllPages(ctx)
 	if err != nil {
 		return nil, err
@@ -120,15 +119,15 @@ func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error)
 	if err := allPages.(projects.ProjectPage).ExtractInto(data); err != nil {
 		return nil, err
 	}
-	log.V(1).Info("fetched projects from keystone", "count", len(data.Projects))
+	logger.V(1).Info("fetched projects from keystone", "count", len(data.Projects))
 	return data.Projects, nil
 }
 
 // ListCommitmentsByID fetches commitments for all projects in parallel.
 func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects ...Project) (map[string]Commitment, error) {
-	log := ctrl.Log.WithName("CommitmentClient")
+	logger := LoggerFromContext(ctx).WithValues("component", "client")
 
-	log.V(1).Info("fetching commitments from limes", "projects", len(projects))
+	logger.V(1).Info("fetching commitments from limes", "projects", len(projects))
 	commitmentsMutex := gosync.Mutex{}
 	commitments := make(map[string]Commitment)
 	var wg gosync.WaitGroup
@@ -161,11 +160,11 @@ func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects ..
 	// Return the first error encountered, if any.
 	for err := range errChan {
 		if err != nil {
-			log.Error(err, "failed to resolve commitments")
+			logger.Error(err, "failed to resolve commitments")
 			return nil, err
 		}
 	}
-	log.V(1).Info("resolved commitments from limes", "count", len(commitments))
+	logger.V(1).Info("resolved commitments from limes", "count", len(commitments))
 	return commitments, nil
 }
 
diff --git a/internal/scheduling/reservations/commitments/context.go b/internal/scheduling/reservations/commitments/context.go
index a461ac1c2..64257fcae 100644
--- a/internal/scheduling/reservations/commitments/context.go
+++ b/internal/scheduling/reservations/commitments/context.go
@@ -9,21 +9,23 @@ import (
 	"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
 	"github.com/go-logr/logr"
 	"github.com/google/uuid"
+	ctrl "sigs.k8s.io/controller-runtime"
 )
 
+// baseLog is the base logger for all committed-resource operations.
+// Use LoggerFromContext to get a logger with request tracking values.
+var baseLog = ctrl.Log.WithName("committed-resource")
+
+// WithNewGlobalRequestID creates a new context with a committed-resource-prefixed global request ID.
 func WithNewGlobalRequestID(ctx context.Context) context.Context {
 	return reservations.WithGlobalRequestID(ctx, "committed-resource-"+uuid.New().String())
 }
 
+// LoggerFromContext returns a logger with greq and req values from the context.
+// This creates a child logger with the request tracking values pre-attached,
+// so you don't need to repeat them in every log call.
 func LoggerFromContext(ctx context.Context) logr.Logger {
-	return commitmentLog.WithValues(
-		"greq", reservations.GlobalRequestIDFromContext(ctx),
-		"req", reservations.RequestIDFromContext(ctx),
-	)
-}
-
-func APILoggerFromContext(ctx context.Context) logr.Logger {
-	return apiLog.WithValues(
+	return baseLog.WithValues(
 		"greq", reservations.GlobalRequestIDFromContext(ctx),
 		"req", reservations.RequestIDFromContext(ctx),
 	)
diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go
index 6a30b6b6c..f68e35cde 100644
--- a/internal/scheduling/reservations/commitments/controller.go
+++ b/internal/scheduling/reservations/commitments/controller.go
@@ -31,8 +31,6 @@ import (
 	"github.com/go-logr/logr"
 )
 
-var commitmentLog = ctrl.Log.WithName("commitment-reservation-controller")
-
 // CommitmentReservationController reconciles commitment Reservation objects
 type CommitmentReservationController struct {
 	// Client for the kubernetes API.
@@ -51,7 +49,8 @@ type CommitmentReservationController struct {
 // move the current state of the cluster closer to the desired state.
 // Note: This controller only handles commitment reservations, as filtered by the predicate.
 func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-	logger := LoggerFromContext(ctx).WithValues("reservation", req.Name, "namespace", req.Namespace)
+	ctx = WithNewGlobalRequestID(ctx)
+	logger := LoggerFromContext(ctx).WithValues("component", "controller", "reservation", req.Name, "namespace", req.Namespace)
 	// Fetch the reservation object.
 	var res v1alpha1.Reservation
 	if err := r.Get(ctx, req.NamespacedName, &res); err != nil {
@@ -323,7 +322,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
 // reconcileAllocations verifies all allocations in Spec against actual Nova VM state.
 // It updates Status.Allocations based on the actual host location of each VM.
 func (r *CommitmentReservationController) reconcileAllocations(ctx context.Context, res *v1alpha1.Reservation) error {
-	logger := LoggerFromContext(ctx)
+	logger := LoggerFromContext(ctx).WithValues("component", "controller")
 
 	// Skip if no CommittedResourceReservation
 	if res.Spec.CommittedResourceReservation == nil {
@@ -440,7 +439,7 @@ func (r *CommitmentReservationController) listServersByProjectID(ctx context.Con
 		return nil, errors.New("database connection not initialized")
 	}
 
-	log := logf.FromContext(ctx)
+	logger := LoggerFromContext(ctx).WithValues("component", "controller")
 
 	// Query servers from the database cache.
 	var servers []nova.Server
@@ -451,7 +450,7 @@ func (r *CommitmentReservationController) listServersByProjectID(ctx context.Con
 		return nil, fmt.Errorf("failed to query servers from database: %w", err)
 	}
 
-	log.V(1).Info("queried servers from database",
+	logger.V(1).Info("queried servers from database",
 		"projectID", projectID,
 		"serverCount", len(servers))
 
diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go
index bfa6fdb28..11bbc4f1d 100644
--- a/internal/scheduling/reservations/commitments/state.go
+++ b/internal/scheduling/reservations/commitments/state.go
@@ -199,8 +199,8 @@ func FromReservations(reservations []v1alpha1.Reservation) (*CommitmentState, er
 		}
 		// check flavor group consistency, ignore if not matching to repair corrupted state in k8s
 		if res.Spec.CommittedResourceReservation.ResourceGroup != state.FlavorGroupName {
-			// log message
-			commitmentLog.Error(errors.New("inconsistent flavor group in reservation"),
+			// log message using baseLog since this is a static function without context
+			baseLog.Error(errors.New("inconsistent flavor group in reservation"),
 				"reservation belongs to same commitment but has different flavor group - ignoring reservation for capacity calculation",
 				"reservationName", res.Name,
 				"expectedFlavorGroup", state.FlavorGroupName,
diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go
index 2f302479a..b7db11351 100644
--- a/internal/scheduling/reservations/commitments/syncer.go
+++ b/internal/scheduling/reservations/commitments/syncer.go
@@ -14,7 +14,6 @@ import (
 	"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
 	"github.com/go-logr/logr"
 	corev1 "k8s.io/api/core/v1"
-	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
@@ -132,35 +131,36 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo
 func (s *Syncer) SyncReservations(ctx context.Context) error {
 	// TODO handle concurrency with change API: consider creation time of reservations and status ready
 
-	// Create logger with run ID for this sync execution
+	// Create context with request ID for this sync execution
 	runID := fmt.Sprintf("sync-%d", time.Now().Unix())
-	log := ctrl.Log.WithName("CommitmentSyncer").WithValues("runID", runID)
+	ctx = WithNewGlobalRequestID(ctx)
+	logger := LoggerFromContext(ctx).WithValues("component", "syncer", "runID", runID)
 
-	log.Info("starting commitment sync with sync interval", "interval", DefaultSyncerConfig().SyncInterval)
+	logger.Info("starting commitment sync with sync interval", "interval", DefaultSyncerConfig().SyncInterval)
 
 	// Check if flavor group knowledge is ready
 	knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client}
 	knowledgeCRD, err := knowledge.Get(ctx)
 	if err != nil {
-		log.Error(err, "failed to check flavor group knowledge readiness")
+		logger.Error(err, "failed to check flavor group knowledge readiness")
 		return err
 	}
 	if knowledgeCRD == nil {
-		log.Info("skipping commitment sync - flavor group knowledge not ready yet")
+		logger.Info("skipping commitment sync - flavor group knowledge not ready yet")
 		return nil
 	}
 
 	// Get flavor groups using the CRD we already fetched
 	flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, knowledgeCRD)
 	if err != nil {
-		log.Error(err, "failed to get flavor groups from knowledge")
+		logger.Error(err, "failed to get flavor groups from knowledge")
 		return err
 	}
 
 	// Get all commitments as states
-	commitmentStates, err := s.getCommitmentStates(ctx, log, flavorGroups)
+	commitmentStates, err := s.getCommitmentStates(ctx, logger, flavorGroups)
 	if err != nil {
-		log.Error(err, "failed to get compute commitments")
+		logger.Error(err, "failed to get compute commitments")
 		return err
 	}
 
@@ -169,15 +169,15 @@ func (s *Syncer) SyncReservations(ctx context.Context) error {
 
 	// Apply each commitment state using the manager
 	for _, state := range commitmentStates {
-		log.Info("applying commitment state",
+		logger.Info("applying commitment state",
 			"commitmentUUID", state.CommitmentUUID,
 			"projectID", state.ProjectID,
 			"flavorGroup", state.FlavorGroupName,
 			"totalMemoryBytes", state.TotalMemoryBytes)
 
-		_, _, err := manager.ApplyCommitmentState(ctx, log, state, flavorGroups, CreatorValue)
+		_, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue)
 		if err != nil {
-			log.Error(err, "failed to apply commitment state",
+			logger.Error(err, "failed to apply commitment state",
 				"commitmentUUID", state.CommitmentUUID)
 			// Continue with other commitments even if one fails
 			continue
@@ -190,7 +190,7 @@ func (s *Syncer) SyncReservations(ctx context.Context) error {
 	if err := s.List(ctx, &existingReservations, client.MatchingLabels{
 		v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
 	}); err != nil {
-		log.Error(err, "failed to list existing committed resource reservations")
+		logger.Error(err, "failed to list existing committed resource reservations")
 		return err
 	}
 
@@ -205,22 +205,22 @@ func (s *Syncer) SyncReservations(ctx context.Context) error {
 		// Extract commitment UUID from reservation name
 		commitmentUUID := extractCommitmentUUID(existing.Name)
 		if commitmentUUID == "" {
-			log.Info("skipping reservation with unparseable name", "name", existing.Name)
+			logger.Info("skipping reservation with unparseable name", "name", existing.Name)
 			continue
 		}
 
 		if !activeCommitments[commitmentUUID] {
 			// This commitment no longer exists, delete the reservation
 			if err := s.Delete(ctx, &existing); err != nil {
-				log.Error(err, "failed to delete reservation", "name", existing.Name)
+				logger.Error(err, "failed to delete reservation", "name", existing.Name)
 				return err
 			}
-			log.Info("deleted reservation for expired commitment",
+			logger.Info("deleted reservation for expired commitment",
 				"name", existing.Name,
 				"commitmentUUID", commitmentUUID)
 		}
 	}
 
-	log.Info("synced reservations", "commitmentCount", len(commitmentStates))
+	logger.Info("synced reservations", "commitmentCount", len(commitmentStates))
 	return nil
 }

From 10380c971f80e5a0fcf011edf24d1cb7020e79f6 Mon Sep 17 00:00:00 2001
From: mblos <marcel.gute@sap.com>
Date: Mon, 23 Mar 2026 16:08:55 +0100
Subject: [PATCH 3/7] metrics

---
 .../cortex-nova/alerts/nova.alerts.yaml       | 52 ++++++++++++++++++-
 .../api_change_commitments_monitor.go         | 10 ++--
 .../api_change_commitments_monitor_test.go    | 12 ++---
 3 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index 65de5c626..8a757d9fc 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -274,4 +274,54 @@ groups:
       description: >
         This may indicate issues with the pipeline
         configuration. It is recommended to investigate the
-        pipeline status and logs for more details.
\ No newline at end of file
+        pipeline status and logs for more details.
+
+  # Committed Resource (Limes Integration) Alerts
+  - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh
+    expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource change API HTTP 500 errors too high"
+      description: >
+        The committed resource change API (Limes LIQUID integration) is responding
+        with HTTP 5xx errors. This indicates that Cortex is having issues
+        processing commitment changes from Limes.
+
+  - alert: CortexNovaCommittedResourceLatencyTooHigh
+    expr: histogram_quantile(0.95, rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) > 30
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource change API latency too high"
+      description: >
+        The committed resource change API (Limes LIQUID integration) is experiencing
+        high latency (p95 > 30s).
+
+  - alert: CortexNovaCommittedResourceRejectionRateTooHigh
+    expr: |
+      rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[1h])
+      / rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[1h]) > 0.5
+    for: 15m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource rejection rate too high"
+      description: >
+        More than 50% of commitment change requests are being rejected.
+        This may indicate insufficient capacity to fulfill new commitments
+        or issues with the commitment scheduling logic.
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go
index 5bfabf175..873e57eb0 100644
--- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go
+++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go
@@ -18,15 +18,15 @@ type ChangeCommitmentsAPIMonitor struct {
 func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor {
 	return ChangeCommitmentsAPIMonitor{
 		requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
-			Name: "cortex_cr_change_api_requests_total",
-			Help: "Total number of CR change API requests by HTTP status code",
+			Name: "cortex_committed_resource_change_api_requests_total",
+			Help: "Total number of committed resource change API requests by HTTP status code",
 		}, []string{"status_code"}),
 		requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
-			Name: "cortex_cr_change_api_request_duration_seconds",
-			Help: "Duration of CR change API requests in seconds by HTTP status code",
+			Name: "cortex_committed_resource_change_api_request_duration_seconds",
+			Help: "Duration of committed resource change API requests in seconds by HTTP status code",
 		}, []string{"status_code"}),
 		commitmentChanges: prometheus.NewCounterVec(prometheus.CounterOpts{
-			Name: "cortex_cr_change_api_commitment_changes_total",
+			Name: "cortex_committed_resource_change_api_commitment_changes_total",
 			Help: "Total number of commitment changes processed by result",
 		}, []string{"result"}),
 	}
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go
index bf3acfb40..7d8b080b7 100644
--- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go
+++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go
@@ -38,17 +38,17 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) {
 
 	for _, family := range families {
 		switch *family.Name {
-		case "cortex_cr_change_api_requests_total":
+		case "cortex_committed_resource_change_api_requests_total":
 			foundRequestCounter = true
 			if *family.Type != dto.MetricType_COUNTER {
 				t.Errorf("Expected counter metric type, got %v", *family.Type)
 			}
-		case "cortex_cr_change_api_request_duration_seconds":
+		case "cortex_committed_resource_change_api_request_duration_seconds":
 			foundRequestDuration = true
 			if *family.Type != dto.MetricType_HISTOGRAM {
 				t.Errorf("Expected histogram metric type, got %v", *family.Type)
 			}
-		case "cortex_cr_change_api_commitment_changes_total":
+		case "cortex_committed_resource_change_api_commitment_changes_total":
 			foundCommitmentChanges = true
 			if *family.Type != dto.MetricType_COUNTER {
 				t.Errorf("Expected counter metric type, got %v", *family.Type)
@@ -91,7 +91,7 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) {
 
 	// Verify request counter has correct labels
 	for _, family := range families {
-		if *family.Name == "cortex_cr_change_api_requests_total" {
+		if *family.Name == "cortex_committed_resource_change_api_requests_total" {
 			if len(family.Metric) != 3 {
 				t.Errorf("Expected 3 request counter metrics, got %d", len(family.Metric))
 			}
@@ -109,7 +109,7 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) {
 			}
 		}
 
-		if *family.Name == "cortex_cr_change_api_request_duration_seconds" {
+		if *family.Name == "cortex_committed_resource_change_api_request_duration_seconds" {
 			if len(family.Metric) != 1 {
 				t.Errorf("Expected 1 histogram metric, got %d", len(family.Metric))
 			}
@@ -127,7 +127,7 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) {
 			}
 		}
 
-		if *family.Name == "cortex_cr_change_api_commitment_changes_total" {
+		if *family.Name == "cortex_committed_resource_change_api_commitment_changes_total" {
 			if len(family.Metric) != 2 {
 				t.Errorf("Expected 2 commitment changes metrics, got %d", len(family.Metric))
 			}

From 2064b9184614cc458b0562e3e21f0450393f4031 Mon Sep 17 00:00:00 2001
From: mblos <marcel.gute@sap.com>
Date: Tue, 24 Mar 2026 09:01:03 +0100
Subject: [PATCH 4/7] fix

---
 helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index 8a757d9fc..538b9279c 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -311,7 +311,7 @@ groups:
   - alert: CortexNovaCommittedResourceRejectionRateTooHigh
     expr: |
       rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[1h])
-      / rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[1h]) > 0.5
+      / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[1h])) > 0.5
     for: 15m
     labels:
       context: committed-resource-api

From 19b182ff6957e52ea68dd190d7e467ec6e9a21f8 Mon Sep 17 00:00:00 2001
From: mblos <marcel.gute@sap.com>
Date: Tue, 24 Mar 2026 09:09:47 +0100
Subject: [PATCH 5/7] timeout alert

---
 .../cortex-nova/alerts/nova.alerts.yaml       | 53 +++++++++++++++++--
 .../commitments/api_change_commitments.go     |  1 +
 .../api_change_commitments_monitor.go         |  7 +++
 .../api_change_commitments_monitor_test.go    | 12 ++++-
 4 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index 538b9279c..c8db56b8c 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -277,6 +277,24 @@ groups:
         pipeline status and logs for more details.
 
   # Committed Resource (Limes Integration) Alerts
+  - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh
+    expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource change API HTTP 400 errors too high"
+      description: >
+        The committed resource change API (Limes LIQUID integration) is responding
+        with HTTP 4xx errors. This may happen when Limes sends a request with
+        an outdated info version (409), the API is temporarily unavailable,
+        or the request format is invalid. Limes will typically retry these
+        requests, so no immediate action is needed unless the errors persist.
+
   - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh
     expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
     for: 5m
@@ -290,8 +308,10 @@ groups:
       summary: "Committed Resource change API HTTP 500 errors too high"
       description: >
         The committed resource change API (Limes LIQUID integration) is responding
-        with HTTP 5xx errors. This indicates that Cortex is having issues
-        processing commitment changes from Limes.
+        with HTTP 5xx errors. This is not expected and indicates that Cortex
+        is having an internal problem processing commitment changes. Limes will
+        continue to retry, but new commitments may not be fulfilled until the
+        issue is resolved.
 
   - alert: CortexNovaCommittedResourceLatencyTooHigh
     expr: histogram_quantile(0.95, rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) > 30
@@ -306,7 +326,9 @@ groups:
       summary: "Committed Resource change API latency too high"
       description: >
         The committed resource change API (Limes LIQUID integration) is experiencing
-        high latency (p95 > 30s).
+        high latency (p95 > 30s). This may indicate that the scheduling pipeline
+        is under heavy load or that reservation scheduling is taking longer than
+        expected. Limes requests may time out, causing commitment changes to fail.
 
   - alert: CortexNovaCommittedResourceRejectionRateTooHigh
     expr: |
@@ -323,5 +345,26 @@ groups:
       summary: "Committed Resource rejection rate too high"
       description: >
         More than 50% of commitment change requests are being rejected.
-        This may indicate insufficient capacity to fulfill new commitments
-        or issues with the commitment scheduling logic.
+        This may indicate insufficient capacity in the datacenter to fulfill
+        new commitments, or issues with the commitment scheduling logic.
+        Rejected commitments are rolled back, so Limes will see them as failed
+        and may retry or report the failure to users.
+
+  - alert: CortexNovaCommittedResourceTimeoutsTooHigh
+    expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[1h]) > 0
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource change API timeouts too high"
+      description: >
+        The committed resource change API (Limes LIQUID integration) timed out
+        while waiting for reservations to become ready. This indicates that the
+        scheduling pipeline is overloaded or reservations are taking too long
+        to be scheduled. Affected commitment changes are rolled back and Limes
+        will see them as failed. Consider investigating the scheduler performance
+        or increasing the timeout configuration.
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go
index 016e42731..2488d5843 100644
--- a/internal/scheduling/reservations/commitments/api_change_commitments.go
+++ b/internal/scheduling/reservations/commitments/api_change_commitments.go
@@ -272,6 +272,7 @@ ProcessLoop:
 			}
 			if len(failedReservations) == 0 {
 				resp.RejectionReason += "timeout reached while processing commitment changes"
+				api.monitor.timeouts.Inc()
 			}
 			requireRollback = true
 		}
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go
index 873e57eb0..30a09d49e 100644
--- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go
+++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor.go
@@ -12,6 +12,7 @@ type ChangeCommitmentsAPIMonitor struct {
 	requestCounter    *prometheus.CounterVec
 	requestDuration   *prometheus.HistogramVec
 	commitmentChanges *prometheus.CounterVec
+	timeouts          prometheus.Counter
 }
 
 // NewChangeCommitmentsAPIMonitor creates a new monitor with Prometheus metrics.
@@ -29,6 +30,10 @@ func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor {
 			Name: "cortex_committed_resource_change_api_commitment_changes_total",
 			Help: "Total number of commitment changes processed by result",
 		}, []string{"result"}),
+		timeouts: prometheus.NewCounter(prometheus.CounterOpts{
+			Name: "cortex_committed_resource_change_api_timeouts_total",
+			Help: "Total number of commitment change requests that timed out while waiting for reservations to become ready",
+		}),
 	}
 }
 
@@ -37,6 +42,7 @@ func (m *ChangeCommitmentsAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
 	m.requestCounter.Describe(ch)
 	m.requestDuration.Describe(ch)
 	m.commitmentChanges.Describe(ch)
+	m.timeouts.Describe(ch)
 }
 
 // Collect implements prometheus.Collector.
@@ -44,4 +50,5 @@ func (m *ChangeCommitmentsAPIMonitor) Collect(ch chan<- prometheus.Metric) {
 	m.requestCounter.Collect(ch)
 	m.requestDuration.Collect(ch)
 	m.commitmentChanges.Collect(ch)
+	m.timeouts.Collect(ch)
 }
diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go
index 7d8b080b7..322d01aae 100644
--- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go
+++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go
@@ -24,6 +24,7 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) {
 	monitor.requestCounter.WithLabelValues("200").Inc()
 	monitor.requestDuration.WithLabelValues("200").Observe(0.1)
 	monitor.commitmentChanges.WithLabelValues("success").Inc()
+	monitor.timeouts.Inc()
 
 	// Verify metrics can be gathered
 	families, err := registry.Gather()
@@ -31,10 +32,11 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) {
 		t.Fatalf("Failed to gather metrics: %v", err)
 	}
 
-	// Check that all three metrics are present
+	// Check that all metrics are present
 	foundRequestCounter := false
 	foundRequestDuration := false
 	foundCommitmentChanges := false
+	foundTimeouts := false
 
 	for _, family := range families {
 		switch *family.Name {
@@ -53,6 +55,11 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) {
 			if *family.Type != dto.MetricType_COUNTER {
 				t.Errorf("Expected counter metric type, got %v", *family.Type)
 			}
+		case "cortex_committed_resource_change_api_timeouts_total":
+			foundTimeouts = true
+			if *family.Type != dto.MetricType_COUNTER {
+				t.Errorf("Expected counter metric type, got %v", *family.Type)
+			}
 		}
 	}
 
@@ -65,6 +72,9 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) {
 	if !foundCommitmentChanges {
 		t.Error("Commitment changes counter not found in registry")
 	}
+	if !foundTimeouts {
+		t.Error("Timeouts counter not found in registry")
+	}
 }
 
 func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) {

From 37da104062fcd1f724f0a4578e1f02c44d62d628 Mon Sep 17 00:00:00 2001
From: mblos <marcel.gute@sap.com>
Date: Tue, 24 Mar 2026 09:21:10 +0100
Subject: [PATCH 6/7] 5m instead of 1h

---
 helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index c8db56b8c..864651fba 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -332,9 +332,9 @@ groups:
 
   - alert: CortexNovaCommittedResourceRejectionRateTooHigh
     expr: |
-      rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[1h])
-      / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[1h])) > 0.5
-    for: 15m
+      rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])
+      / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5
+    for: 5m
     labels:
       context: committed-resource-api
       dashboard: cortex/cortex
@@ -351,7 +351,7 @@ groups:
         and may retry or report the failure to users.
 
   - alert: CortexNovaCommittedResourceTimeoutsTooHigh
-    expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[1h]) > 0
+    expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0
     for: 5m
     labels:
       context: committed-resource-api

From a0f46276b823bd0002918f2a196fb69ef801adb6 Mon Sep 17 00:00:00 2001
From: mblos <marcel.gute@sap.com>
Date: Tue, 24 Mar 2026 09:36:45 +0100
Subject: [PATCH 7/7] .

---
 helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index 864651fba..675c5a27a 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -314,7 +314,7 @@ groups:
         issue is resolved.
 
   - alert: CortexNovaCommittedResourceLatencyTooHigh
-    expr: histogram_quantile(0.95, rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) > 30
+    expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30
     for: 5m
     labels:
       context: committed-resource-api