Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ func main() {

// Initialize commitments API for LIQUID interface
commitmentsAPI := commitments.NewAPI(multiclusterClient)
commitmentsAPI.Init(mux)
commitmentsAPI.Init(mux, metrics.Registry)

// Detector pipeline controller setup.
novaClient := nova.NewNovaClient()
Expand Down
95 changes: 94 additions & 1 deletion helm/bundles/cortex-nova/alerts/nova.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -274,4 +274,97 @@ groups:
description: >
This may indicate issues with the pipeline
configuration. It is recommended to investigate the
pipeline status and logs for more details.
pipeline status and logs for more details.

# Committed Resource (Limes Integration) Alerts
- alert: CortexNovaCommittedResourceHttpRequest400sTooHigh
expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource change API HTTP 400 errors too high"
description: >
The committed resource change API (Limes LIQUID integration) is responding
with HTTP 4xx errors. This may happen when Limes sends a request with
an outdated info version (409), the API is temporarily unavailable,
or the request format is invalid. Limes will typically retry these
requests, so no immediate action is needed unless the errors persist.

- alert: CortexNovaCommittedResourceHttpRequest500sTooHigh
expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource change API HTTP 500 errors too high"
description: >
The committed resource change API (Limes LIQUID integration) is responding
with HTTP 5xx errors. This is not expected and indicates that Cortex
is having an internal problem processing commitment changes. Limes will
continue to retry, but new commitments may not be fulfilled until the
issue is resolved.

- alert: CortexNovaCommittedResourceLatencyTooHigh
expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource change API latency too high"
description: >
The committed resource change API (Limes LIQUID integration) is experiencing
high latency (p95 > 30s). This may indicate that the scheduling pipeline
is under heavy load or that reservation scheduling is taking longer than
expected. Limes requests may time out, causing commitment changes to fail.

- alert: CortexNovaCommittedResourceRejectionRateTooHigh
expr: |
rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])
/ sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource rejection rate too high"
description: >
More than 50% of commitment change requests are being rejected.
This may indicate insufficient capacity in the datacenter to fulfill
new commitments, or issues with the commitment scheduling logic.
Rejected commitments are rolled back, so Limes will see them as failed
and may retry or report the failure to users.

- alert: CortexNovaCommittedResourceTimeoutsTooHigh
expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource change API timeouts too high"
description: >
The committed resource change API (Limes LIQUID integration) timed out
while waiting for reservations to become ready. This indicates that the
scheduling pipeline is overloaded or reservations are taking too long
to be scheduled. Affected commitment changes are rolled back and Limes
will see them as failed. Consider investigating the scheduler performance
or increasing the timeout configuration.
17 changes: 9 additions & 8 deletions internal/scheduling/reservations/commitments/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ import (
"net/http"
"sync"

ctrl "sigs.k8s.io/controller-runtime"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// HTTPAPI implements Limes LIQUID commitment validation endpoints.
type HTTPAPI struct {
client client.Client
config Config
client client.Client
config Config
monitor ChangeCommitmentsAPIMonitor
// Mutex to serialize change-commitments requests
changeMutex sync.Mutex
}
Expand All @@ -25,15 +26,15 @@ func NewAPI(client client.Client) *HTTPAPI {

func NewAPIWithConfig(client client.Client, config Config) *HTTPAPI {
return &HTTPAPI{
client: client,
config: config,
client: client,
config: config,
monitor: NewChangeCommitmentsAPIMonitor(),
}
}

func (api *HTTPAPI) Init(mux *http.ServeMux) {
func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) {
registry.MustRegister(&api.monitor)
mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
// mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity)
mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
}

var commitmentApiLog = ctrl.Log.WithName("commitment_api")
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,9 @@ import (
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

var apiLog = ctrl.Log.WithName("commitment-reservation-api")

// sortedKeys returns map keys sorted alphabetically for deterministic iteration.
func sortedKeys[K ~string, V any](m map[K]V) []K {
keys := make([]K, 0, len(m))
Expand All @@ -46,9 +43,17 @@ func sortedKeys[K ~string, V any](m map[K]V) []K {
// This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle.
// A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back.
func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) {
startTime := time.Now()
// Initialize
resp := liquid.CommitmentChangeResponse{}
req := liquid.CommitmentChangeRequest{}
statusCode := http.StatusOK

// Check if API is enabled
if !api.config.EnableChangeCommitmentsAPI {
http.Error(w, "change-commitments API is disabled", http.StatusServiceUnavailable)
statusCode = http.StatusServiceUnavailable
http.Error(w, "change-commitments API is disabled", statusCode)
api.recordMetrics(req, resp, statusCode, startTime)
return
}

Expand All @@ -61,31 +66,32 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
if requestID == "" {
requestID = uuid.New().String()
}
ctx := reservations.WithGlobalRequestID(context.Background(), requestID)
logger := APILoggerFromContext(ctx).WithValues("endpoint", "/v1/change-commitments")
ctx := reservations.WithGlobalRequestID(context.Background(), "committed-resource-"+requestID)
logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/change-commitments")

// Only accept POST method
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
statusCode = http.StatusMethodNotAllowed
http.Error(w, "Method not allowed", statusCode)
api.recordMetrics(req, resp, statusCode, startTime)
return
}

// Parse request body
var req liquid.CommitmentChangeRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
logger.Error(err, "invalid request body")
http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest)
statusCode = http.StatusBadRequest
http.Error(w, "Invalid request body: "+err.Error(), statusCode)
api.recordMetrics(req, resp, statusCode, startTime)
return
}

logger.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ)

// Initialize response
resp := liquid.CommitmentChangeResponse{}

// Check for dry run -> early reject, not supported yet
if req.DryRun {
resp.RejectionReason = "Dry run not supported yet"
api.recordMetrics(req, resp, statusCode, startTime)
logger.Info("rejecting dry run request")
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
Expand All @@ -97,14 +103,26 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque

// Process commitment changes
// For now, we'll implement a simplified path that checks capacity for immediate start CRs

if err := api.processCommitmentChanges(ctx, w, logger, req, &resp); err != nil {
// Error already written to response by processCommitmentChanges
// Determine status code from error context (409 or 503)
if strings.Contains(err.Error(), "version mismatch") {
statusCode = http.StatusConflict
} else if strings.Contains(err.Error(), "caches not ready") {
statusCode = http.StatusServiceUnavailable
}
Comment on lines +109 to +114
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Fragile status code derivation via error string matching.

Determining HTTP status codes by inspecting error message substrings is brittle—if the error messages change, this logic silently breaks. Consider using typed errors or sentinel errors to convey the status code.

💡 Suggested approach using sentinel errors
+var (
+	errVersionMismatch = errors.New("version mismatch")
+	errCachesNotReady  = errors.New("caches not ready")
+)

 // In processCommitmentChanges, return the sentinel error:
-return errors.New("version mismatch")
+return errVersionMismatch

 // In HandleChangeCommitments:
-if strings.Contains(err.Error(), "version mismatch") {
-    statusCode = http.StatusConflict
-} else if strings.Contains(err.Error(), "caches not ready") {
-    statusCode = http.StatusServiceUnavailable
-}
+if errors.Is(err, errVersionMismatch) {
+    statusCode = http.StatusConflict
+} else if errors.Is(err, errCachesNotReady) {
+    statusCode = http.StatusServiceUnavailable
+}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/scheduling/reservations/commitments/api_change_commitments.go`
around lines 109 - 114, Replace fragile substring checks on err.Error() with
explicit error types or sentinel errors and use errors.Is or a typed error
interface to map to HTTP codes; for example define sentinel errors like
ErrVersionMismatch and ErrCachesNotReady (or implement an error type with a
StatusCode() int) and then in the code that currently checks
strings.Contains(err.Error(), "version mismatch") / "caches not ready" perform
errors.Is(err, ErrVersionMismatch) / errors.Is(err, ErrCachesNotReady) or cast
to the typed error and read StatusCode() to set statusCode to
http.StatusConflict or http.StatusServiceUnavailable respectively, updating the
logic around the err variable where statusCode is derived.

// Record metrics for error cases
api.recordMetrics(req, resp, statusCode, startTime)
return
}

// Record metrics
api.recordMetrics(req, resp, statusCode, startTime)

// Return response
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
w.WriteHeader(statusCode)
if err := json.NewEncoder(w).Encode(resp); err != nil {
return
}
Expand Down Expand Up @@ -254,6 +272,7 @@ ProcessLoop:
}
if len(failedReservations) == 0 {
resp.RejectionReason += "timeout reached while processing commitment changes"
api.monitor.timeouts.Inc()
}
requireRollback = true
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright SAP SE
// SPDX-License-Identifier: Apache-2.0

package commitments

import (
"strconv"
"time"

"github.com/sapcc/go-api-declarations/liquid"
)

// recordMetrics records Prometheus metrics for a change commitments request.
func (api *HTTPAPI) recordMetrics(req liquid.CommitmentChangeRequest, resp liquid.CommitmentChangeResponse, statusCode int, startTime time.Time) {
duration := time.Since(startTime).Seconds()
statusCodeStr := strconv.Itoa(statusCode)

// Record request counter and duration
api.monitor.requestCounter.WithLabelValues(statusCodeStr).Inc()
api.monitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration)

// Count total commitment changes in the request
commitmentCount := countCommitments(req)

// Determine result based on response
result := "success"
if resp.RejectionReason != "" {
result = "rejected"
}

// Record commitment changes counter
api.monitor.commitmentChanges.WithLabelValues(result).Add(float64(commitmentCount))
}

// countCommitments counts the total number of commitments in a request.
func countCommitments(req liquid.CommitmentChangeRequest) int {
count := 0
for _, projectChanges := range req.ByProject {
for _, resourceChanges := range projectChanges.ByResource {
count += len(resourceChanges.Commitments)
}
}
return count
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright SAP SE
// SPDX-License-Identifier: Apache-2.0

package commitments

import (
"github.com/prometheus/client_golang/prometheus"
)

// ChangeCommitmentsAPIMonitor provides metrics for the CR change API.
type ChangeCommitmentsAPIMonitor struct {
requestCounter *prometheus.CounterVec
requestDuration *prometheus.HistogramVec
commitmentChanges *prometheus.CounterVec
timeouts prometheus.Counter
}

// NewChangeCommitmentsAPIMonitor creates a new monitor with Prometheus metrics.
func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor {
return ChangeCommitmentsAPIMonitor{
requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "cortex_committed_resource_change_api_requests_total",
Help: "Total number of committed resource change API requests by HTTP status code",
}, []string{"status_code"}),
requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "cortex_committed_resource_change_api_request_duration_seconds",
Help: "Duration of committed resource change API requests in seconds by HTTP status code",
}, []string{"status_code"}),
commitmentChanges: prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "cortex_committed_resource_change_api_commitment_changes_total",
Help: "Total number of commitment changes processed by result",
}, []string{"result"}),
timeouts: prometheus.NewCounter(prometheus.CounterOpts{
Name: "cortex_committed_resource_change_api_timeouts_total",
Help: "Total number of commitment change requests that timed out while waiting for reservations to become ready",
}),
}
}

// Describe implements prometheus.Collector.
func (m *ChangeCommitmentsAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
m.requestCounter.Describe(ch)
m.requestDuration.Describe(ch)
m.commitmentChanges.Describe(ch)
m.timeouts.Describe(ch)
}

// Collect implements prometheus.Collector.
func (m *ChangeCommitmentsAPIMonitor) Collect(ch chan<- prometheus.Metric) {
m.requestCounter.Collect(ch)
m.requestDuration.Collect(ch)
m.commitmentChanges.Collect(ch)
m.timeouts.Collect(ch)
}
Loading
Loading