From 74b78a6a84bc72b62ed95a297aebbdfea23d1b96 Mon Sep 17 00:00:00 2001 From: Malte <140147670+umswmayj@users.noreply.github.com> Date: Wed, 25 Mar 2026 10:08:42 +0100 Subject: [PATCH 01/30] Set SchedulerHints in failover request and removed incorrect prefix (#626) --- api/external/nova/messages.go | 5 +++++ .../reservations/failover/reservation_scheduling.go | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/api/external/nova/messages.go b/api/external/nova/messages.go index 15c652ba4..a0a5a8616 100644 --- a/api/external/nova/messages.go +++ b/api/external/nova/messages.go @@ -138,6 +138,8 @@ const ( EvacuateIntent v1alpha1.SchedulingIntent = "evacuate" // CreateIntent indicates that the request is intended for creating a new VM. CreateIntent v1alpha1.SchedulingIntent = "create" + // ReserveForFailoverIntent indicates that the request is for failover reservation scheduling. + ReserveForFailoverIntent v1alpha1.SchedulingIntent = "reserve_for_failover" ) // GetIntent analyzes the request spec and determines the intent of the scheduling request. @@ -160,6 +162,9 @@ func (req ExternalSchedulerRequest) GetIntent() (v1alpha1.SchedulingIntent, erro // See: https://github.com/sapcc/nova/blob/c88393/nova/compute/api.py#L5770 case "evacuate": return EvacuateIntent, nil + // Used by cortex failover reservation controller + case "reserve_for_failover": + return ReserveForFailoverIntent, nil default: return CreateIntent, nil } diff --git a/internal/scheduling/reservations/failover/reservation_scheduling.go b/internal/scheduling/reservations/failover/reservation_scheduling.go index 873e5dbca..6859d2cc4 100644 --- a/internal/scheduling/reservations/failover/reservation_scheduling.go +++ b/internal/scheduling/reservations/failover/reservation_scheduling.go @@ -79,7 +79,7 @@ func (c *FailoverReservationController) queryHypervisorsFromScheduler(ctx contex // Note: We pass all hypervisors (from all AZs) in EligibleHosts. 
The scheduler pipeline's // filter_correct_az filter will exclude hosts that are not in the VM's availability zone. scheduleReq := reservations.ScheduleReservationRequest{ - InstanceUUID: "failover-" + vm.UUID, + InstanceUUID: vm.UUID, ProjectID: vm.ProjectID, FlavorName: vm.FlavorName, FlavorExtraSpecs: flavorExtraSpecs, @@ -89,6 +89,7 @@ func (c *FailoverReservationController) queryHypervisorsFromScheduler(ctx contex IgnoreHosts: ignoreHypervisors, Pipeline: pipeline, AvailabilityZone: vm.AvailabilityZone, + SchedulerHints: map[string]any{"_nova_check_type": string(api.ReserveForFailoverIntent)}, } logger.V(1).Info("scheduling failover reservation", @@ -218,7 +219,7 @@ func (c *FailoverReservationController) validateVMViaSchedulerEvacuation( IgnoreHosts: []string{vm.CurrentHypervisor}, Pipeline: PipelineAcknowledgeFailoverReservation, AvailabilityZone: vm.AvailabilityZone, - SchedulerHints: map[string]any{"_nova_check_type": "evacuate"}, + SchedulerHints: map[string]any{"_nova_check_type": string(api.EvacuateIntent)}, } logger.V(1).Info("validating VM via scheduler evacuation", From 39d1957631e3753e73857e872a427a5f0958340d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 09:18:17 +0000 Subject: [PATCH 02/30] Bump cortex chart appVersions to sha-74b78a6a [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index caf0030ea..1a04d0e9a 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.31 -appVersion: "sha-88946405" +appVersion: "sha-74b78a6a" icon: "https://example.com/icon.png" dependencies: [] From 72e6a960494b9b418021687ca99533765141c1d3 Mon Sep 17 00:00:00 2001 From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com> Date: Wed, 25 Mar 2026 11:10:54 +0100 Subject: [PATCH 03/30] Add multicluster support for the event recorder (#619) ## Changes - Add cluster routing capabilities to EventRecorder --- docs/guides/multicluster/readme.md | 2 +- docs/guides/multicluster/run.sh | 8 +- docs/guides/multicluster/schedule.sh | 253 +++++++++++++++++ docs/guides/multicluster/test-pipeline.yaml | 12 + .../filter_weigher_pipeline_controller.go | 2 +- .../filter_weigher_pipeline_controller.go | 2 +- .../filter_weigher_pipeline_controller.go | 2 +- .../filter_weigher_pipeline_controller.go | 2 +- .../filter_weigher_pipeline_controller.go | 2 +- .../reservations/failover/controller.go | 2 +- pkg/multicluster/client_test.go | 18 +- pkg/multicluster/recorder.go | 83 ++++++ pkg/multicluster/recorder_test.go | 259 ++++++++++++++++++ 13 files changed, 636 insertions(+), 11 deletions(-) create mode 100755 docs/guides/multicluster/schedule.sh create mode 100644 docs/guides/multicluster/test-pipeline.yaml create mode 100644 pkg/multicluster/recorder.go create mode 100644 pkg/multicluster/recorder_test.go diff --git a/docs/guides/multicluster/readme.md b/docs/guides/multicluster/readme.md index f65d19486..ab9c9e60a 100644 --- a/docs/guides/multicluster/readme.md +++ b/docs/guides/multicluster/readme.md @@ -1,7 +1,7 @@ # Cortex Multi-Cluster Testing > [!NOTE] -> If you want to skip the reading part, there's `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you. +> If you want to skip the reading part, there's `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you. 
If you want to test the multi-cluster setup you can run the `schedule.sh` script, which will create a scheduling request and show you how it gets processed across the clusters. Cortex provides support for multi-cluster deployments, where a "home" cluster hosts the cortex pods and one or more "remote" clusters are used to persist CRDs. A typical use case for this would be to offload the etcd storage for Cortex CRDs to a remote cluster, reducing the resource usage on the home cluster. Similarly, another use case is to have multiple remote clusters that maintain all the compute workloads and expose resources that Cortex needs to access, such as the `Hypervisor` resource. diff --git a/docs/guides/multicluster/run.sh b/docs/guides/multicluster/run.sh index 063de8ab6..f68956870 100755 --- a/docs/guides/multicluster/run.sh +++ b/docs/guides/multicluster/run.sh @@ -52,16 +52,20 @@ global: gvks: - kvm.cloud.sap/v1/Hypervisor - kvm.cloud.sap/v1/HypervisorList + - cortex.cloud/v1alpha1/History + - cortex.cloud/v1alpha1/HistoryList labels: - az: cortex-remote-az-a + availabilityZone: cortex-remote-az-a caCert: | $(cat /tmp/root-ca-remote-az-a.pem | sed 's/^/ /') - host: https://host.docker.internal:8445 gvks: - kvm.cloud.sap/v1/Hypervisor - kvm.cloud.sap/v1/HypervisorList + - cortex.cloud/v1alpha1/History + - cortex.cloud/v1alpha1/HistoryList labels: - az: cortex-remote-az-b + availabilityZone: cortex-remote-az-b caCert: | $(cat /tmp/root-ca-remote-az-b.pem | sed 's/^/ /') EOF diff --git a/docs/guides/multicluster/schedule.sh b/docs/guides/multicluster/schedule.sh new file mode 100755 index 000000000..a1caaf24d --- /dev/null +++ b/docs/guides/multicluster/schedule.sh @@ -0,0 +1,253 @@ +#!/bin/bash + +set -e + +API_URL="http://localhost:8001/scheduler/nova/external" +INSTANCE_UUID="cortex-test-instance-001" +HISTORY_NAME="nova-$INSTANCE_UUID" + +# --- Step 1: Apply the test pipeline ----------------------------------------- + +echo "=== Step 1: Apply test pipeline ===" +echo "" 
+echo "The test pipeline is a minimal filter-weigher pipeline with:" +echo " - createHistory: true (so a History CRD is created for each decision)" +echo " - filter_correct_az (filters hosts not matching the requested AZ)" +echo " - no weighers (hosts are returned in their original order)" +echo "" + +kubectl --context kind-cortex-home apply -f docs/guides/multicluster/test-pipeline.yaml + +echo "" +echo "Press enter to send a scheduling request..." +read -r + +# --- Step 2: Send scheduling request ----------------------------------------- + +echo "=== Step 2: Send scheduling request ===" +echo "" +echo "Sending a Nova external scheduler request to the cortex API." +echo "" +echo " Instance UUID: $INSTANCE_UUID" +echo " Availability Zone: cortex-remote-az-b" +echo " Pipeline: multicluster-test" +echo " Candidate hosts: hypervisor-{1,2}-az-{a,b} (4 hosts across 2 AZs)" +echo "" +echo "The pipeline's filter_correct_az step should filter out the az-a hosts," +echo "leaving only hypervisor-1-az-b and hypervisor-2-az-b." +echo "" + +RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$API_URL" \ + -H "Content-Type: application/json" \ + -d @- </dev/null || echo "$BODY" + +if [ "$HTTP_CODE" != "200" ]; then + echo "" + echo "ERROR: Scheduling request failed. Check the controller logs:" + echo " kubectl --context kind-cortex-home logs deploy/cortex-nova-scheduling-controller-manager" + exit 1 +fi + +echo "" +echo "Press enter to check History CRDs and events across all clusters..." +read -r + +# --- Step 3: Check History and Events ---------------------------------------- + +echo "=== Step 3: Check History CRDs and Events ===" +echo "" +echo "The pipeline has createHistory: true, so a History CRD named '$HISTORY_NAME'" +echo "should have been created. An event should also have been recorded on it." +echo "Based on the multicluster config, this should be on the remote cluster cortex-remote-az-b." 
+echo "" + +sleep 1 + +for CLUSTER in kind-cortex-home kind-cortex-remote-az-a kind-cortex-remote-az-b; do + echo "--- $CLUSTER ---" + echo "Histories:" + kubectl --context "$CLUSTER" get histories 2>/dev/null || echo " (none)" + echo "Events:" + kubectl --context "$CLUSTER" get events --field-selector reason=SchedulingSucceeded 2>/dev/null || echo " (none)" + echo "" +done + +echo "Press enter to describe the History CRD and see the full scheduling result..." +read -r + +# --- Step 4: Describe History ------------------------------------------------ + +echo "=== Step 4: Describe History CRD ===" +echo "" +echo "The History CRD contains the full scheduling decision context:" +echo " - Which pipeline was used" +echo " - The target host that was selected" +echo " - An explanation of each filter/weigher step" +echo " - The Ready condition (True = host selected, False = no host found)" +echo "" + +# Try all clusters to find where the History ended up. +for CLUSTER in kind-cortex-home kind-cortex-remote-az-a kind-cortex-remote-az-b; do + if kubectl --context "$CLUSTER" get history "$HISTORY_NAME" &>/dev/null; then + echo "Found History '$HISTORY_NAME' in $CLUSTER:" + echo "" + kubectl --context "$CLUSTER" describe history "$HISTORY_NAME" + exit 0 + fi +done + +echo "WARNING: History '$HISTORY_NAME' was not found in any cluster." +echo "Check the controller logs for errors:" +echo " kubectl --context kind-cortex-home logs deploy/cortex-nova-scheduling-controller-manager | grep -i history" diff --git a/docs/guides/multicluster/test-pipeline.yaml b/docs/guides/multicluster/test-pipeline.yaml new file mode 100644 index 000000000..667f4a7fe --- /dev/null +++ b/docs/guides/multicluster/test-pipeline.yaml @@ -0,0 +1,12 @@ +apiVersion: cortex.cloud/v1alpha1 +kind: Pipeline +metadata: + name: multicluster-test +spec: + schedulingDomain: nova + description: Minimal test pipeline for the multicluster guide. 
+ type: filter-weigher + createHistory: true + filters: + - name: filter_correct_az + weighers: [] diff --git a/internal/scheduling/cinder/filter_weigher_pipeline_controller.go b/internal/scheduling/cinder/filter_weigher_pipeline_controller.go index 7163cc5a6..52ec37306 100644 --- a/internal/scheduling/cinder/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/cinder/filter_weigher_pipeline_controller.go @@ -148,7 +148,7 @@ func (c *FilterWeigherPipelineController) InitPipeline( func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainCinder - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-cinder-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-cinder-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/machines/filter_weigher_pipeline_controller.go b/internal/scheduling/machines/filter_weigher_pipeline_controller.go index d76203812..35d51708a 100644 --- a/internal/scheduling/machines/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/machines/filter_weigher_pipeline_controller.go @@ -222,7 +222,7 @@ func (c *FilterWeigherPipelineController) handleMachine() handler.EventHandler { func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainMachines - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-machines-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-machines-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/manila/filter_weigher_pipeline_controller.go 
b/internal/scheduling/manila/filter_weigher_pipeline_controller.go index 6ab938511..128b7d719 100644 --- a/internal/scheduling/manila/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/manila/filter_weigher_pipeline_controller.go @@ -148,7 +148,7 @@ func (c *FilterWeigherPipelineController) InitPipeline( func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainManila - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-manila-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-manila-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/nova/filter_weigher_pipeline_controller.go b/internal/scheduling/nova/filter_weigher_pipeline_controller.go index bb9c5bb07..279ac1c3e 100644 --- a/internal/scheduling/nova/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/nova/filter_weigher_pipeline_controller.go @@ -199,7 +199,7 @@ func (c *FilterWeigherPipelineController) InitPipeline( func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainNova - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-nova-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-nova-scheduler")} c.gatherer = &candidateGatherer{Client: mcl} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err diff --git a/internal/scheduling/pods/filter_weigher_pipeline_controller.go b/internal/scheduling/pods/filter_weigher_pipeline_controller.go index ceba977b8..0ceee6485 100644 --- a/internal/scheduling/pods/filter_weigher_pipeline_controller.go +++ 
b/internal/scheduling/pods/filter_weigher_pipeline_controller.go @@ -234,7 +234,7 @@ func (c *FilterWeigherPipelineController) handlePod() handler.EventHandler { func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainPods - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-pods-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-pods-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/reservations/failover/controller.go b/internal/scheduling/reservations/failover/controller.go index fbb4f9c01..f2175adc3 100644 --- a/internal/scheduling/reservations/failover/controller.go +++ b/internal/scheduling/reservations/failover/controller.go @@ -766,7 +766,7 @@ func (c *FailoverReservationController) patchReservationStatus(ctx context.Conte // SetupWithManager sets up the watch-based reconciler with the Manager. // This handles per-reservation reconciliation triggered by CRD changes. 
func (c *FailoverReservationController) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error { - c.Recorder = mgr.GetEventRecorder("failover-reservation-controller") + c.Recorder = mcl.GetEventRecorder("failover-reservation-controller") bldr := multicluster.BuildController(mcl, mgr) bldr, err := bldr.WatchesMulticluster( diff --git a/pkg/multicluster/client_test.go b/pkg/multicluster/client_test.go index 64b0ae94c..4ea3900cb 100644 --- a/pkg/multicluster/client_test.go +++ b/pkg/multicluster/client_test.go @@ -13,6 +13,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/tools/events" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" @@ -74,8 +76,9 @@ func (f *fakeCache) getIndexFieldCalls() []indexFieldCall { // fakeCluster implements cluster.Cluster interface for testing. 
type fakeCluster struct { cluster.Cluster - fakeClient client.Client - fakeCache *fakeCache + fakeClient client.Client + fakeCache *fakeCache + fakeRecorder events.EventRecorder } func (f *fakeCluster) GetClient() client.Client { @@ -86,6 +89,17 @@ func (f *fakeCluster) GetCache() cache.Cache { return f.fakeCache } +func (f *fakeCluster) GetEventRecorder(_ string) events.EventRecorder { + if f.fakeRecorder != nil { + return f.fakeRecorder + } + return &fakeEventRecorder{} +} + +func (f *fakeCluster) GetEventRecorderFor(_ string) record.EventRecorder { + return record.NewFakeRecorder(100) +} + func newFakeCluster(scheme *runtime.Scheme, objs ...client.Object) *fakeCluster { return &fakeCluster{ fakeClient: fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build(), diff --git a/pkg/multicluster/recorder.go b/pkg/multicluster/recorder.go new file mode 100644 index 000000000..8c7c78d3d --- /dev/null +++ b/pkg/multicluster/recorder.go @@ -0,0 +1,83 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package multicluster + +import ( + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/events" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cluster" +) + +// MultiClusterRecorder implements events.EventRecorder and routes events to the +// correct cluster based on the GVK of the "regarding" object. It uses the same +// routing logic as the multicluster Client's write path. +type MultiClusterRecorder struct { + client *Client + homeRecorder events.EventRecorder + recorders map[cluster.Cluster]events.EventRecorder +} + +// GetEventRecorder creates a multi-cluster-aware EventRecorder. It pre-creates +// a per-cluster recorder for the home cluster and every remote cluster currently +// registered in the client. The name parameter is passed through to each +// cluster's GetEventRecorder method (it becomes the reportingController in the +// Kubernetes Event). 
+func (c *Client) GetEventRecorder(name string) events.EventRecorder { + homeRecorder := c.HomeCluster.GetEventRecorder(name) + + recorders := make(map[cluster.Cluster]events.EventRecorder) + recorders[c.HomeCluster] = homeRecorder + + c.remoteClustersMu.RLock() + defer c.remoteClustersMu.RUnlock() + + for _, remotes := range c.remoteClusters { + for _, r := range remotes { + if _, exists := recorders[r.cluster]; !exists { + recorders[r.cluster] = r.cluster.GetEventRecorder(name) + } + } + } + + return &MultiClusterRecorder{ + client: c, + homeRecorder: homeRecorder, + recorders: recorders, + } +} + +// Eventf routes the event to the cluster that owns the "regarding" object. +// Falls back to the home cluster recorder if routing fails. +func (r *MultiClusterRecorder) Eventf(regarding, related runtime.Object, eventtype, reason, action, note string, args ...any) { + recorder := r.recorderFor(regarding) + recorder.Eventf(regarding, related, eventtype, reason, action, note, args...) +} + +// recorderFor resolves which per-cluster recorder to use for the given object. 
+func (r *MultiClusterRecorder) recorderFor(obj runtime.Object) events.EventRecorder { + if obj == nil { + return r.homeRecorder + } + + gvk, err := r.client.GVKFromHomeScheme(obj) + if err != nil { + ctrl.Log.V(1).Info("multi-cluster recorder: failed to resolve GVK, using home recorder", "error", err) + return r.homeRecorder + } + + cl, err := r.client.clusterForWrite(gvk, obj) + if err != nil { + ctrl.Log.V(1).Info("multi-cluster recorder: no cluster matched, using home recorder", "gvk", gvk, "error", err) + return r.homeRecorder + } + + recorder, ok := r.recorders[cl] + if !ok { + ctrl.Log.V(1).Info("multi-cluster recorder: no pre-created recorder for cluster, using home recorder", "gvk", gvk) + return r.homeRecorder + } + + return recorder +} diff --git a/pkg/multicluster/recorder_test.go b/pkg/multicluster/recorder_test.go new file mode 100644 index 000000000..90bdd71dc --- /dev/null +++ b/pkg/multicluster/recorder_test.go @@ -0,0 +1,259 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package multicluster + +import ( + "fmt" + "sync" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +// fakeEventRecorder captures Eventf calls for assertions. 
+type fakeEventRecorder struct { + mu sync.Mutex + calls []eventfCall +} + +type eventfCall struct { + regarding runtime.Object + eventtype string + reason string + action string + note string +} + +func (f *fakeEventRecorder) Eventf(regarding, _ runtime.Object, eventtype, reason, action, note string, args ...any) { + f.mu.Lock() + defer f.mu.Unlock() + f.calls = append(f.calls, eventfCall{ + regarding: regarding, + eventtype: eventtype, + reason: reason, + action: action, + note: fmt.Sprintf(note, args...), + }) +} + +func (f *fakeEventRecorder) getCalls() []eventfCall { + f.mu.Lock() + defer f.mu.Unlock() + out := make([]eventfCall, len(f.calls)) + copy(out, f.calls) + return out +} + +func TestMultiClusterRecorder_HomeGVK(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + homeCluster := &fakeCluster{ + fakeClient: nil, + fakeCache: &fakeCache{}, + fakeRecorder: homeRecorder, + } + + historyGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "History"} + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + homeGVKs: map[schema.GroupVersionKind]bool{historyGVK: true}, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + history := &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{Name: "nova-uuid-1"}, + } + recorder.Eventf(history, nil, corev1.EventTypeNormal, "SchedulingSucceeded", "Scheduled", "selected host: %s", "compute-1") + + calls := homeRecorder.getCalls() + if len(calls) != 1 { + t.Fatalf("expected 1 call, got %d", len(calls)) + } + if calls[0].eventtype != corev1.EventTypeNormal { + t.Errorf("expected event type %q, got %q", corev1.EventTypeNormal, calls[0].eventtype) + } + if calls[0].reason != "SchedulingSucceeded" { + t.Errorf("expected reason %q, got %q", "SchedulingSucceeded", calls[0].reason) + } + if calls[0].note != "selected host: compute-1" { + t.Errorf("expected note %q, got %q", "selected host: compute-1", calls[0].note) + } +} + +func 
TestMultiClusterRecorder_RemoteGVK(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + remoteRecorder := &fakeEventRecorder{} + + homeCluster := &fakeCluster{ + fakeClient: nil, + fakeCache: &fakeCache{}, + fakeRecorder: homeRecorder, + } + remote := &fakeCluster{ + fakeClient: nil, + fakeCache: &fakeCache{}, + fakeRecorder: remoteRecorder, + } + + reservationGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "Reservation"} + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + ResourceRouters: map[schema.GroupVersionKind]ResourceRouter{ + reservationGVK: ReservationsResourceRouter{}, + }, + remoteClusters: map[schema.GroupVersionKind][]remoteCluster{ + reservationGVK: {{cluster: remote, labels: map[string]string{"availabilityZone": "az-a"}}}, + }, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + res := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-1"}, + Spec: v1alpha1.ReservationSpec{AvailabilityZone: "az-a"}, + } + recorder.Eventf(res, nil, corev1.EventTypeNormal, "ValidationPassed", "Validated", "reservation validated") + + // Event should go to the remote recorder, not home. 
+ homeCalls := homeRecorder.getCalls() + if len(homeCalls) != 0 { + t.Errorf("expected 0 home calls, got %d", len(homeCalls)) + } + remoteCalls := remoteRecorder.getCalls() + if len(remoteCalls) != 1 { + t.Fatalf("expected 1 remote call, got %d", len(remoteCalls)) + } + if remoteCalls[0].action != "Validated" { + t.Errorf("expected action %q, got %q", "Validated", remoteCalls[0].action) + } +} + +func TestMultiClusterRecorder_MultipleRemotes(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + remoteARecorder := &fakeEventRecorder{} + remoteBRecorder := &fakeEventRecorder{} + + homeCluster := &fakeCluster{fakeRecorder: homeRecorder, fakeCache: &fakeCache{}} + remoteA := &fakeCluster{fakeRecorder: remoteARecorder, fakeCache: &fakeCache{}} + remoteB := &fakeCluster{fakeRecorder: remoteBRecorder, fakeCache: &fakeCache{}} + + reservationGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "Reservation"} + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + ResourceRouters: map[schema.GroupVersionKind]ResourceRouter{ + reservationGVK: ReservationsResourceRouter{}, + }, + remoteClusters: map[schema.GroupVersionKind][]remoteCluster{ + reservationGVK: { + {cluster: remoteA, labels: map[string]string{"availabilityZone": "az-a"}}, + {cluster: remoteB, labels: map[string]string{"availabilityZone": "az-b"}}, + }, + }, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + // Event for az-b should go to remoteB. 
+ res := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-b"}, + Spec: v1alpha1.ReservationSpec{AvailabilityZone: "az-b"}, + } + recorder.Eventf(res, nil, corev1.EventTypeWarning, "SchedulingFailed", "FailedScheduling", "no host found") + + remoteCallsA := remoteARecorder.getCalls() + if len(remoteCallsA) != 0 { + t.Errorf("expected 0 calls to remote-a, got %d", len(remoteCallsA)) + } + remoteCallsB := remoteBRecorder.getCalls() + if len(remoteCallsB) != 1 { + t.Fatalf("expected 1 call to remote-b, got %d", len(remoteCallsB)) + } + if remoteCallsB[0].reason != "SchedulingFailed" { + t.Errorf("expected reason %q, got %q", "SchedulingFailed", remoteCallsB[0].reason) + } +} + +func TestMultiClusterRecorder_FallbackOnUnknownGVK(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + homeCluster := &fakeCluster{fakeRecorder: homeRecorder, fakeCache: &fakeCache{}} + + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + homeGVKs: map[schema.GroupVersionKind]bool{}, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + // unknownType is not in the scheme — should fall back to home recorder. 
+ obj := &unknownType{ObjectMeta: metav1.ObjectMeta{Name: "unknown-1"}} + recorder.Eventf(obj, nil, corev1.EventTypeNormal, "Test", "Test", "test message") + + if len(homeRecorder.getCalls()) != 1 { + t.Fatalf("expected 1 home call on fallback, got %d", len(homeRecorder.getCalls())) + } +} + +func TestMultiClusterRecorder_FallbackOnNilRegarding(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + homeCluster := &fakeCluster{fakeRecorder: homeRecorder, fakeCache: &fakeCache{}} + + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + } + + recorder := mcl.GetEventRecorder("test-recorder") + recorder.Eventf(nil, nil, corev1.EventTypeNormal, "Test", "Test", "nil regarding") + + if len(homeRecorder.getCalls()) != 1 { + t.Fatalf("expected 1 home call for nil regarding, got %d", len(homeRecorder.getCalls())) + } +} + +func TestMultiClusterRecorder_ConcurrentEventf(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + homeCluster := &fakeCluster{fakeRecorder: homeRecorder, fakeCache: &fakeCache{}} + + historyGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "History"} + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + homeGVKs: map[schema.GroupVersionKind]bool{historyGVK: true}, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + const goroutines = 20 + var wg sync.WaitGroup + wg.Add(goroutines) + for i := range goroutines { + go func(n int) { + defer wg.Done() + history := &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("history-%d", n)}, + } + recorder.Eventf(history, nil, corev1.EventTypeNormal, "Test", "Test", "event %d", n) + }(i) + } + wg.Wait() + + homeCalls := homeRecorder.getCalls() + if len(homeCalls) != goroutines { + t.Errorf("expected %d calls, got %d", goroutines, len(homeCalls)) + } +} From c8d2bac0a31613b3f6c68deb1aadbf853589a58e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 
2026 10:20:21 +0000 Subject: [PATCH 04/30] Bump cortex chart appVersions to sha-72e6a960 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 1a04d0e9a..7d49f3bf4 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.31 -appVersion: "sha-74b78a6a" +appVersion: "sha-72e6a960" icon: "https://example.com/icon.png" dependencies: [] From 76321376a7716e8481284e6643f535be6cc92f1b Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Wed, 25 Mar 2026 11:33:24 +0100 Subject: [PATCH 05/30] Update commitments with support for non-standard units (#624) --- Tiltfile | 7 + cmd/main.go | 8 +- go.mod | 6 +- go.sum | 8 +- .../cortex-nova/alerts/nova.alerts.yaml | 20 ++- .../reservations/commitments/api.go | 3 + .../reservations/commitments/api_info.go | 55 +++++- .../commitments/api_info_monitor.go | 53 ++++++ .../reservations/commitments/api_info_test.go | 71 +++++++- .../reservations/commitments/context.go | 6 + .../reservations/commitments/controller.go | 13 +- .../reservations/commitments/e2e_checks.go | 77 ++++++++ .../reservations/commitments/syncer.go | 64 +++++-- .../commitments/syncer_monitor.go | 63 +++++++ .../reservations/commitments/syncer_test.go | 165 ++++++++++++++++++ .../reservations/scheduler_client.go | 1 + 16 files changed, 578 insertions(+), 42 deletions(-) create mode 100644 internal/scheduling/reservations/commitments/api_info_monitor.go create mode 100644 internal/scheduling/reservations/commitments/e2e_checks.go create mode 100644 internal/scheduling/reservations/commitments/syncer_monitor.go diff --git a/Tiltfile b/Tiltfile index a42fe43f4..6871d18b3 100644 --- a/Tiltfile +++ b/Tiltfile @@ -189,6 +189,13 @@ if 'nova' in ACTIVE_DEPLOYMENTS: 
trigger_mode=TRIGGER_MODE_MANUAL, auto_init=False, ) + local_resource( + 'Commitments E2E Tests', + '/bin/sh -c "kubectl exec deploy/cortex-nova-scheduling-controller-manager -- /manager e2e-commitments"', + labels=['Cortex-Nova'], + trigger_mode=TRIGGER_MODE_MANUAL, + auto_init=False, + ) if 'manila' in ACTIVE_DEPLOYMENTS: print("Activating Cortex Manila bundle") diff --git a/cmd/main.go b/cmd/main.go index 351bf29eb..5d8acf1e7 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -110,6 +110,10 @@ func main() { manilaChecksConfig := conf.GetConfigOrDie[manila.ChecksConfig]() manila.RunChecks(ctx, client, manilaChecksConfig) return + case "e2e-commitments": + commitmentsChecksConfig := conf.GetConfigOrDie[commitments.E2EChecksConfig]() + commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig) + return } } @@ -665,7 +669,9 @@ func main() { if slices.Contains(mainConfig.EnabledTasks, "commitments-sync-task") { setupLog.Info("starting commitments syncer") - syncer := commitments.NewSyncer(multiclusterClient) + syncerMonitor := commitments.NewSyncerMonitor() + must.Succeed(metrics.Registry.Register(syncerMonitor)) + syncer := commitments.NewSyncer(multiclusterClient, syncerMonitor) syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]() syncerDefaults := commitments.DefaultSyncerConfig() if syncerConfig.SyncInterval == 0 { diff --git a/go.mod b/go.mod index cf74f39f1..a6c3b9ce7 100644 --- a/go.mod +++ b/go.mod @@ -72,7 +72,7 @@ require ( github.com/poy/onpar v0.3.5 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.17.0 // indirect - github.com/sapcc/go-api-declarations v1.20.2 + github.com/sapcc/go-api-declarations v1.21.0 github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/cobra v1.10.1 // indirect github.com/spf13/pflag v1.0.10 // indirect @@ -98,7 +98,7 @@ require ( golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/sys v0.42.0 // indirect - golang.org/x/term 
v0.41.0 // indirect + golang.org/x/term v0.41.0 golang.org/x/text v0.33.0 // indirect golang.org/x/time v0.14.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect @@ -108,7 +108,7 @@ require ( google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect + gopkg.in/yaml.v3 v3.0.1 gotest.tools v2.2.0+incompatible // indirect k8s.io/apiextensions-apiserver v0.35.0 // indirect k8s.io/apiserver v0.35.0 // indirect diff --git a/go.sum b/go.sum index 98a4f9477..6dba2b5cc 100644 --- a/go.sum +++ b/go.sum @@ -176,8 +176,8 @@ github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUO github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/sapcc/go-api-declarations v1.20.2 h1:GWqv8VgsF4k9id6N051AVTaEpcjT02APsOuz2yCvTPQ= -github.com/sapcc/go-api-declarations v1.20.2/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc= +github.com/sapcc/go-api-declarations v1.21.0 h1:Ag6GXgJLTFdBDKmrJU4QFllQbgGSenSGeHpLuvuxeDk= +github.com/sapcc/go-api-declarations v1.21.0/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc= github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e h1:4wgkrfAlnL6ffM7HTNoHn1HrBBurCRR71WNOszdiDNQ= github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e/go.mod h1:NZjMiGVm04U25vwR6ZWvMw0XOOnvS1jkmXpjiepOeUw= github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= @@ -251,12 +251,8 @@ golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys 
v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= -golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= -golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= -golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 9b80e079b..a69f3be51 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -276,7 +276,25 @@ groups: configuration. It is recommended to investigate the pipeline status and logs for more details. - # Committed Resource (Limes Integration) Alerts + # Committed Resource Info API Alerts + - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource info API HTTP 500 errors too high" + description: > + The committed resource info API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems building service info, + such as invalid flavor group data. Limes will not be able to discover available + resources until the issue is resolved. 
+ + # Committed Resource Change API Alerts - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 for: 5m diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go index daedb8a2f..3c6179d83 100644 --- a/internal/scheduling/reservations/commitments/api.go +++ b/internal/scheduling/reservations/commitments/api.go @@ -28,6 +28,7 @@ type HTTPAPI struct { monitor ChangeCommitmentsAPIMonitor usageMonitor ReportUsageAPIMonitor capacityMonitor ReportCapacityAPIMonitor + infoMonitor InfoAPIMonitor // Mutex to serialize change-commitments requests changeMutex sync.Mutex } @@ -44,6 +45,7 @@ func NewAPIWithConfig(client client.Client, config Config, novaClient UsageNovaC monitor: NewChangeCommitmentsAPIMonitor(), usageMonitor: NewReportUsageAPIMonitor(), capacityMonitor: NewReportCapacityAPIMonitor(), + infoMonitor: NewInfoAPIMonitor(), } } @@ -51,6 +53,7 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log registry.MustRegister(&api.monitor) registry.MustRegister(&api.usageMonitor) registry.MustRegister(&api.capacityMonitor) + registry.MustRegister(&api.infoMonitor) mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments) mux.HandleFunc("/v1/commitments/report-capacity", api.HandleReportCapacity) mux.HandleFunc("/v1/commitments/info", api.HandleInfo) diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go index c189c859a..7c8c72748 100644 --- a/internal/scheduling/reservations/commitments/api_info.go +++ b/internal/scheduling/reservations/commitments/api_info.go @@ -6,9 +6,12 @@ package commitments import ( "context" "encoding/json" + "errors" "fmt" "net/http" + "strconv" "strings" + "time" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" 
"github.com/go-logr/logr" @@ -16,10 +19,16 @@ import ( liquid "github.com/sapcc/go-api-declarations/liquid" ) +// errInternalServiceInfo indicates an internal error while building service info (e.g., invalid unit configuration) +var errInternalServiceInfo = errors.New("internal error building service info") + // handles GET /v1/info requests from Limes: // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { + startTime := time.Now() + statusCode := http.StatusOK + // Extract or generate request ID for tracing requestID := r.Header.Get("X-Request-ID") if requestID == "" { @@ -33,7 +42,9 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { // Only accept GET method if r.Method != http.MethodGet { - http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + statusCode = http.StatusMethodNotAllowed + http.Error(w, "Method not allowed", statusCode) + api.recordInfoMetrics(statusCode, startTime) return } @@ -42,20 +53,35 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { // Build info response info, err := api.buildServiceInfo(ctx, logger) if err != nil { - // Use Info level for expected conditions like knowledge not being ready yet - logger.Info("service info not available yet", "error", err.Error()) - http.Error(w, "Service temporarily unavailable: "+err.Error(), - http.StatusServiceUnavailable) + if errors.Is(err, errInternalServiceInfo) { + logger.Error(err, "internal error building service info") + statusCode = http.StatusInternalServerError + http.Error(w, "Internal server error: "+err.Error(), statusCode) + } else { + // Use Info level for expected conditions like knowledge not being ready yet + logger.Info("service info not available yet", "error", err.Error()) + statusCode = http.StatusServiceUnavailable + http.Error(w, "Service 
temporarily unavailable: "+err.Error(), statusCode) + } + api.recordInfoMetrics(statusCode, startTime) return } // Return response w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusOK) + w.WriteHeader(statusCode) if err := json.NewEncoder(w).Encode(info); err != nil { logger.Error(err, "failed to encode service info") - return } + api.recordInfoMetrics(statusCode, startTime) +} + +// recordInfoMetrics records Prometheus metrics for an info API request. +func (api *HTTPAPI) recordInfoMetrics(statusCode int, startTime time.Time) { + duration := time.Since(startTime).Seconds() + statusCodeStr := strconv.Itoa(statusCode) + api.infoMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc() + api.infoMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration) } // resourceAttributes holds the custom attributes for a resource in the info API response. @@ -108,9 +134,22 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l attrsJSON = nil } + // Build unit from smallest flavor memory (e.g., "131072 MiB" for 128 GiB) + // Validate memory is positive to avoid panic in MultiplyBy (which panics on factor=0) + if groupData.SmallestFlavor.MemoryMB == 0 { + return liquid.ServiceInfo{}, fmt.Errorf("%w: flavor group %q has invalid smallest flavor with memoryMB=0", + errInternalServiceInfo, groupName) + } + unit, err := liquid.UnitMebibytes.MultiplyBy(groupData.SmallestFlavor.MemoryMB) + if err != nil { + // Note: This error only occurs on uint64 overflow, which is unrealistic for memory values + return liquid.ServiceInfo{}, fmt.Errorf("%w: failed to create unit for flavor group %q: %w", + errInternalServiceInfo, groupName, err) + } + resources[resourceName] = liquid.ResourceInfo{ DisplayName: displayName, - Unit: liquid.UnitNone, // Countable: multiples of smallest flavor instances + Unit: unit, // Non-standard unit: multiples of smallest flavor RAM Topology: liquid.AZAwareTopology, // Commitments are per-AZ 
NeedsResourceDemand: false, // Capacity planning out of scope for now HasCapacity: handlesCommitments, // We report capacity via /v1/report-capacity only for groups that accept commitments diff --git a/internal/scheduling/reservations/commitments/api_info_monitor.go b/internal/scheduling/reservations/commitments/api_info_monitor.go new file mode 100644 index 000000000..aaae937ef --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_info_monitor.go @@ -0,0 +1,53 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// InfoAPIMonitor provides metrics for the CR info API. +type InfoAPIMonitor struct { + requestCounter *prometheus.CounterVec + requestDuration *prometheus.HistogramVec +} + +// NewInfoAPIMonitor creates a new monitor with Prometheus metrics. +// Metrics are pre-initialized with zero values for common HTTP status codes +// to ensure they appear in Prometheus before the first request. +func NewInfoAPIMonitor() InfoAPIMonitor { + m := InfoAPIMonitor{ + requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_info_api_requests_total", + Help: "Total number of committed resource info API requests by HTTP status code", + }, []string{"status_code"}), + requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "cortex_committed_resource_info_api_request_duration_seconds", + Help: "Duration of committed resource info API requests in seconds by HTTP status code", + Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10}, + }, []string{"status_code"}), + } + + // Pre-initialize metrics with zero values for common HTTP status codes. + // This ensures metrics exist in Prometheus before the first request, + // preventing "metric missing" warnings in alerting rules. 
+ for _, statusCode := range []string{"200", "405", "500", "503"} { + m.requestCounter.WithLabelValues(statusCode) + m.requestDuration.WithLabelValues(statusCode) + } + + return m +} + +// Describe implements prometheus.Collector. +func (m *InfoAPIMonitor) Describe(ch chan<- *prometheus.Desc) { + m.requestCounter.Describe(ch) + m.requestDuration.Describe(ch) +} + +// Collect implements prometheus.Collector. +func (m *InfoAPIMonitor) Collect(ch chan<- prometheus.Metric) { + m.requestCounter.Collect(ch) + m.requestDuration.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/api_info_test.go b/internal/scheduling/reservations/commitments/api_info_test.go index 828a255ff..4f42cd2a2 100644 --- a/internal/scheduling/reservations/commitments/api_info_test.go +++ b/internal/scheduling/reservations/commitments/api_info_test.go @@ -28,9 +28,7 @@ func TestHandleInfo_KnowledgeNotReady(t *testing.T) { WithScheme(scheme). Build() - api := &HTTPAPI{ - client: k8sClient, - } + api := NewAPI(k8sClient) req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) w := httptest.NewRecorder() @@ -62,9 +60,7 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) { WithScheme(scheme). Build() - api := &HTTPAPI{ - client: k8sClient, - } + api := NewAPI(k8sClient) // Use POST instead of GET req := httptest.NewRequest(http.MethodPost, "/v1/info", http.NoBody) @@ -80,6 +76,67 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) { } } +func TestHandleInfo_InvalidFlavorMemory(t *testing.T) { + // Test that a 500 Internal Server Error is returned when a flavor group has invalid data. + // + // A flavor with memoryMB=0 is invalid and should trigger an HTTP 500 error. + // Such data could occur from a bug in the flavor groups extractor. 
+ scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + // Create flavor group with memoryMB=0 (invalid data that could come from a buggy extractor) + features := []map[string]interface{}{ + { + "name": "invalid_group", + "flavors": []map[string]interface{}{ + {"name": "zero_memory_flavor", "vcpus": 4, "memoryMB": 0, "diskGB": 50}, + }, + "largestFlavor": map[string]interface{}{"name": "zero_memory_flavor", "vcpus": 4, "memoryMB": 0, "diskGB": 50}, + "smallestFlavor": map[string]interface{}{"name": "zero_memory_flavor", "vcpus": 4, "memoryMB": 0, "diskGB": 50}, + "ramCoreRatio": 4096, + }, + } + + raw, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatalf("failed to box features: %v", err) + } + + knowledge := &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{Name: "flavor-groups"}, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"}, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []v1.Condition{{Type: v1alpha1.KnowledgeConditionReady, Status: "True"}}, + Raw: raw, + LastContentChange: v1.Now(), + }, + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). 
+ Build() + + api := NewAPI(k8sClient) + + req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) + w := httptest.NewRecorder() + api.HandleInfo(w, req) + + resp := w.Result() + defer resp.Body.Close() + + // Should return 500 Internal Server Error when unit creation fails + if resp.StatusCode != http.StatusInternalServerError { + t.Errorf("expected status code %d (Internal Server Error), got %d", http.StatusInternalServerError, resp.StatusCode) + } +} + func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { // Test that HasCapacity == HandlesCommitments for all resources // Both should be true only for groups with fixed RAM/core ratio @@ -138,7 +195,7 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { WithObjects(knowledge). Build() - api := &HTTPAPI{client: k8sClient} + api := NewAPI(k8sClient) req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) w := httptest.NewRecorder() diff --git a/internal/scheduling/reservations/commitments/context.go b/internal/scheduling/reservations/commitments/context.go index 64257fcae..aabee0241 100644 --- a/internal/scheduling/reservations/commitments/context.go +++ b/internal/scheduling/reservations/commitments/context.go @@ -21,6 +21,12 @@ func WithNewGlobalRequestID(ctx context.Context) context.Context { return reservations.WithGlobalRequestID(ctx, "committed-resource-"+uuid.New().String()) } +// WithGlobalRequestID creates a new context with the specified global request ID. +// This is used to propagate existing request IDs (e.g., from the creator annotation). +func WithGlobalRequestID(ctx context.Context, greq string) context.Context { + return reservations.WithGlobalRequestID(ctx, greq) +} + // LoggerFromContext returns a logger with greq and req values from the context. // This creates a child logger with the request tracking values pre-attached, // so you don't need to repeat them in every log call. 
diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index e03958151..71530cb59 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -49,20 +49,21 @@ type CommitmentReservationController struct { // move the current state of the cluster closer to the desired state. // Note: This controller only handles commitment reservations, as filtered by the predicate. func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - ctx = WithNewGlobalRequestID(ctx) - logger := LoggerFromContext(ctx).WithValues("component", "controller", "reservation", req.Name) - - // Fetch the reservation object. + // Fetch the reservation object first to check for creator request ID. var res v1alpha1.Reservation if err := r.Get(ctx, req.NamespacedName, &res); err != nil { // Ignore not-found errors, since they can't be fixed by an immediate requeue return ctrl.Result{}, client.IgnoreNotFound(err) } - // Extract creator request ID from annotation for end-to-end traceability + // Use creator request ID from annotation for end-to-end traceability if available, + // otherwise generate a new one for this reconcile loop. 
if creatorReq := res.Annotations[v1alpha1.AnnotationCreatorRequestID]; creatorReq != "" { - logger = logger.WithValues("creatorReq", creatorReq) + ctx = WithGlobalRequestID(ctx, creatorReq) + } else { + ctx = WithNewGlobalRequestID(ctx) } + logger := LoggerFromContext(ctx).WithValues("component", "controller", "reservation", req.Name) // filter for CR reservations resourceName := "" diff --git a/internal/scheduling/reservations/commitments/e2e_checks.go b/internal/scheduling/reservations/commitments/e2e_checks.go new file mode 100644 index 000000000..6272979bf --- /dev/null +++ b/internal/scheduling/reservations/commitments/e2e_checks.go @@ -0,0 +1,77 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + + liquid "github.com/sapcc/go-api-declarations/liquid" + "github.com/sapcc/go-bits/must" +) + +const ( + // Default URL for the commitments API endpoint. + // This should match the service name in the helm chart. + defaultCommitmentsAPIURL = "http://cortex-nova-scheduler:8080" +) + +// E2EChecksConfig holds the configuration for CR e2e checks. +type E2EChecksConfig struct { + // Base URL for the commitments API. If empty, defaults to defaultCommitmentsAPIURL. + BaseURL string `json:"baseURL"` +} + +// CheckCommitmentsInfoEndpoint sends a GET request to the /v1/commitments/info endpoint +// and verifies that it returns HTTP 200 with a valid ServiceInfo response. 
+func CheckCommitmentsInfoEndpoint(ctx context.Context, config E2EChecksConfig) { + baseURL := config.BaseURL + if baseURL == "" { + baseURL = defaultCommitmentsAPIURL + } + apiURL := baseURL + "/v1/commitments/info" + slog.Info("checking commitments info endpoint", "apiURL", apiURL) + + httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodGet, apiURL, http.NoBody)) + httpReq.Header.Set("Accept", "application/json") + + //nolint:bodyclose // Body is closed in the deferred function below. + resp := must.Return(http.DefaultClient.Do(httpReq)) + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + bodyBytes := must.Return(io.ReadAll(resp.Body)) + slog.Error("commitments info API returned non-200 status code", + "statusCode", resp.StatusCode, + "responseBody", string(bodyBytes), + ) + panic(fmt.Sprintf("commitments info API returned status %d, expected 200", resp.StatusCode)) + } + + var serviceInfo liquid.ServiceInfo + if err := json.NewDecoder(resp.Body).Decode(&serviceInfo); err != nil { + panic(fmt.Sprintf("failed to decode ServiceInfo response: %v", err)) + } + + // Basic validation of the response + if serviceInfo.Version < 0 { + slog.Warn("commitments info returned version -1, knowledge may not be ready yet") + } + + slog.Info("commitments info endpoint check passed", + "version", serviceInfo.Version, + "resourceCount", len(serviceInfo.Resources), + ) +} + +// RunCommitmentsE2EChecks runs all e2e checks for the commitments API. 
+func RunCommitmentsE2EChecks(ctx context.Context, config E2EChecksConfig) { + slog.Info("running commitments e2e checks") + CheckCommitmentsInfoEndpoint(ctx, config) + slog.Info("all commitments e2e checks passed") +} diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index b7db11351..ace9488b2 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -42,12 +42,15 @@ type Syncer struct { CommitmentsClient // Kubernetes client for CRD operations client.Client + // Monitor for metrics + monitor *SyncerMonitor } -func NewSyncer(k8sClient client.Client) *Syncer { +func NewSyncer(k8sClient client.Client, monitor *SyncerMonitor) *Syncer { return &Syncer{ CommitmentsClient: NewCommitmentsClient(), Client: k8sClient, + monitor: monitor, } } @@ -58,7 +61,16 @@ func (s *Syncer) Init(ctx context.Context, config SyncerConfig) error { return nil } -func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavorGroups map[string]compute.FlavorGroupFeature) ([]*CommitmentState, error) { +// getCommitmentStatesResult holds both processed and skipped commitment UUIDs +type getCommitmentStatesResult struct { + // states are the commitments that were successfully processed + states []*CommitmentState + // skippedUUIDs are commitment UUIDs that were skipped (e.g., due to unit mismatch) + // but should NOT have their existing CRDs deleted + skippedUUIDs map[string]bool +} + +func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavorGroups map[string]compute.FlavorGroupFeature) (*getCommitmentStatesResult, error) { allProjects, err := s.ListProjects(ctx) if err != nil { return nil, err @@ -69,7 +81,10 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo } // Filter for compute commitments with RAM flavor group resources - var commitmentStates []*CommitmentState + 
result := &getCommitmentStatesResult{ + states: []*CommitmentState{}, + skippedUUIDs: make(map[string]bool), + } for id, commitment := range commitments { if commitment.ServiceType != "compute" { log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) @@ -99,6 +114,29 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo continue } + // Validate unit matches between Limes commitment and Cortex flavor group + // Expected format: " MiB" e.g. "131072 MiB" for 128 GiB + expectedUnit := fmt.Sprintf("%d MiB", flavorGroup.SmallestFlavor.MemoryMB) + if commitment.Unit != "" && commitment.Unit != expectedUnit { + // Unit mismatch: Limes has not yet updated this commitment to the new unit. + // Skip this commitment - trust what Cortex already has stored in CRDs. + // On the next sync cycle after Limes updates, this will be processed. + log.V(0).Info("WARNING: skipping commitment due to unit mismatch - Limes unit differs from Cortex flavor group, waiting for Limes to update", + "commitmentUUID", commitment.UUID, + "flavorGroup", flavorGroupName, + "limesUnit", commitment.Unit, + "expectedUnit", expectedUnit, + "smallestFlavorMemoryMB", flavorGroup.SmallestFlavor.MemoryMB) + if s.monitor != nil { + s.monitor.RecordUnitMismatch(flavorGroupName) + } + // Track skipped commitment so its existing CRDs won't be deleted + if commitment.UUID != "" { + result.skippedUUIDs[commitment.UUID] = true + } + continue + } + // Skip commitments with empty UUID if commitment.UUID == "" { log.Info("skipping commitment with empty UUID", @@ -121,10 +159,10 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo "amount", commitment.Amount, "totalMemoryBytes", state.TotalMemoryBytes) - commitmentStates = append(commitmentStates, state) + result.states = append(result.states, state) } - return commitmentStates, nil + return result, nil } // SyncReservations fetches commitments from Limes and synchronizes 
Reservation CRDs. @@ -158,7 +196,7 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { } // Get all commitments as states - commitmentStates, err := s.getCommitmentStates(ctx, logger, flavorGroups) + commitmentResult, err := s.getCommitmentStates(ctx, logger, flavorGroups) if err != nil { logger.Error(err, "failed to get compute commitments") return err @@ -168,7 +206,7 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { manager := NewReservationManager(s.Client) // Apply each commitment state using the manager - for _, state := range commitmentStates { + for _, state := range commitmentResult.states { logger.Info("applying commitment state", "commitmentUUID", state.CommitmentUUID, "projectID", state.ProjectID, @@ -194,11 +232,15 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { return err } - // Build set of commitment UUIDs we should have + // Build set of commitment UUIDs we should have (processed + skipped) activeCommitments := make(map[string]bool) - for _, state := range commitmentStates { + for _, state := range commitmentResult.states { activeCommitments[state.CommitmentUUID] = true } + // Also include skipped commitments - don't delete their CRDs + for uuid := range commitmentResult.skippedUUIDs { + activeCommitments[uuid] = true + } // Delete reservations for commitments that no longer exist for _, existing := range existingReservations.Items { @@ -221,6 +263,8 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { } } - logger.Info("synced reservations", "commitmentCount", len(commitmentStates)) + logger.Info("synced reservations", + "processedCount", len(commitmentResult.states), + "skippedCount", len(commitmentResult.skippedUUIDs)) return nil } diff --git a/internal/scheduling/reservations/commitments/syncer_monitor.go b/internal/scheduling/reservations/commitments/syncer_monitor.go new file mode 100644 index 000000000..0bbc7fe7f --- /dev/null +++ 
b/internal/scheduling/reservations/commitments/syncer_monitor.go @@ -0,0 +1,63 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// SyncerMonitor provides metrics for the commitment syncer. +type SyncerMonitor struct { + syncRuns prometheus.Counter + syncErrors prometheus.Counter + unitMismatch *prometheus.CounterVec +} + +// NewSyncerMonitor creates a new monitor with Prometheus metrics. +func NewSyncerMonitor() *SyncerMonitor { + m := &SyncerMonitor{ + syncRuns: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_runs_total", + Help: "Total number of commitment syncer runs", + }), + syncErrors: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_errors_total", + Help: "Total number of commitment syncer errors", + }), + unitMismatch: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_unit_mismatch_total", + Help: "Total number of commitments with unit mismatch between Limes and Cortex flavor group knowledge", + }, []string{"flavor_group"}), + } + return m +} + +// RecordUnitMismatch records a unit mismatch for a flavor group. +func (m *SyncerMonitor) RecordUnitMismatch(flavorGroup string) { + m.unitMismatch.WithLabelValues(flavorGroup).Inc() +} + +// RecordSyncRun records a syncer run. +func (m *SyncerMonitor) RecordSyncRun() { + m.syncRuns.Inc() +} + +// RecordSyncError records a syncer error. +func (m *SyncerMonitor) RecordSyncError() { + m.syncErrors.Inc() +} + +// Describe implements prometheus.Collector. +func (m *SyncerMonitor) Describe(ch chan<- *prometheus.Desc) { + m.syncRuns.Describe(ch) + m.syncErrors.Describe(ch) + m.unitMismatch.Describe(ch) +} + +// Collect implements prometheus.Collector. 
+func (m *SyncerMonitor) Collect(ch chan<- prometheus.Metric) { + m.syncRuns.Collect(ch) + m.syncErrors.Collect(ch) + m.unitMismatch.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index 75512299a..4c438ea40 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -429,6 +429,171 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { } } +func TestSyncer_SyncReservations_UnitMismatch(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + + // Create flavor group knowledge CRD with smallest flavor of 1024MB + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor-large", + LargestFlavorVCPUs: 8, + LargestFlavorMemoryMB: 8192, + SmallestFlavorName: "test-flavor-small", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). 
+ Build() + + // Create mock commitment with a unit that doesn't match Cortex's understanding + // Limes says "2048 MiB" but Cortex's smallest flavor is 1024 MB + mockCommitments := []Commitment{ + { + ID: 1, + UUID: "unit-mismatch-test-uuid", + ServiceType: "compute", + ResourceName: "ram_test_group_v1", + AvailabilityZone: "az1", + Amount: 2, + Unit: "2048 MiB", // Mismatched unit - should be "1024 MiB" + ProjectID: "test-project", + DomainID: "test-domain", + }, + } + + // Create monitor to capture the mismatch metric + monitor := NewSyncerMonitor() + + mockClient := &mockCommitmentsClient{ + listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) { + result := make(map[string]Commitment) + for _, c := range mockCommitments { + result[c.UUID] = c + } + return result, nil + }, + listProjectsFunc: func(ctx context.Context) ([]Project, error) { + return []Project{ + {ID: "test-project", DomainID: "test-domain", Name: "Test Project"}, + }, nil + }, + } + + syncer := &Syncer{ + CommitmentsClient: mockClient, + Client: k8sClient, + monitor: monitor, + } + + err := syncer.SyncReservations(context.Background()) + if err != nil { + t.Errorf("SyncReservations() error = %v", err) + return + } + + // Verify that NO reservations were created due to unit mismatch + // The commitment is skipped and Cortex trusts existing CRDs + var reservations v1alpha1.ReservationList + err = k8sClient.List(context.Background(), &reservations) + if err != nil { + t.Errorf("Failed to list reservations: %v", err) + return + } + + // Should have 0 reservations - commitment is skipped due to unit mismatch + // Cortex waits for Limes to update the unit before processing + if len(reservations.Items) != 0 { + t.Errorf("Expected 0 reservations (commitment skipped due to unit mismatch), got %d", len(reservations.Items)) + } +} + +func TestSyncer_SyncReservations_UnitMatch(t *testing.T) { + scheme := runtime.NewScheme() + if err := 
v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + + // Create flavor group knowledge CRD with smallest flavor of 1024MB + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor-large", + LargestFlavorVCPUs: 8, + LargestFlavorMemoryMB: 8192, + SmallestFlavorName: "test-flavor-small", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). + Build() + + // Create mock commitment with correct unit matching Cortex's smallest flavor + mockCommitments := []Commitment{ + { + ID: 1, + UUID: "unit-match-test-uuid", + ServiceType: "compute", + ResourceName: "ram_test_group_v1", + AvailabilityZone: "az1", + Amount: 2, + Unit: "1024 MiB", // Correct unit matching smallest flavor + ProjectID: "test-project", + DomainID: "test-domain", + }, + } + + monitor := NewSyncerMonitor() + + mockClient := &mockCommitmentsClient{ + listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) { + result := make(map[string]Commitment) + for _, c := range mockCommitments { + result[c.UUID] = c + } + return result, nil + }, + listProjectsFunc: func(ctx context.Context) ([]Project, error) { + return []Project{ + {ID: "test-project", DomainID: "test-domain", Name: "Test Project"}, + }, nil + }, + } + + syncer := &Syncer{ + CommitmentsClient: mockClient, + Client: k8sClient, + monitor: monitor, + } + + err := syncer.SyncReservations(context.Background()) + if err != nil { + t.Errorf("SyncReservations() error = %v", err) + return + } + + // Verify that reservations were created + var reservations v1alpha1.ReservationList + err = k8sClient.List(context.Background(), &reservations) + if err != nil { + t.Errorf("Failed to list reservations: %v", err) + return + } + + if len(reservations.Items) != 2 { + t.Errorf("Expected 2 
reservations, got %d", len(reservations.Items)) + } +} + func TestSyncer_SyncReservations_EmptyUUID(t *testing.T) { scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { diff --git a/internal/scheduling/reservations/scheduler_client.go b/internal/scheduling/reservations/scheduler_client.go index e89628b34..db7829e23 100644 --- a/internal/scheduling/reservations/scheduler_client.go +++ b/internal/scheduling/reservations/scheduler_client.go @@ -118,6 +118,7 @@ func (c *SchedulerClient) ScheduleReservation(ctx context.Context, req ScheduleR Context: api.NovaRequestContext{ RequestID: RequestIDFromContext(ctx), GlobalRequestID: globalReqID, + ProjectID: req.ProjectID, }, Spec: api.NovaObject[api.NovaSpec]{ Data: api.NovaSpec{ From b19510ad923d5b1aa621a43daec975c27ed2300b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 10:42:38 +0000 Subject: [PATCH 06/30] Bump cortex chart appVersions to sha-76321376 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 7d49f3bf4..9fff54a34 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.31 -appVersion: "sha-72e6a960" +appVersion: "sha-76321376" icon: "https://example.com/icon.png" dependencies: [] From aa037aefd773a990767bb7c29983b1806e92dde0 Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Wed, 25 Mar 2026 12:00:33 +0100 Subject: [PATCH 07/30] Commitment resources use new naming schema (#629) --- .../api_change_commitments_monitor_test.go | 12 +-- .../api_change_commitments_test.go | 78 +++++++++---------- .../reservations/commitments/api_info.go | 2 +- .../reservations/commitments/api_info_test.go | 24 +++--- .../commitments/api_report_capacity_test.go | 4 +- .../commitments/api_report_usage_test.go | 22 +++--- .../reservations/commitments/capacity.go | 4 +- .../reservations/commitments/state.go | 24 +++++- .../reservations/commitments/state_test.go | 32 ++++++-- .../reservations/commitments/syncer.go | 7 +- .../reservations/commitments/syncer_test.go | 10 +-- .../reservations/commitments/usage.go | 2 +- .../reservations/commitments/usage_test.go | 10 +-- 13 files changed, 132 insertions(+), 99 deletions(-) diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go index 255f05bd8..74fc0c938 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go @@ -172,24 +172,24 @@ func TestCountCommitments(t *testing.T) { { name: "Single commitment", request: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-1", "confirmed", 2), ), expected: 1, }, { name: "Multiple commitments same project", request: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2), - 
createCommitment("ram_hana_2", "project-A", "uuid-2", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-1", "confirmed", 2), + createCommitment("hw_version_hana_2_ram", "project-A", "uuid-2", "confirmed", 2), ), expected: 2, }, { name: "Multiple commitments multiple projects", request: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2), - createCommitment("ram_hana_1", "project-B", "uuid-2", "confirmed", 3), - createCommitment("ram_gp_1", "project-C", "uuid-3", "confirmed", 1), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-1", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-2", "confirmed", 3), + createCommitment("hw_version_gp_1_ram", "project-C", "uuid-3", "confirmed", 1), ), expected: 3, }, diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go index 6ceb8b65f..a30db33c5 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -55,7 +55,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, }, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("ram_hana_1", "project-A", "uuid-123", "confirmed", 2)), + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("hw_version_hana_1_ram", "project-A", "uuid-123", "confirmed", 2)), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}}, {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, @@ -67,7 +67,7 @@ func 
TestCommitmentChangeIntegration(t *testing.T) { VMs: []*TestVM{}, Flavors: []*TestFlavor{m1Small}, ExistingReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 3)), + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 3)), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, ExpectedReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"), @@ -78,7 +78,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Flavors: []*TestFlavor{m1Small}, ExistingReservations: []*TestReservation{}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", strings.Repeat("long-", 13), "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", strings.Repeat("long-", 13), "confirmed", 3), ), AvailableResources: &AvailableResources{}, ExpectedReservations: []*TestReservation{}, @@ -90,7 +90,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Flavors: []*TestFlavor{m1Small}, ExistingReservations: []*TestReservation{}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid with space", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid with space", "confirmed", 3), ), AvailableResources: &AvailableResources{}, ExpectedReservations: []*TestReservation{}, @@ -103,8 +103,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, {CommitmentID: "uuid-456", 
Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 0), - createCommitment("ram_hana_1", "project-B", "uuid-123", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-123", "confirmed", 2), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, ExpectedReservations: []*TestReservation{ @@ -119,8 +119,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 2), - createCommitment("ram_hana_1", "project-B", "uuid-123", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-123", "confirmed", 0), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, ExpectedReservations: []*TestReservation{ @@ -133,7 +133,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { // Greedy selection: 10GB request with 8/4/1GB flavors → picks 1×8GB + 2×1GB Flavors: []*TestFlavor{m1XL, m1Large, m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-binpack", "confirmed", 10), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-binpack", "confirmed", 10), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-binpack", Flavor: m1XL, ProjectID: "project-A"}, @@ -147,7 +147,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { // InfoVersion validation prevents 
stale requests (1233 vs 1234) Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1233, - createCommitment("ram_hana_1", "project-A", "uuid-version", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-version", "confirmed", 2), ), EnvInfoVersion: 1234, ExpectedReservations: []*TestReservation{}, @@ -161,8 +161,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-project-a", "confirmed", 2), - createCommitment("ram_hana_1", "project-B", "uuid-project-b", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-project-a", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-project-b", "confirmed", 2), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, ExpectedReservations: []*TestReservation{ @@ -180,8 +180,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "commitment-A", "confirmed", 0), - createCommitment("ram_hana_1", "project-B", "commitment-B", "confirmed", 6), + createCommitment("hw_version_hana_1_ram", "project-A", "commitment-A", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-B", "commitment-B", "confirmed", 6), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0}}, ExpectedReservations: []*TestReservation{ @@ -195,7 +195,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "New commitment creation - from zero to N reservations", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - 
createCommitment("ram_hana_1", "project-A", "uuid-new", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 3), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, @@ -208,7 +208,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "New commitment creation - large batch", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-new", "confirmed", 200), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 200), ), ExpectedReservations: func() []*TestReservation { var reservations []*TestReservation @@ -232,7 +232,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 4), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 4), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, @@ -249,7 +249,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 6), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 6), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, @@ -268,7 +268,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, 
ProjectID: "project-A", MemoryMB: 2048}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 3), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, @@ -287,7 +287,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-delete", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-delete", "confirmed", 0), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, @@ -304,7 +304,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-growth", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-growth", "confirmed", 3), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}}, @@ -317,8 +317,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Multi-project success - both projects succeed", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-a", "confirmed", 2), - createCommitment("ram_hana_1", "project-B", "uuid-b", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", 
"confirmed", 2), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"}, @@ -329,15 +329,15 @@ func TestCommitmentChangeIntegration(t *testing.T) { ExpectedAPIResponse: newAPIResponse(), }, { - Name: "Multiple flavor groups - ram_hana_1 and ram_hana_2", + Name: "Multiple flavor groups - hw_version_hana_1_ram and hw_version_hana_2_ram", // Amount in multiples of smallest flavor: hana_1 (2×1GB), hana_2 (2×2GB) Flavors: []*TestFlavor{ m1Small, {Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-hana1", "confirmed", 2), - createCommitment("ram_hana_2", "project-A", "uuid-hana2", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-hana1", "confirmed", 2), + createCommitment("hw_version_hana_2_ram", "project-A", "uuid-hana2", "confirmed", 2), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"}, @@ -351,7 +351,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Unknown flavor group - clear rejection message", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_nonexistent", "project-A", "uuid-unknown", "confirmed", 2), + createCommitment("hw_version_nonexistent_ram", "project-A", "uuid-unknown", "confirmed", 2), ), ExpectedReservations: []*TestReservation{}, ExpectedAPIResponse: newAPIResponse("flavor group not found"), @@ -366,9 +366,9 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-b", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-a", "confirmed", 0), - createCommitment("ram_hana_1", "project-B", "uuid-b", "confirmed", 0), - createCommitment("ram_hana_1", "project-C", 
"uuid-c", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-C", "uuid-c", "confirmed", 3), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0, "host-3": 0}}, ExpectedReservations: []*TestReservation{ @@ -388,7 +388,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-repair", Host: "host-4", Flavor: m1Small, ProjectID: "project-A", AZ: "wrong-az"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-repair", "confirmed", 8, "az-a"), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-repair", "confirmed", 8, "az-a"), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, @@ -410,7 +410,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Dry run request - feature not yet implemented", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", true, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-dryrun", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-dryrun", "confirmed", 2), ), ExpectedReservations: []*TestReservation{}, ExpectedAPIResponse: newAPIResponse("Dry run not supported"), @@ -419,7 +419,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Knowledge not ready - clear rejection with RetryAt", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-knowledge", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-knowledge", "confirmed", 2), ), ExpectedReservations: []*TestReservation{}, ExpectedAPIResponse: APIResponseExpectation{ @@ 
-432,7 +432,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "API disabled - returns 503 Service Unavailable", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-disabled", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-disabled", "confirmed", 2), ), CustomConfig: func() *Config { cfg := DefaultConfig() @@ -449,9 +449,9 @@ func TestCommitmentChangeIntegration(t *testing.T) { // Tests that multiple failed commitments are all mentioned in the rejection reason Flavors: []*TestFlavor{m1Small, m1Tiny}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-multi-fail-1", "confirmed", 3), - createCommitment("ram_hana_1", "project-B", "uuid-multi-fail-2", "confirmed", 3), - createCommitment("ram_gp_1", "project-C", "uuid-would-not-fail", "confirmed", 1), // would be rolled back, but not part of the reject reason + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-multi-fail-1", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-multi-fail-2", "confirmed", 3), + createCommitment("hw_version_gp_1_ram", "project-C", "uuid-would-not-fail", "confirmed", 1), // would be rolled back, but not part of the reject reason ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 256}}, ExpectedReservations: []*TestReservation{}, @@ -470,7 +470,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "commitment-1", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "commitment-1", "confirmed", 4), + createCommitment("hw_version_hana_1_ram", "project-A", "commitment-1", "confirmed", 4), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 1024}}, 
ExpectedReservations: []*TestReservation{ @@ -485,7 +485,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Watch timeout with custom config - triggers rollback with timeout error", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-timeout", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-timeout", "confirmed", 2), ), // With 0ms timeout, the watch will timeout immediately before reservations become ready CustomConfig: func() *Config { diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go index 7c8c72748..d25235d99 100644 --- a/internal/scheduling/reservations/commitments/api_info.go +++ b/internal/scheduling/reservations/commitments/api_info.go @@ -107,7 +107,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l // Build resources map resources := make(map[liquid.ResourceName]liquid.ResourceInfo) for groupName, groupData := range flavorGroups { - resourceName := liquid.ResourceName(commitmentResourceNamePrefix + groupName) + resourceName := liquid.ResourceName(ResourceNameFromFlavorGroup(groupName)) flavorNames := make([]string, 0, len(groupData.Flavors)) for _, flavor := range groupData.Flavors { diff --git a/internal/scheduling/reservations/commitments/api_info_test.go b/internal/scheduling/reservations/commitments/api_info_test.go index 4f42cd2a2..846d4c279 100644 --- a/internal/scheduling/reservations/commitments/api_info_test.go +++ b/internal/scheduling/reservations/commitments/api_info_test.go @@ -218,38 +218,38 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { t.Fatalf("expected 2 resources, got %d", len(serviceInfo.Resources)) } - // Test fixed ratio group: ram_hana_fixed - fixedResource, ok := serviceInfo.Resources["ram_hana_fixed"] + // Test fixed ratio group: 
hw_version_hana_fixed_ram + fixedResource, ok := serviceInfo.Resources["hw_version_hana_fixed_ram"] if !ok { - t.Fatal("expected ram_hana_fixed resource to exist") + t.Fatal("expected hw_version_hana_fixed_ram resource to exist") } if !fixedResource.HasCapacity { - t.Error("ram_hana_fixed: expected HasCapacity=true") + t.Error("hw_version_hana_fixed_ram: expected HasCapacity=true") } if !fixedResource.HandlesCommitments { - t.Error("ram_hana_fixed: expected HandlesCommitments=true (fixed ratio group)") + t.Error("hw_version_hana_fixed_ram: expected HandlesCommitments=true (fixed ratio group)") } if fixedResource.HasCapacity != fixedResource.HandlesCommitments { - t.Errorf("ram_hana_fixed: HasCapacity (%v) should equal HandlesCommitments (%v)", + t.Errorf("hw_version_hana_fixed_ram: HasCapacity (%v) should equal HandlesCommitments (%v)", fixedResource.HasCapacity, fixedResource.HandlesCommitments) } - // Test variable ratio group: ram_v2_variable - variableResource, ok := serviceInfo.Resources["ram_v2_variable"] + // Test variable ratio group: hw_version_v2_variable_ram + variableResource, ok := serviceInfo.Resources["hw_version_v2_variable_ram"] if !ok { - t.Fatal("expected ram_v2_variable resource to exist") + t.Fatal("expected hw_version_v2_variable_ram resource to exist") } // Variable ratio groups don't accept commitments, and we only report capacity for groups // that accept commitments, so both HasCapacity and HandlesCommitments should be false if variableResource.HasCapacity { - t.Error("ram_v2_variable: expected HasCapacity=false (variable ratio groups don't report capacity)") + t.Error("hw_version_v2_variable_ram: expected HasCapacity=false (variable ratio groups don't report capacity)") } if variableResource.HandlesCommitments { - t.Error("ram_v2_variable: expected HandlesCommitments=false (variable ratio group)") + t.Error("hw_version_v2_variable_ram: expected HandlesCommitments=false (variable ratio group)") } // Verify HasCapacity == HandlesCommitments 
for consistency if variableResource.HasCapacity != variableResource.HandlesCommitments { - t.Errorf("ram_v2_variable: HasCapacity (%v) should equal HandlesCommitments (%v)", + t.Errorf("hw_version_v2_variable_ram: HasCapacity (%v) should equal HandlesCommitments (%v)", variableResource.HasCapacity, variableResource.HandlesCommitments) } } diff --git a/internal/scheduling/reservations/commitments/api_report_capacity_test.go b/internal/scheduling/reservations/commitments/api_report_capacity_test.go index beb08c0b4..455127a8d 100644 --- a/internal/scheduling/reservations/commitments/api_report_capacity_test.go +++ b/internal/scheduling/reservations/commitments/api_report_capacity_test.go @@ -187,9 +187,9 @@ func TestCapacityCalculator(t *testing.T) { t.Fatalf("Expected 1 resource, got %d", len(report.Resources)) } - resource := report.Resources[liquid.ResourceName("ram_test-group")] + resource := report.Resources[liquid.ResourceName("hw_version_test-group_ram")] if resource == nil { - t.Fatal("Expected ram_test-group resource to exist") + t.Fatal("Expected hw_version_test-group_ram resource to exist") } // Should have empty perAZ map when no host details diff --git a/internal/scheduling/reservations/commitments/api_report_usage_test.go b/internal/scheduling/reservations/commitments/api_report_usage_test.go index 2a81fa468..21dddb766 100644 --- a/internal/scheduling/reservations/commitments/api_report_usage_test.go +++ b/internal/scheduling/reservations/commitments/api_report_usage_test.go @@ -57,7 +57,7 @@ func TestReportUsageIntegration(t *testing.T) { Reservations: []*UsageTestReservation{}, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": {Usage: 0, VMs: []ExpectedVMUsage{}}, }, @@ -77,7 +77,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + 
"hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, // 4096 MB / 1024 MB = 4 units @@ -99,7 +99,7 @@ func TestReportUsageIntegration(t *testing.T) { Reservations: []*UsageTestReservation{}, // No commitments AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, @@ -127,7 +127,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 12, // 12 units total @@ -157,7 +157,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 12, @@ -187,7 +187,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 13, // 1 + 4 + 8 = 13 units @@ -216,7 +216,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 8, @@ -243,7 +243,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a", "az-b"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, @@ -275,7 +275,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, // 4096 MB / 1024 MB = 4 units @@ -285,7 +285,7 @@ func 
TestReportUsageIntegration(t *testing.T) { }, }, }, - "ram_gp_1": { + "hw_version_gp_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, // 2048 MB / 512 MB = 4 units @@ -332,7 +332,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, diff --git a/internal/scheduling/reservations/commitments/capacity.go b/internal/scheduling/reservations/commitments/capacity.go index 1dff26b71..7726b1dbe 100644 --- a/internal/scheduling/reservations/commitments/capacity.go +++ b/internal/scheduling/reservations/commitments/capacity.go @@ -53,8 +53,8 @@ func (c *CapacityCalculator) CalculateCapacity(ctx context.Context) (liquid.Serv continue } - // Resource name follows pattern: ram_ - resourceName := liquid.ResourceName(commitmentResourceNamePrefix + groupName) + // Resource name follows pattern: hw_version__ram + resourceName := liquid.ResourceName(ResourceNameFromFlavorGroup(groupName)) // Calculate per-AZ capacity and usage azCapacity, err := c.calculateAZCapacity(ctx, groupName, groupData) diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go index c8e75ff83..4ed288d16 100644 --- a/internal/scheduling/reservations/commitments/state.go +++ b/internal/scheduling/reservations/commitments/state.go @@ -18,14 +18,30 @@ import ( // commitmentUUIDPattern validates commitment UUID format. var commitmentUUIDPattern = regexp.MustCompile(`^[a-zA-Z0-9-]{6,40}$`) -// Limes LIQUID resource naming convention: ram_ -const commitmentResourceNamePrefix = "ram_" +// Limes LIQUID resource naming convention: hw_version__ram +const ( + resourceNamePrefix = "hw_version_" + resourceNameSuffix = "_ram" +) + +// ResourceNameFromFlavorGroup creates a LIQUID resource name from a flavor group name. 
+// Format: hw_version__ram +func ResourceNameFromFlavorGroup(flavorGroup string) string { + return resourceNamePrefix + flavorGroup + resourceNameSuffix +} func getFlavorGroupNameFromResource(resourceName string) (string, error) { - if !strings.HasPrefix(resourceName, commitmentResourceNamePrefix) { + if !strings.HasPrefix(resourceName, resourceNamePrefix) || !strings.HasSuffix(resourceName, resourceNameSuffix) { return "", fmt.Errorf("invalid resource name: %s", resourceName) } - return strings.TrimPrefix(resourceName, commitmentResourceNamePrefix), nil + // Remove prefix and suffix + name := strings.TrimPrefix(resourceName, resourceNamePrefix) + name = strings.TrimSuffix(name, resourceNameSuffix) + // Validate that the extracted group name is not empty + if name == "" { + return "", fmt.Errorf("invalid resource name: %s (empty group name)", resourceName) + } + return name, nil } // CommitmentState represents desired or current commitment resource allocation. diff --git a/internal/scheduling/reservations/commitments/state_test.go b/internal/scheduling/reservations/commitments/state_test.go index 7060300db..717069708 100644 --- a/internal/scheduling/reservations/commitments/state_test.go +++ b/internal/scheduling/reservations/commitments/state_test.go @@ -36,7 +36,7 @@ func TestFromCommitment_CalculatesMemoryCorrectly(t *testing.T) { commitment := Commitment{ UUID: "test-uuid", ProjectID: "project-1", - ResourceName: "ram_test-group", + ResourceName: "hw_version_test-group_ram", Amount: 5, // 5 multiples of smallest flavor } @@ -236,7 +236,8 @@ func TestFromReservations_NonCommittedResourceType(t *testing.T) { } func TestGetFlavorGroupNameFromResource_Valid(t *testing.T) { - name, err := getFlavorGroupNameFromResource("ram_hana_medium_v2") + // Test valid resource names with underscores in flavor group + name, err := getFlavorGroupNameFromResource("hw_version_hana_medium_v2_ram") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -246,8 +247,29 @@ func 
TestGetFlavorGroupNameFromResource_Valid(t *testing.T) { } func TestGetFlavorGroupNameFromResource_Invalid(t *testing.T) { - _, err := getFlavorGroupNameFromResource("invalid_resource") - if err == nil { - t.Fatal("expected error for invalid resource name, got nil") + invalidCases := []string{ + "ram_2101", // old format + "invalid", // completely wrong + "hw_version__ram", // empty group name + "hw_version_2101", // missing suffix + } + for _, input := range invalidCases { + if _, err := getFlavorGroupNameFromResource(input); err == nil { + t.Errorf("expected error for %q, got nil", input) + } + } +} + +func TestResourceNameRoundTrip(t *testing.T) { + // Test that ResourceNameFromFlavorGroup and getFlavorGroupNameFromResource are inverses + for _, groupName := range []string{"2101", "hana_1", "hana_medium_v2"} { + resourceName := ResourceNameFromFlavorGroup(groupName) + recovered, err := getFlavorGroupNameFromResource(resourceName) + if err != nil { + t.Fatalf("round-trip failed for %q: %v", groupName, err) + } + if recovered != groupName { + t.Errorf("round-trip mismatch: %q -> %q -> %q", groupName, resourceName, recovered) + } } } diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index ace9488b2..d6588a3a8 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -6,7 +6,6 @@ package commitments import ( "context" "fmt" - "strings" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -90,12 +89,8 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) continue } - if !strings.HasPrefix(commitment.ResourceName, commitmentResourceNamePrefix) { - log.Info("skipping non-RAM-flavor-group commitment", "id", id, "resourceName", commitment.ResourceName) - continue - } - // Extract flavor group name 
from resource name + // Extract flavor group name from resource name (validates format: hw_version__ram) flavorGroupName, err := getFlavorGroupNameFromResource(commitment.ResourceName) if err != nil { log.Info("skipping commitment with invalid resource name", diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index 4c438ea40..ef773e676 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -221,7 +221,7 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "ram_test_group_v1", + ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", Amount: 2, // 2 multiples of smallest flavor (2 * 1024MB = 2048MB total) Unit: "", @@ -357,7 +357,7 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "ram_new_group_v1", + ResourceName: "hw_version_new_group_v1_ram", AvailabilityZone: "az1", Amount: 1, Unit: "", @@ -459,7 +459,7 @@ func TestSyncer_SyncReservations_UnitMismatch(t *testing.T) { ID: 1, UUID: "unit-mismatch-test-uuid", ServiceType: "compute", - ResourceName: "ram_test_group_v1", + ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", Amount: 2, Unit: "2048 MiB", // Mismatched unit - should be "1024 MiB" @@ -543,7 +543,7 @@ func TestSyncer_SyncReservations_UnitMatch(t *testing.T) { ID: 1, UUID: "unit-match-test-uuid", ServiceType: "compute", - ResourceName: "ram_test_group_v1", + ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", Amount: 2, Unit: "1024 MiB", // Correct unit matching smallest flavor @@ -623,7 +623,7 @@ func TestSyncer_SyncReservations_EmptyUUID(t *testing.T) { ID: 1, UUID: "", // Empty UUID ServiceType: "compute", - ResourceName: "ram_test_group_v1", + 
ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", Amount: 1, Unit: "", diff --git a/internal/scheduling/reservations/commitments/usage.go b/internal/scheduling/reservations/commitments/usage.go index b71d4e752..536440209 100644 --- a/internal/scheduling/reservations/commitments/usage.go +++ b/internal/scheduling/reservations/commitments/usage.go @@ -396,7 +396,7 @@ func (c *UsageCalculator) buildUsageResponse( if !FlavorGroupAcceptsCommitments(&groupData) { continue } - resourceName := liquid.ResourceName(commitmentResourceNamePrefix + flavorGroupName) + resourceName := liquid.ResourceName(ResourceNameFromFlavorGroup(flavorGroupName)) perAZ := make(map[liquid.AvailabilityZone]*liquid.AZResourceUsageReport) diff --git a/internal/scheduling/reservations/commitments/usage_test.go b/internal/scheduling/reservations/commitments/usage_test.go index 371667913..04b32fffa 100644 --- a/internal/scheduling/reservations/commitments/usage_test.go +++ b/internal/scheduling/reservations/commitments/usage_test.go @@ -53,7 +53,7 @@ func TestUsageCalculator_CalculateUsage(t *testing.T) { reservations: []*v1alpha1.Reservation{}, allAZs: []liquid.AvailabilityZone{"az-a"}, expectedUsage: map[string]uint64{ - "ram_hana_1": 0, + "hw_version_hana_1_ram": 0, }, }, { @@ -75,7 +75,7 @@ func TestUsageCalculator_CalculateUsage(t *testing.T) { }, allAZs: []liquid.AvailabilityZone{"az-a"}, expectedUsage: map[string]uint64{ - "ram_hana_1": 4, // 4096 MB / 1024 MB = 4 units + "hw_version_hana_1_ram": 4, // 4096 MB / 1024 MB = 4 units }, }, { @@ -92,7 +92,7 @@ func TestUsageCalculator_CalculateUsage(t *testing.T) { reservations: []*v1alpha1.Reservation{}, // No commitments allAZs: []liquid.AvailabilityZone{"az-a"}, expectedUsage: map[string]uint64{ - "ram_hana_1": 4, + "hw_version_hana_1_ram": 4, }, }, } @@ -584,9 +584,9 @@ func TestUsageCalculator_ExpiredAndFutureCommitments(t *testing.T) { } // Find the VM in subresources and check its commitment assignment - res, ok := 
report.Resources["ram_hana_1"] + res, ok := report.Resources["hw_version_hana_1_ram"] if !ok { - t.Fatal("Resource ram_hana_1 not found") + t.Fatal("Resource hw_version_hana_1_ram not found") } var foundCommitment any From 16f76e8724c1dcfdbdf64e55ad3bec5244389f4e Mon Sep 17 00:00:00 2001 From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com> Date: Wed, 25 Mar 2026 12:02:58 +0100 Subject: [PATCH 08/30] Add "Last Scheduled" column to history crd (#630) --- api/v1alpha1/history_types.go | 1 + helm/library/cortex/files/crds/cortex.cloud_histories.yaml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/api/v1alpha1/history_types.go b/api/v1alpha1/history_types.go index df5e3a65d..47a37dec6 100644 --- a/api/v1alpha1/history_types.go +++ b/api/v1alpha1/history_types.go @@ -93,6 +93,7 @@ type HistoryStatus struct { // +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" // +kubebuilder:printcolumn:name="Target Host",type="string",JSONPath=".status.current.targetHost" // +kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].reason" +// +kubebuilder:printcolumn:name="Last Scheduled",type="date",JSONPath=".status.current.timestamp" // +kubebuilder:printcolumn:name="Created",type="date",JSONPath=".metadata.creationTimestamp" // The history is a CRD that provides a record of past scheduling decisions for a given resource (e.g., a nova instance). 
diff --git a/helm/library/cortex/files/crds/cortex.cloud_histories.yaml b/helm/library/cortex/files/crds/cortex.cloud_histories.yaml index f91ad0cfc..cba52fa9b 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_histories.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_histories.yaml @@ -30,6 +30,9 @@ spec: - jsonPath: .status.conditions[?(@.type=='Ready')].reason name: Status type: string + - jsonPath: .status.current.timestamp + name: Last Scheduled + type: date - jsonPath: .metadata.creationTimestamp name: Created type: date From 8f8fa6a0e5ed627af72b2563e43a78057f4d93d7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 11:12:23 +0000 Subject: [PATCH 09/30] Bump cortex chart appVersions to sha-16f76e87 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 9fff54a34..7c2a2405f 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.31 -appVersion: "sha-76321376" +appVersion: "sha-16f76e87" icon: "https://example.com/icon.png" dependencies: [] From 8ed86c79b9d826f6256fe7ab70d8a0ab0a094553 Mon Sep 17 00:00:00 2001 From: Malte <140147670+umswmayj@users.noreply.github.com> Date: Wed, 25 Mar 2026 12:56:33 +0100 Subject: [PATCH 10/30] Disable history for kvm-new-failover-reservation reservation (#628) --- helm/bundles/cortex-nova/templates/pipelines_kvm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index 1913ff39a..445a7f4c1 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -291,7 +291,7 @@ spec: This is the pipeline used for KVM hypervisors (qemu and cloud-hypervisor). type: filter-weigher - createHistory: true + createHistory: false # Fetch all placement candidates, ignoring nova's preselection. ignorePreselection: true filters: From 1b0dddbeaa690127483c4dd5f9db8f73cc833584 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 12:05:57 +0000 Subject: [PATCH 11/30] Bump cortex chart appVersions to sha-8ed86c79 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 7c2a2405f..f4f4c9d54 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.31 -appVersion: "sha-16f76e87" +appVersion: "sha-8ed86c79" icon: "https://example.com/icon.png" dependencies: [] From 5bbde50409e045fe63819c0a620e9656ccf62c23 Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Wed, 25 Mar 2026 13:53:34 +0100 Subject: [PATCH 12/30] Committed resources syncer with more alerts (#631) --- .../cortex-nova/alerts/nova.alerts.yaml | 112 +++++++++++++++++ .../commitments/api_change_commitments.go | 8 +- .../commitments/reservation_manager.go | 62 +++++---- .../commitments/reservation_manager_test.go | 48 +++---- .../reservations/commitments/syncer.go | 53 +++++++- .../commitments/syncer_monitor.go | 119 +++++++++++++++--- 6 files changed, 335 insertions(+), 67 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index a69f3be51..784830aac 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -483,3 +483,115 @@ groups: The committed resource capacity API (Limes LIQUID integration) is experiencing high latency (p95 > 5s). This may indicate slow database queries or knowledge CRD retrieval. Limes scrapes may time out, affecting capacity reporting. + + # Committed Resource Syncer Alerts + - alert: CortexNovaCommittedResourceSyncerNotRunning + expr: increase(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}[2h]) == 0 + for: 5m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer not running" + description: > + The committed resource syncer has not run in the last 2 hours. This indicates + that the syncer may have stopped or is encountering errors. Check the syncer logs for errors. 
+ + - alert: CortexNovaCommittedResourceSyncerErrorsHigh + expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 + for: 5m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer experiencing errors" + description: > + The committed resource syncer has encountered multiple errors in the last hour. + This may indicate connectivity issues with Limes. Check the syncer logs for error details. + + - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh + expr: | + rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0.05 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unit mismatch rate >5%" + description: > + More than 5% of commitments are being skipped due to unit mismatches between + Limes and Cortex flavor groups. This happens when Limes has not yet been + updated to use the new unit format after a flavor group change. The affected + commitments will keep their existing reservations until Limes notices the update. + Check the logs if this error persists for longer time. 
+ + - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh + expr: | + rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unknown flavor group rate >0%" + description: > + Some commitments reference flavor groups that don't exist in + Cortex Knowledge (anymore). This may indicate that flavor group configuration is + out of sync between Limes and Cortex, or that Knowledge extraction is failing. + Check the flavor group Knowledge CRD and history to see what was changed. + + - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh + expr: | + ( + rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0.01 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer local change rate >1%" + description: > + More than 1% of synced commitments are requiring reservation changes + (creates, deletes, or repairs). This is higher than expected for steady-state + operation and may indicate data inconsistencies, external modifications to + reservations, or issues with the CRDs. Check Cortex logs for details. 
+ + - alert: CortexNovaCommittedResourceSyncerRepairRateHigh + expr: | + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer repair rate >0%" + description: > + Some commitments have reservations that needed repair + (wrong metadata like project ID or flavor group). This may indicate data + corruption, bugs in reservation creation, or external modifications. + Reservations are automatically repaired, but the root cause should be + investigated if this alert persists. diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 953319dbe..3281fc6ec 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -253,15 +253,15 @@ ProcessLoop: logger.V(1).Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldMemory", stateBefore.TotalMemoryBytes, "desiredMemory", stateDesired.TotalMemoryBytes) - touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi") + applyResult, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi") if err != nil { failedCommitments[string(commitment.UUID)] = "failed to apply commitment state" logger.Info("failed to apply commitment state for commitment", "commitmentUUID", commitment.UUID, "error", err) requireRollback = true break ProcessLoop } - logger.V(1).Info("applied commitment state change", "commitmentUUID", 
commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) - reservationsToWatch = append(reservationsToWatch, touchedReservations...) + logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(applyResult.TouchedReservations), "deletedReservations", len(applyResult.RemovedReservations)) + reservationsToWatch = append(reservationsToWatch, applyResult.TouchedReservations...) } } } @@ -305,7 +305,7 @@ ProcessLoop: for commitmentUUID, state := range statesBefore { // Rollback to statesBefore for this commitment logger.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state) - _, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback") + _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback") if err != nil { logger.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err) // continue with best effort rollback for other projects diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index 40cfa08ef..6db72d21a 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -17,6 +17,20 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +// ApplyResult contains the result of applying a commitment state. 
+type ApplyResult struct { + // Created is the number of reservations created + Created int + // Deleted is the number of reservations deleted + Deleted int + // Repaired is the number of reservations repaired (metadata sync or recreated due to wrong config) + Repaired int + // TouchedReservations are reservations that were created or updated + TouchedReservations []v1alpha1.Reservation + // RemovedReservations are reservations that were deleted + RemovedReservations []v1alpha1.Reservation +} + // ReservationManager handles CRUD operations for Reservation CRDs. type ReservationManager struct { client.Client @@ -42,14 +56,16 @@ func NewReservationManager(k8sClient client.Client) *ReservationManager { // - Deleting unused/excess slots when capacity decreases // - Syncing reservation metadata for all remaining slots // -// Returns touched reservations (created/updated) and removed reservations for caller tracking. +// Returns ApplyResult containing touched/removed reservations and counts for metrics. 
func (m *ReservationManager) ApplyCommitmentState( ctx context.Context, log logr.Logger, desiredState *CommitmentState, flavorGroups map[string]compute.FlavorGroupFeature, creator string, -) (touchedReservations, removedReservations []v1alpha1.Reservation, err error) { +) (*ApplyResult, error) { + + result := &ApplyResult{} log = log.WithName("ReservationManager") @@ -58,7 +74,7 @@ func (m *ReservationManager) ApplyCommitmentState( if err := m.List(ctx, &allReservations, client.MatchingLabels{ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, }); err != nil { - return nil, nil, fmt.Errorf("failed to list reservations: %w", err) + return nil, fmt.Errorf("failed to list reservations: %w", err) } // Filter by name prefix to find reservations for this commitment @@ -74,7 +90,7 @@ func (m *ReservationManager) ApplyCommitmentState( flavorGroup, exists := flavorGroups[desiredState.FlavorGroupName] if !exists { - return nil, nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) + return nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) } deltaMemoryBytes := desiredState.TotalMemoryBytes for _, res := range existing { @@ -90,7 +106,6 @@ func (m *ReservationManager) ApplyCommitmentState( // Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project) // They will be recreated with correct metadata in subsequent phases. 
var validReservations []v1alpha1.Reservation - var repairedCount int for _, res := range existing { if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName || res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID { @@ -101,13 +116,13 @@ func (m *ReservationManager) ApplyCommitmentState( "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, "expectedProjectID", desiredState.ProjectID, "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID) - repairedCount++ - removedReservations = append(removedReservations, res) + result.Repaired++ + result.RemovedReservations = append(result.RemovedReservations, res) memValue := res.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() if err := m.Delete(ctx, &res); err != nil { - return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) + return result, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) } } else { validReservations = append(validReservations, res) @@ -139,33 +154,33 @@ func (m *ReservationManager) ApplyCommitmentState( reservationToDelete = &existing[len(existing)-1] existing = existing[:len(existing)-1] // remove from existing list } - removedReservations = append(removedReservations, *reservationToDelete) + result.RemovedReservations = append(result.RemovedReservations, *reservationToDelete) + result.Deleted++ memValue := reservationToDelete.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() if err := m.Delete(ctx, reservationToDelete); err != nil { - return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) + return result, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) } } // Phase 5 (CREATE): Create new reservations (capacity increased) - var createdCount int for deltaMemoryBytes > 0 { // Need to create new 
reservation slots, always prefer largest flavor within the group // TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator) - touchedReservations = append(touchedReservations, *reservation) + result.TouchedReservations = append(result.TouchedReservations, *reservation) memValue := reservation.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes -= memValue.Value() - createdCount++ + result.Created++ if err := m.Create(ctx, reservation); err != nil { if apierrors.IsAlreadyExists(err) { - return touchedReservations, removedReservations, fmt.Errorf( + return result, fmt.Errorf( "reservation %s already exists (collision detected): %w", reservation.Name, err) } - return touchedReservations, removedReservations, fmt.Errorf( + return result, fmt.Errorf( "failed to create reservation slot %d: %w", nextSlotIndex, err) } @@ -177,24 +192,25 @@ func (m *ReservationManager) ApplyCommitmentState( for i := range existing { updated, err := m.syncReservationMetadata(ctx, log, &existing[i], desiredState) if err != nil { - return touchedReservations, removedReservations, err + return result, err } if updated != nil { - touchedReservations = append(touchedReservations, *updated) + result.TouchedReservations = append(result.TouchedReservations, *updated) + result.Repaired++ } } // Only log if there were actual changes - if hasChanges || createdCount > 0 || len(removedReservations) > 0 || repairedCount > 0 { + if hasChanges || result.Created > 0 || len(result.RemovedReservations) > 0 || result.Repaired > 0 { log.Info("commitment state sync completed", "commitmentUUID", desiredState.CommitmentUUID, - "created", createdCount, - "deleted", len(removedReservations), - "repaired", repairedCount, - "total", len(existing)+createdCount) + "created", result.Created, + "deleted", result.Deleted, + "repaired", result.Repaired, + "total", 
len(existing)+result.Created) } - return touchedReservations, removedReservations, nil + return result, nil } // syncReservationMetadata updates reservation metadata if it differs from desired state. diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go index 2f20f33cc..7733cb6c2 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager_test.go +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -42,7 +42,7 @@ func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { TotalMemoryBytes: 3 * 8192 * 1024 * 1024, } - touched, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -54,18 +54,18 @@ func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(removed) != 0 { - t.Errorf("expected 0 removed reservations, got %d", len(removed)) + if len(applyResult.RemovedReservations) != 0 { + t.Errorf("expected 0 applyResult.RemovedReservations reservations, got %d", len(applyResult.RemovedReservations)) } // Should create reservations to fulfill the commitment - if len(touched) == 0 { + if len(applyResult.TouchedReservations) == 0 { t.Fatal("expected at least one reservation to be created") } // Verify created reservations sum to desired state totalMemory := int64(0) - for _, res := range touched { + for _, res := range applyResult.TouchedReservations { memQuantity := res.Spec.Resources[hv1.ResourceMemory] totalMemory += memQuantity.Value() } @@ -142,7 +142,7 @@ func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { TotalMemoryBytes: 8 * 1024 * 1024 * 1024, } - _, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -158,7 +158,7 @@ func 
TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { // This is expected behavior based on the slot sizing algorithm // Should remove excess reservations - if len(removed) == 0 { + if len(applyResult.RemovedReservations) == 0 { t.Fatal("expected reservations to be removed") } @@ -481,7 +481,7 @@ func TestApplyCommitmentState_DeletionPriority(t *testing.T) { TotalMemoryBytes: tt.desiredMemoryBytes, } - _, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -493,12 +493,12 @@ func TestApplyCommitmentState_DeletionPriority(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(removed) != tt.expectedRemovedCount { - t.Fatalf("expected %d removed reservations, got %d", tt.expectedRemovedCount, len(removed)) + if len(applyResult.RemovedReservations) != tt.expectedRemovedCount { + t.Fatalf("expected %d removed reservations, got %d", tt.expectedRemovedCount, len(applyResult.RemovedReservations)) } if tt.validateRemoved != nil { - tt.validateRemoved(t, removed) + tt.validateRemoved(t, applyResult.RemovedReservations) } // Get remaining reservations @@ -560,7 +560,7 @@ func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { TotalMemoryBytes: 0, } - touched, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -572,13 +572,13 @@ func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(touched) != 0 { - t.Errorf("expected 0 new reservations, got %d", len(touched)) + if len(applyResult.TouchedReservations) != 0 { + t.Errorf("expected 0 new reservations, got %d", len(applyResult.TouchedReservations)) } // Should remove all reservations - if len(removed) != 1 { - t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + if len(applyResult.RemovedReservations) != 1 { + 
t.Fatalf("expected 1 removed reservation, got %d", len(applyResult.RemovedReservations)) } // Verify no reservations remain @@ -638,7 +638,7 @@ func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { TotalMemoryBytes: 8 * 1024 * 1024 * 1024, } - touched, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -651,18 +651,18 @@ func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { } // Should remove wrong reservation and create new one - if len(removed) != 1 { - t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + if len(applyResult.RemovedReservations) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(applyResult.RemovedReservations)) } - if len(touched) != 1 { - t.Fatalf("expected 1 new reservation, got %d", len(touched)) + if len(applyResult.TouchedReservations) != 1 { + t.Fatalf("expected 1 new reservation, got %d", len(applyResult.TouchedReservations)) } // Verify new reservation has correct flavor group - if touched[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" { + if applyResult.TouchedReservations[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" { t.Errorf("expected flavor group test-group, got %s", - touched[0].Spec.CommittedResourceReservation.ResourceGroup) + applyResult.TouchedReservations[0].Spec.CommittedResourceReservation.ResourceGroup) } } @@ -686,7 +686,7 @@ func TestApplyCommitmentState_UnknownFlavorGroup(t *testing.T) { TotalMemoryBytes: 8 * 1024 * 1024 * 1024, } - _, _, err := manager.ApplyCommitmentState( + _, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index d6588a3a8..cb3d2fdb4 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ 
b/internal/scheduling/reservations/commitments/syncer.go @@ -85,8 +85,16 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo skippedUUIDs: make(map[string]bool), } for id, commitment := range commitments { + // Record each commitment seen from Limes + if s.monitor != nil { + s.monitor.RecordCommitmentSeen() + } + if commitment.ServiceType != "compute" { log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonNonCompute) + } continue } @@ -97,6 +105,9 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo "id", id, "resourceName", commitment.ResourceName, "error", err) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonInvalidResource) + } continue } @@ -106,6 +117,9 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo log.Info("skipping commitment with unknown flavor group", "id", id, "flavorGroup", flavorGroupName) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonUnknownFlavorGroup) + } continue } @@ -136,6 +150,9 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo if commitment.UUID == "" { log.Info("skipping commitment with empty UUID", "id", id) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonEmptyUUID) + } continue } @@ -155,6 +172,11 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo "totalMemoryBytes", state.TotalMemoryBytes) result.states = append(result.states, state) + + // Record successfully processed commitment + if s.monitor != nil { + s.monitor.RecordCommitmentProcessed() + } } return result, nil @@ -171,6 +193,11 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { logger.Info("starting commitment sync with sync interval", "interval", DefaultSyncerConfig().SyncInterval) + // Record sync run + if s.monitor != nil { + 
s.monitor.RecordSyncRun() + } + // Check if flavor group knowledge is ready knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client} knowledgeCRD, err := knowledge.Get(ctx) @@ -201,6 +228,7 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { manager := NewReservationManager(s.Client) // Apply each commitment state using the manager + var totalCreated, totalDeleted, totalRepaired int for _, state := range commitmentResult.states { logger.Info("applying commitment state", "commitmentUUID", state.CommitmentUUID, @@ -208,13 +236,17 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { "flavorGroup", state.FlavorGroupName, "totalMemoryBytes", state.TotalMemoryBytes) - _, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue) + applyResult, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue) if err != nil { logger.Error(err, "failed to apply commitment state", "commitmentUUID", state.CommitmentUUID) // Continue with other commitments even if one fails continue } + + totalCreated += applyResult.Created + totalDeleted += applyResult.Deleted + totalRepaired += applyResult.Repaired } // Delete reservations that are no longer in commitments @@ -255,11 +287,28 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { logger.Info("deleted reservation for expired commitment", "name", existing.Name, "commitmentUUID", commitmentUUID) + totalDeleted++ + } + } + + // Record reservation change metrics + if s.monitor != nil { + if totalCreated > 0 { + s.monitor.RecordReservationsCreated(totalCreated) + } + if totalDeleted > 0 { + s.monitor.RecordReservationsDeleted(totalDeleted) + } + if totalRepaired > 0 { + s.monitor.RecordReservationsRepaired(totalRepaired) } } logger.Info("synced reservations", "processedCount", len(commitmentResult.states), - "skippedCount", len(commitmentResult.skippedUUIDs)) + "skippedCount", len(commitmentResult.skippedUUIDs), + "created", 
totalCreated, + "deleted", totalDeleted, + "repaired", totalRepaired) return nil } diff --git a/internal/scheduling/reservations/commitments/syncer_monitor.go b/internal/scheduling/reservations/commitments/syncer_monitor.go index 0bbc7fe7f..efa25b0a4 100644 --- a/internal/scheduling/reservations/commitments/syncer_monitor.go +++ b/internal/scheduling/reservations/commitments/syncer_monitor.go @@ -7,11 +7,30 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +// Skip reason labels for commitment processing +const ( + SkipReasonUnitMismatch = "unit_mismatch" + SkipReasonUnknownFlavorGroup = "unknown_flavor_group" + SkipReasonInvalidResource = "invalid_resource_name" + SkipReasonEmptyUUID = "empty_uuid" + SkipReasonNonCompute = "non_compute" +) + // SyncerMonitor provides metrics for the commitment syncer. type SyncerMonitor struct { - syncRuns prometheus.Counter - syncErrors prometheus.Counter - unitMismatch *prometheus.CounterVec + // Sync lifecycle + syncRuns prometheus.Counter + syncErrors prometheus.Counter + + // Commitment processing + commitmentsTotal prometheus.Counter // all commitments seen from Limes + commitmentsProcessed prometheus.Counter // successfully processed + commitmentsSkipped *prometheus.CounterVec // skipped with reason label + + // Reservation changes + reservationsCreated prometheus.Counter + reservationsDeleted prometheus.Counter + reservationsRepaired prometheus.Counter } // NewSyncerMonitor creates a new monitor with Prometheus metrics. 
@@ -25,17 +44,44 @@ func NewSyncerMonitor() *SyncerMonitor { Name: "cortex_committed_resource_syncer_errors_total", Help: "Total number of commitment syncer errors", }), - unitMismatch: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_committed_resource_syncer_unit_mismatch_total", - Help: "Total number of commitments with unit mismatch between Limes and Cortex flavor group knowledge", - }, []string{"flavor_group"}), + commitmentsTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_commitments_total", + Help: "Total number of commitments seen from Limes", + }), + commitmentsProcessed: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_commitments_processed_total", + Help: "Total number of commitments successfully processed", + }), + commitmentsSkipped: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_commitments_skipped_total", + Help: "Total number of commitments skipped during sync", + }, []string{"reason"}), + reservationsCreated: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_reservations_created_total", + Help: "Total number of reservations created during sync", + }), + reservationsDeleted: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_reservations_deleted_total", + Help: "Total number of reservations deleted during sync", + }), + reservationsRepaired: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_reservations_repaired_total", + Help: "Total number of reservations repaired during sync (wrong metadata)", + }), } - return m -} -// RecordUnitMismatch records a unit mismatch for a flavor group. 
-func (m *SyncerMonitor) RecordUnitMismatch(flavorGroup string) { - m.unitMismatch.WithLabelValues(flavorGroup).Inc() + // Pre-initialize skip reason labels + for _, reason := range []string{ + SkipReasonUnitMismatch, + SkipReasonUnknownFlavorGroup, + SkipReasonInvalidResource, + SkipReasonEmptyUUID, + SkipReasonNonCompute, + } { + m.commitmentsSkipped.WithLabelValues(reason) + } + + return m } // RecordSyncRun records a syncer run. @@ -48,16 +94,61 @@ func (m *SyncerMonitor) RecordSyncError() { m.syncErrors.Inc() } +// RecordCommitmentSeen records a commitment seen from Limes. +func (m *SyncerMonitor) RecordCommitmentSeen() { + m.commitmentsTotal.Inc() +} + +// RecordCommitmentProcessed records a commitment successfully processed. +func (m *SyncerMonitor) RecordCommitmentProcessed() { + m.commitmentsProcessed.Inc() +} + +// RecordCommitmentSkipped records a commitment skipped with a reason. +func (m *SyncerMonitor) RecordCommitmentSkipped(reason string) { + m.commitmentsSkipped.WithLabelValues(reason).Inc() +} + +// RecordUnitMismatch records a unit mismatch skip (convenience method). +func (m *SyncerMonitor) RecordUnitMismatch(_ string) { + m.commitmentsSkipped.WithLabelValues(SkipReasonUnitMismatch).Inc() +} + +// RecordReservationsCreated records reservations created. +func (m *SyncerMonitor) RecordReservationsCreated(count int) { + m.reservationsCreated.Add(float64(count)) +} + +// RecordReservationsDeleted records reservations deleted. +func (m *SyncerMonitor) RecordReservationsDeleted(count int) { + m.reservationsDeleted.Add(float64(count)) +} + +// RecordReservationsRepaired records reservations repaired. +func (m *SyncerMonitor) RecordReservationsRepaired(count int) { + m.reservationsRepaired.Add(float64(count)) +} + // Describe implements prometheus.Collector. 
func (m *SyncerMonitor) Describe(ch chan<- *prometheus.Desc) { m.syncRuns.Describe(ch) m.syncErrors.Describe(ch) - m.unitMismatch.Describe(ch) + m.commitmentsTotal.Describe(ch) + m.commitmentsProcessed.Describe(ch) + m.commitmentsSkipped.Describe(ch) + m.reservationsCreated.Describe(ch) + m.reservationsDeleted.Describe(ch) + m.reservationsRepaired.Describe(ch) } // Collect implements prometheus.Collector. func (m *SyncerMonitor) Collect(ch chan<- prometheus.Metric) { m.syncRuns.Collect(ch) m.syncErrors.Collect(ch) - m.unitMismatch.Collect(ch) + m.commitmentsTotal.Collect(ch) + m.commitmentsProcessed.Collect(ch) + m.commitmentsSkipped.Collect(ch) + m.reservationsCreated.Collect(ch) + m.reservationsDeleted.Collect(ch) + m.reservationsRepaired.Collect(ch) } From 60c140794aa4164ec8cf081025e1de4bff20f7c6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 13:02:15 +0000 Subject: [PATCH 13/30] Bump cortex chart appVersions to sha-5bbde504 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index f4f4c9d54..87940e0f8 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.31 -appVersion: "sha-8ed86c79" +appVersion: "sha-5bbde504" icon: "https://example.com/icon.png" dependencies: [] From c5930f285291db40d085f0e74c757b58c5c2b243 Mon Sep 17 00:00:00 2001 From: Malte <140147670+umswmayj@users.noreply.github.com> Date: Wed, 25 Mar 2026 14:03:03 +0100 Subject: [PATCH 14/30] Add attr to wide view of failover reservations (#627) Co-authored-by: mblos --- api/v1alpha1/reservation_types.go | 10 ++++++ .../files/crds/cortex.cloud_reservations.yaml | 36 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index 97950a395..e0f141ebc 100644 --- a/api/v1alpha1/reservation_types.go +++ b/api/v1alpha1/reservation_types.go @@ -211,6 +211,16 @@ type ReservationStatus struct { // +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".metadata.labels['reservations\\.cortex\\.cloud/type']" // +kubebuilder:printcolumn:name="Host",type="string",JSONPath=".status.host" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" +// +kubebuilder:printcolumn:name="ResourceGroup",type="string",JSONPath=".spec.committedResourceReservation.resourceGroup" +// +kubebuilder:printcolumn:name="Project",type="string",JSONPath=".spec.committedResourceReservation.projectID" +// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" +// +kubebuilder:printcolumn:name="StartTime",type="string",JSONPath=".spec.startTime",priority=1 +// +kubebuilder:printcolumn:name="EndTime",type="string",JSONPath=".spec.endTime" +// +kubebuilder:printcolumn:name="Resources",type="string",JSONPath=".spec.resources",priority=1 +// +kubebuilder:printcolumn:name="LastChanged",type="date",JSONPath=".status.failoverReservation.lastChanged",priority=1 +// +kubebuilder:printcolumn:name="AcknowledgedAt",type="date",JSONPath=".status.failoverReservation.acknowledgedAt",priority=1 +// 
+kubebuilder:printcolumn:name="CR Allocations",type="string",JSONPath=".status.committedResourceReservation.allocations",priority=1 +// +kubebuilder:printcolumn:name="HA Allocations",type="string",JSONPath=".status.failoverReservation.allocations",priority=1 // Reservation is the Schema for the reservations API type Reservation struct { diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml index ab622c1c0..64931bc4d 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml @@ -24,6 +24,42 @@ spec: - jsonPath: .status.conditions[?(@.type=='Ready')].status name: Ready type: string + - jsonPath: .spec.committedResourceReservation.resourceGroup + name: ResourceGroup + type: string + - jsonPath: .spec.committedResourceReservation.projectID + name: Project + type: string + - jsonPath: .spec.availabilityZone + name: AZ + type: string + - jsonPath: .spec.startTime + name: StartTime + priority: 1 + type: string + - jsonPath: .spec.endTime + name: EndTime + type: string + - jsonPath: .spec.resources + name: Resources + priority: 1 + type: string + - jsonPath: .status.failoverReservation.lastChanged + name: LastChanged + priority: 1 + type: date + - jsonPath: .status.failoverReservation.acknowledgedAt + name: AcknowledgedAt + priority: 1 + type: date + - jsonPath: .status.committedResourceReservation.allocations + name: CR Allocations + priority: 1 + type: string + - jsonPath: .status.failoverReservation.allocations + name: HA Allocations + priority: 1 + type: string name: v1alpha1 schema: openAPIV3Schema: From 364c4a1c8a8170a5900836ce9187cb23bb326714 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 13:12:36 +0000 Subject: [PATCH 15/30] Bump cortex chart appVersions to sha-c5930f28 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 87940e0f8..c298b38a0 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.31 -appVersion: "sha-5bbde504" +appVersion: "sha-c5930f28" icon: "https://example.com/icon.png" dependencies: [] From b3501102aa11dd6e124654a87434548809c63f86 Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Wed, 25 Mar 2026 14:25:37 +0100 Subject: [PATCH 16/30] Commitments base URL updated (#632) --- internal/scheduling/reservations/commitments/api.go | 8 ++++---- .../commitments/api_change_commitments_test.go | 2 +- .../reservations/commitments/api_report_capacity.go | 4 ++-- .../reservations/commitments/api_report_usage.go | 8 ++++---- .../reservations/commitments/api_report_usage_test.go | 2 +- .../scheduling/reservations/commitments/e2e_checks.go | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go index 3c6179d83..b35bece97 100644 --- a/internal/scheduling/reservations/commitments/api.go +++ b/internal/scheduling/reservations/commitments/api.go @@ -54,10 +54,10 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log registry.MustRegister(&api.usageMonitor) registry.MustRegister(&api.capacityMonitor) registry.MustRegister(&api.infoMonitor) - mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments) - mux.HandleFunc("/v1/commitments/report-capacity", api.HandleReportCapacity) - mux.HandleFunc("/v1/commitments/info", api.HandleInfo) - mux.HandleFunc("/v1/commitments/projects/", api.HandleReportUsage) // matches /v1/commitments/projects/:project_id/report-usage + mux.HandleFunc("/commitments/v1/change-commitments", api.HandleChangeCommitments) + 
mux.HandleFunc("/commitments/v1/report-capacity", api.HandleReportCapacity) + mux.HandleFunc("/commitments/v1/info", api.HandleInfo) + mux.HandleFunc("/commitments/v1/projects/", api.HandleReportUsage) // matches /commitments/v1/projects/:project_id/report-usage log.Info("commitments API initialized", "changeCommitmentsEnabled", api.config.EnableChangeCommitmentsAPI, diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go index a30db33c5..6fd5c17f0 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -1085,7 +1085,7 @@ func (env *CommitmentTestEnv) CallChangeCommitmentsAPI(reqJSON string) (resp liq }() // Make HTTP request - url := env.HTTPServer.URL + "/v1/commitments/change-commitments" + url := env.HTTPServer.URL + "/commitments/v1/change-commitments" httpResp, err := http.Post(url, "application/json", bytes.NewReader([]byte(reqJSON))) //nolint:gosec,noctx // test server URL, not user input if err != nil { cancel() diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go index 19b7fb24c..194106d92 100644 --- a/internal/scheduling/reservations/commitments/api_report_capacity.go +++ b/internal/scheduling/reservations/commitments/api_report_capacity.go @@ -14,7 +14,7 @@ import ( "github.com/sapcc/go-api-declarations/liquid" ) -// handles POST /v1/commitments/report-capacity requests from Limes: +// handles POST /commitments/v1/report-capacity requests from Limes: // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // Reports available capacity across all flavor group resources. 
Note, unit is specified in the Info API response with multiple of the smallest memory resource unit within a flavor group. @@ -38,7 +38,7 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) } ctx := reservations.WithGlobalRequestID(r.Context(), "committed-resource-"+requestID) - logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/commitments/report-capacity") + logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/commitments/v1/report-capacity") // Only accept POST method if r.Method != http.MethodPost { diff --git a/internal/scheduling/reservations/commitments/api_report_usage.go b/internal/scheduling/reservations/commitments/api_report_usage.go index 558758bab..53f210599 100644 --- a/internal/scheduling/reservations/commitments/api_report_usage.go +++ b/internal/scheduling/reservations/commitments/api_report_usage.go @@ -15,7 +15,7 @@ import ( "github.com/sapcc/go-api-declarations/liquid" ) -// HandleReportUsage implements POST /v1/commitments/projects/:project_id/report-usage from Limes LIQUID API. +// HandleReportUsage implements POST /commitments/v1/projects/:project_id/report-usage from Limes LIQUID API. // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/report_usage.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // @@ -51,7 +51,7 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) { } // Extract project UUID from URL path - // URL pattern: /v1/commitments/projects/:project_id/report-usage + // URL pattern: /commitments/v1/projects/:project_id/report-usage projectID, err := extractProjectIDFromPath(r.URL.Path) if err != nil { log.Error(err, "failed to extract project ID from path") @@ -99,9 +99,9 @@ func (api *HTTPAPI) recordUsageMetrics(statusCode int, startTime time.Time) { } // extractProjectIDFromPath extracts the project UUID from the URL path. 
-// Expected path format: /v1/commitments/projects/:project_id/report-usage +// Expected path format: /commitments/v1/projects/:project_id/report-usage func extractProjectIDFromPath(path string) (string, error) { - // Path: /v1/commitments/projects//report-usage + // Path: /commitments/v1/projects//report-usage parts := strings.Split(strings.Trim(path, "/"), "/") - // Expected: ["v1", "commitments", "projects", "", "report-usage"] + // Expected: ["commitments", "v1", "projects", "", "report-usage"] if len(parts) < 5 { diff --git a/internal/scheduling/reservations/commitments/api_report_usage_test.go b/internal/scheduling/reservations/commitments/api_report_usage_test.go index 21dddb766..26a55332d 100644 --- a/internal/scheduling/reservations/commitments/api_report_usage_test.go +++ b/internal/scheduling/reservations/commitments/api_report_usage_test.go @@ -576,7 +576,7 @@ func (env *UsageTestEnv) CallReportUsageAPI(projectID string, allAZs []string, u } // Build URL - url := env.HTTPServer.URL + "/v1/commitments/projects/" + projectID + "/report-usage" + url := env.HTTPServer.URL + "/commitments/v1/projects/" + projectID + "/report-usage" method := http.MethodPost if useGET { diff --git a/internal/scheduling/reservations/commitments/e2e_checks.go b/internal/scheduling/reservations/commitments/e2e_checks.go index 6272979bf..2292bcaa1 100644 --- a/internal/scheduling/reservations/commitments/e2e_checks.go +++ b/internal/scheduling/reservations/commitments/e2e_checks.go @@ -27,14 +27,14 @@ type E2EChecksConfig struct { BaseURL string `json:"baseURL"` } -// CheckCommitmentsInfoEndpoint sends a GET request to the /v1/commitments/info endpoint +// CheckCommitmentsInfoEndpoint sends a GET request to the /commitments/v1/info endpoint // and verifies that it returns HTTP 200 with a valid ServiceInfo response.
func CheckCommitmentsInfoEndpoint(ctx context.Context, config E2EChecksConfig) { baseURL := config.BaseURL if baseURL == "" { baseURL = defaultCommitmentsAPIURL } - apiURL := baseURL + "/v1/commitments/info" + apiURL := baseURL + "/commitments/v1/info" slog.Info("checking commitments info endpoint", "apiURL", apiURL) httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodGet, apiURL, http.NoBody)) From 079c9ca10e35b9458b95cb6c404476cd7eb47ee5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 13:35:12 +0000 Subject: [PATCH 17/30] Bump cortex chart appVersions to sha-b3501102 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index c298b38a0..760db0f01 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.31 -appVersion: "sha-c5930f28" +appVersion: "sha-b3501102" icon: "https://example.com/icon.png" dependencies: [] From f7d4b9a6586b33e1716b4c47f8d3053299f1b4d3 Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Wed, 25 Mar 2026 14:45:27 +0100 Subject: [PATCH 18/30] Added commitments quota endpoint (no-op) (#633) --- .../reservations/commitments/api.go | 16 +++++++- .../commitments/api_change_commitments.go | 4 +- .../reservations/commitments/api_info.go | 6 +-- .../reservations/commitments/api_info_test.go | 8 ++-- .../reservations/commitments/api_quota.go | 40 +++++++++++++++++++ .../commitments/api_report_capacity_test.go | 4 +- .../commitments/api_report_usage.go | 9 +++-- 7 files changed, 71 insertions(+), 16 deletions(-) create mode 100644 internal/scheduling/reservations/commitments/api_quota.go diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go index 
b35bece97..06fb97be1 100644 --- a/internal/scheduling/reservations/commitments/api.go +++ b/internal/scheduling/reservations/commitments/api.go @@ -6,6 +6,7 @@ package commitments import ( "context" "net/http" + "strings" "sync" "github.com/cobaltcore-dev/cortex/internal/scheduling/nova" @@ -57,10 +58,23 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log mux.HandleFunc("/commitments/v1/change-commitments", api.HandleChangeCommitments) mux.HandleFunc("/commitments/v1/report-capacity", api.HandleReportCapacity) mux.HandleFunc("/commitments/v1/info", api.HandleInfo) - mux.HandleFunc("/commitments/v1/projects/", api.HandleReportUsage) // matches /commitments/v1/projects/:project_id/report-usage + mux.HandleFunc("/commitments/v1/projects/", api.handleProjectEndpoint) // routes to report-usage or quota log.Info("commitments API initialized", "changeCommitmentsEnabled", api.config.EnableChangeCommitmentsAPI, "reportUsageEnabled", api.config.EnableReportUsageAPI, "reportCapacityEnabled", api.config.EnableReportCapacityAPI) } + +// handleProjectEndpoint routes /commitments/v1/projects/:project_id/... requests to the appropriate handler. 
+func (api *HTTPAPI) handleProjectEndpoint(w http.ResponseWriter, r *http.Request) { + path := r.URL.Path + switch { + case strings.HasSuffix(path, "/report-usage"): + api.HandleReportUsage(w, r) + case strings.HasSuffix(path, "/quota"): + api.HandleQuota(w, r) + default: + http.Error(w, "Not found", http.StatusNotFound) + } +} diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 3281fc6ec..5c2436267 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -36,7 +36,7 @@ func sortedKeys[K ~string, V any](m map[K]V) []K { return keys } -// implements POST /v1/change-commitments from Limes LIQUID API: +// implements POST /commitments/v1/change-commitments from Limes LIQUID API: // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // @@ -69,7 +69,7 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque defer api.changeMutex.Unlock() ctx := reservations.WithGlobalRequestID(context.Background(), "committed-resource-"+requestID) - logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/change-commitments") + logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/commitments/v1/change-commitments") // Only accept POST method if r.Method != http.MethodPost { diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go index d25235d99..9b2b8eb4a 100644 --- a/internal/scheduling/reservations/commitments/api_info.go +++ b/internal/scheduling/reservations/commitments/api_info.go @@ -22,7 +22,7 @@ import ( // errInternalServiceInfo indicates an internal error while building service info (e.g., invalid unit configuration) 
var errInternalServiceInfo = errors.New("internal error building service info") -// handles GET /v1/info requests from Limes: +// handles GET /commitments/v1/info requests from Limes: // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { @@ -38,7 +38,7 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { w.Header().Set("X-Request-ID", requestID) ctx := reservations.WithGlobalRequestID(r.Context(), "committed-resource-"+requestID) - logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/info") + logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/commitments/v1/info") // Only accept GET method if r.Method != http.MethodGet { @@ -152,7 +152,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l Unit: unit, // Non-standard unit: multiples of smallest flavor RAM Topology: liquid.AZAwareTopology, // Commitments are per-AZ NeedsResourceDemand: false, // Capacity planning out of scope for now - HasCapacity: handlesCommitments, // We report capacity via /v1/report-capacity only for groups that accept commitments + HasCapacity: handlesCommitments, // We report capacity via /commitments/v1/report-capacity only for groups that accept commitments HasQuota: false, // No quota enforcement as of now HandlesCommitments: handlesCommitments, // Only for groups with fixed RAM/core ratio Attributes: attrsJSON, diff --git a/internal/scheduling/reservations/commitments/api_info_test.go b/internal/scheduling/reservations/commitments/api_info_test.go index 846d4c279..256709957 100644 --- a/internal/scheduling/reservations/commitments/api_info_test.go +++ b/internal/scheduling/reservations/commitments/api_info_test.go @@ -30,7 +30,7 @@ func TestHandleInfo_KnowledgeNotReady(t *testing.T) { api := NewAPI(k8sClient) - 
req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) + req := httptest.NewRequest(http.MethodGet, "/commitments/v1/info", http.NoBody) w := httptest.NewRecorder() api.HandleInfo(w, req) @@ -63,7 +63,7 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) { api := NewAPI(k8sClient) // Use POST instead of GET - req := httptest.NewRequest(http.MethodPost, "/v1/info", http.NoBody) + req := httptest.NewRequest(http.MethodPost, "/commitments/v1/info", http.NoBody) w := httptest.NewRecorder() api.HandleInfo(w, req) @@ -124,7 +124,7 @@ func TestHandleInfo_InvalidFlavorMemory(t *testing.T) { api := NewAPI(k8sClient) - req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) + req := httptest.NewRequest(http.MethodGet, "/commitments/v1/info", http.NoBody) w := httptest.NewRecorder() api.HandleInfo(w, req) @@ -197,7 +197,7 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { api := NewAPI(k8sClient) - req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) + req := httptest.NewRequest(http.MethodGet, "/commitments/v1/info", http.NoBody) w := httptest.NewRecorder() api.HandleInfo(w, req) diff --git a/internal/scheduling/reservations/commitments/api_quota.go b/internal/scheduling/reservations/commitments/api_quota.go new file mode 100644 index 000000000..184728f39 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_quota.go @@ -0,0 +1,40 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "net/http" + + "github.com/google/uuid" +) + +// HandleQuota implements PUT /commitments/v1/projects/:project_id/quota from Limes LIQUID API. +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +// +// This is a no-op endpoint that accepts quota requests but doesn't store them. +// Cortex does not enforce quotas for committed resources - quota enforcement +// happens through commitment validation at change-commitments time. 
+// The endpoint exists for API compatibility with the LIQUID specification. +func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = uuid.New().String() + } + w.Header().Set("X-Request-ID", requestID) + + log := baseLog.WithValues("requestID", requestID, "endpoint", "quota") + + if r.Method != http.MethodPut { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + // No-op: Accept the quota request but don't store it + // Cortex handles capacity through commitments, not quotas + log.V(1).Info("received quota request (no-op)", "path", r.URL.Path) + + // Return 204 No Content as expected by the LIQUID API + w.WriteHeader(http.StatusNoContent) +} diff --git a/internal/scheduling/reservations/commitments/api_report_capacity_test.go b/internal/scheduling/reservations/commitments/api_report_capacity_test.go index 455127a8d..4dd642c0e 100644 --- a/internal/scheduling/reservations/commitments/api_report_capacity_test.go +++ b/internal/scheduling/reservations/commitments/api_report_capacity_test.go @@ -93,9 +93,9 @@ func TestHandleReportCapacity(t *testing.T) { if err != nil { t.Fatal(err) } - req = httptest.NewRequest(tt.method, "/v1/report-capacity", bytes.NewReader(bodyBytes)) + req = httptest.NewRequest(tt.method, "/commitments/v1/report-capacity", bytes.NewReader(bodyBytes)) } else { - req = httptest.NewRequest(tt.method, "/v1/report-capacity", http.NoBody) + req = httptest.NewRequest(tt.method, "/commitments/v1/report-capacity", http.NoBody) } req = req.WithContext(context.Background()) diff --git a/internal/scheduling/reservations/commitments/api_report_usage.go b/internal/scheduling/reservations/commitments/api_report_usage.go index 53f210599..f54917e1a 100644 --- a/internal/scheduling/reservations/commitments/api_report_usage.go +++ 
b/internal/scheduling/reservations/commitments/api_report_usage.go @@ -99,15 +99,16 @@ func (api *HTTPAPI) recordUsageMetrics(statusCode int, startTime time.Time) { } // extractProjectIDFromPath extracts the project UUID from the URL path. -// Expected path format: /commitments/v1/projects/:project_id/report-usage +// Expected path format: /commitments/v1/projects/:project_id/ +// where is "report-usage" or "quota" func extractProjectIDFromPath(path string) (string, error) { - // Path: /commitments/v1/projects//report-usage + // Path: /commitments/v1/projects// parts := strings.Split(strings.Trim(path, "/"), "/") - // Expected: ["v1", "commitments", "projects", "", "report-usage"] + // Expected: ["commitments", "v1", "projects", "", ""] if len(parts) < 5 { return "", fmt.Errorf("path too short: %s", path) } - if parts[2] != "projects" || parts[4] != "report-usage" { + if parts[0] != "commitments" || parts[1] != "v1" || parts[2] != "projects" { return "", fmt.Errorf("unexpected path format: %s", path) } projectID := parts[3] From 73c5d143eb826c5432b355a5d737cc08816d9509 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 13:54:49 +0000 Subject: [PATCH 19/30] Bump cortex chart appVersions to sha-f7d4b9a6 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 760db0f01..c31f0e822 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.31 -appVersion: "sha-b3501102" +appVersion: "sha-f7d4b9a6" icon: "https://example.com/icon.png" dependencies: [] From 18591fa97adb5a00ae6cb2937dbf23f476d520f1 Mon Sep 17 00:00:00 2001 From: mblos Date: Wed, 25 Mar 2026 14:58:50 +0100 Subject: [PATCH 20/30] Bump core to 0.0.32 and bundles to 0.0.45 * Added commitments quota endpoint (no-op) * Commitments base URL updated * Add attr to wide view of failover reservations * Committed resources syncer with more alerts * Disable history for kvm-new-failover-reservation reservation * Add "Last Scheduled" column to history crd * Commitment resources use new naming schema * Update commitments with support for non-standard units * Add multicluster support for the event recorder * Set SchedulerHints in failover request and removed incorrect prefix * Upgrade hypervisor crd and add aggregate metadata filter * Failover controller to support multicluster watches for reservations * Added commitment docs --- helm/bundles/cortex-cinder/Chart.yaml | 6 +++--- helm/bundles/cortex-crds/Chart.yaml | 4 ++-- helm/bundles/cortex-ironcore/Chart.yaml | 4 ++-- helm/bundles/cortex-manila/Chart.yaml | 6 +++--- helm/bundles/cortex-nova/Chart.yaml | 6 +++--- helm/bundles/cortex-pods/Chart.yaml | 4 ++-- helm/library/cortex/Chart.yaml | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index 4c4c5889b..428644f0a 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-cinder description: A Helm chart deploying Cortex for Cinder. 
type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index 0d59d0740..034340d59 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-crds description: A Helm chart deploying Cortex CRDs. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index 5f975675d..b1fea5199 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-ironcore description: A Helm chart deploying Cortex for IronCore. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. 
This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index c253c3da9..329533c08 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-manila description: A Helm chart deploying Cortex for Manila. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index e77dfe3b3..4f971cb64 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-nova description: A Helm chart deploying Cortex for Nova. 
type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index c44db2b25..a4077b180 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-pods description: A Helm chart deploying Cortex for Pods. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index c31f0e822..2f46d70fc 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: cortex description: A Helm chart to distribute cortex. 
type: application -version: 0.0.31 +version: 0.0.32 appVersion: "sha-f7d4b9a6" icon: "https://example.com/icon.png" dependencies: [] From 12a55fd18ae2730d3e72f6950b5a90ec4bd90261 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 14:11:40 +0000 Subject: [PATCH 21/30] Bump cortex chart appVersions to sha-18591fa9 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 2f46d70fc..f2a299e54 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.32 -appVersion: "sha-f7d4b9a6" +appVersion: "sha-18591fa9" icon: "https://example.com/icon.png" dependencies: [] From 3721350c842bb888bd0479a0d9ebac1129d87553 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Wed, 25 Mar 2026 15:18:37 +0100 Subject: [PATCH 22/30] Change log level of VM missing failover log --- internal/scheduling/reservations/failover/controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/scheduling/reservations/failover/controller.go b/internal/scheduling/reservations/failover/controller.go index f2175adc3..e31a4aa1c 100644 --- a/internal/scheduling/reservations/failover/controller.go +++ b/internal/scheduling/reservations/failover/controller.go @@ -684,7 +684,7 @@ func (c *FailoverReservationController) calculateVMsMissingFailover( needed := requiredCount - currentCount totalReservationsNeeded += needed - logger.V(2).Info("VM needs more failover reservations", + logger.V(1).Info("VM needs more failover reservations", "vmUUID", vm.UUID, "flavorName", vm.FlavorName, "currentCount", currentCount, From e4a04318ea55c5c5d274abec6a434353e1ce182a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 14:29:05 +0000 Subject: [PATCH 23/30] Bump cortex chart appVersions to sha-3721350c 
[skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index f2a299e54..ff3de5733 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.32 -appVersion: "sha-18591fa9" +appVersion: "sha-3721350c" icon: "https://example.com/icon.png" dependencies: [] From 30995c2e919e6a86d9b528bebb5f6559762bb1a6 Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:58:24 +0100 Subject: [PATCH 24/30] fix test flakiness (#636) --- .../api_change_commitments_test.go | 36 ++++++++++++++++--- .../reservations/commitments/syncer_test.go | 13 +++++-- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go index 6fd5c17f0..c304d9e5a 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -615,8 +615,16 @@ func (tfg TestFlavorGroup) ToFlavorGroupsKnowledge() FlavorGroupsKnowledge { groupMap[groupName] = append(groupMap[groupName], flavor) } + // Sort group names for deterministic iteration + sortedGroupNames := make([]string, 0, len(groupMap)) + for groupName := range groupMap { + sortedGroupNames = append(sortedGroupNames, groupName) + } + sort.Strings(sortedGroupNames) + var groups []compute.FlavorGroupFeature - for groupName, groupFlavors := range groupMap { + for _, groupName := range sortedGroupNames { + groupFlavors := groupMap[groupName] if len(groupFlavors) == 0 { continue } @@ -1765,6 +1773,7 @@ func newAPIResponse(rejectReasonSubstrings ...string) APIResponseExpectation { // buildRequestJSON converts a test 
CommitmentChangeRequest to JSON string. // Builds the nested JSON structure directly for simplicity. +// Uses sorted iteration to ensure deterministic JSON output. func buildRequestJSON(req CommitmentChangeRequest) string { // Group commitments by project and resource for nested structure type projectResources map[liquid.ResourceName][]TestCommitment @@ -1780,11 +1789,30 @@ func buildRequestJSON(req CommitmentChangeRequest) string { ) } - // Build nested JSON structure + // Sort projects for deterministic iteration + sortedProjects := make([]string, 0, len(byProject)) + for projectID := range byProject { + sortedProjects = append(sortedProjects, projectID) + } + sort.Strings(sortedProjects) + + // Build nested JSON structure with sorted iteration var projectParts []string - for projectID, resources := range byProject { + for _, projectID := range sortedProjects { + resources := byProject[projectID] + + // Sort resource names for deterministic iteration + sortedResources := make([]liquid.ResourceName, 0, len(resources)) + for resourceName := range resources { + sortedResources = append(sortedResources, resourceName) + } + sort.Slice(sortedResources, func(i, j int) bool { + return string(sortedResources[i]) < string(sortedResources[j]) + }) + var resourceParts []string - for resourceName, commits := range resources { + for _, resourceName := range sortedResources { + commits := resources[resourceName] var commitParts []string for _, c := range commits { expiryTime := time.Now().Add(time.Duration(defaultCommitmentExpiryYears) * 365 * 24 * time.Hour) diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index ef773e676..b09115453 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -5,6 +5,7 @@ package commitments import ( "context" + "sort" "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ 
-32,9 +33,17 @@ type FlavorGroupData struct { func createFlavorGroupKnowledge(t *testing.T, groups map[string]FlavorGroupData) *v1alpha1.Knowledge { t.Helper() - // Build flavor group features + // Sort group names for deterministic iteration + sortedGroupNames := make([]string, 0, len(groups)) + for groupName := range groups { + sortedGroupNames = append(sortedGroupNames, groupName) + } + sort.Strings(sortedGroupNames) + + // Build flavor group features with sorted iteration features := make([]compute.FlavorGroupFeature, 0, len(groups)) - for groupName, data := range groups { + for _, groupName := range sortedGroupNames { + data := groups[groupName] features = append(features, compute.FlavorGroupFeature{ Name: groupName, Flavors: []compute.FlavorInGroup{ From ecd97fb5ad67afeaed79ff6dc7f7800df31a0848 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 15:08:35 +0000 Subject: [PATCH 25/30] Bump cortex chart appVersions to sha-30995c2e [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index ff3de5733..6c8de0160 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.32 -appVersion: "sha-3721350c" +appVersion: "sha-30995c2e" icon: "https://example.com/icon.png" dependencies: [] From f213f2170c3ba0c9f2c01e8873c31aded8c1156d Mon Sep 17 00:00:00 2001 From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com> Date: Wed, 25 Mar 2026 16:40:48 +0100 Subject: [PATCH 26/30] Fix history client tests (#635) --- .../scheduling/lib/history_client_test.go | 61 +------------------ 1 file changed, 2 insertions(+), 59 deletions(-) diff --git a/internal/scheduling/lib/history_client_test.go b/internal/scheduling/lib/history_client_test.go index d9e919cb7..56572f3eb 100644 --- a/internal/scheduling/lib/history_client_test.go +++ b/internal/scheduling/lib/history_client_test.go @@ -8,9 +8,7 @@ import ( "errors" "fmt" "strings" - "sync" "testing" - "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" testlib "github.com/cobaltcore-dev/cortex/pkg/testing" @@ -207,7 +205,7 @@ func newTestScheme(t *testing.T) *runtime.Scheme { return scheme } -func TestHistoryManager_Upsert(t *testing.T) { +func TestHistoryClient_CreateOrUpdateHistory(t *testing.T) { tests := []struct { name string // setup returns a fake client and any pre-existing objects. @@ -563,62 +561,7 @@ func TestHistoryManager_Upsert(t *testing.T) { } } -func TestHistoryManager_UpsertFromGoroutine(t *testing.T) { - c := fake.NewClientBuilder(). - WithScheme(newTestScheme(t)). - WithStatusSubresource(&v1alpha1.History{}). - Build() - hm := HistoryClient{Client: c} - - decision := &v1alpha1.Decision{ - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "async-uuid", - PipelineRef: corev1.ObjectReference{Name: "nova-pipeline"}, - Intent: v1alpha1.SchedulingIntentUnknown, - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: testlib.Ptr("compute-async"), - }, - }, - } - - // Mirrors the pattern used in pipeline controllers. 
- var wg sync.WaitGroup - var upsertErr error - wg.Go(func() { - upsertErr = hm.CreateOrUpdateHistory(context.Background(), decision, nil, nil) - }) - - // Poll for history creation. - var histories v1alpha1.HistoryList - deadline := time.Now().Add(2 * time.Second) - for { - if err := c.List(context.Background(), &histories); err != nil { - t.Fatalf("Failed to list histories: %v", err) - } - if len(histories.Items) > 0 { - break - } - if time.Now().After(deadline) { - t.Fatal("timed out waiting for async history creation") - } - time.Sleep(5 * time.Millisecond) - } - - wg.Wait() - if upsertErr != nil { - t.Fatalf("Upsert() returned error: %v", upsertErr) - } - - got := histories.Items[0].Status.Current.TargetHost - if got == nil || *got != "compute-async" { - t.Errorf("target host = %v, want %q", got, "compute-async") - } -} - -func TestHistoryManager_Delete(t *testing.T) { +func TestHistoryClient_Delete(t *testing.T) { tests := []struct { name string domain v1alpha1.SchedulingDomain From 4d30ee73ca5ff2ad4d856b0bf1d8ed6b3fabf970 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 15:50:18 +0000 Subject: [PATCH 27/30] Bump cortex chart appVersions to sha-f213f217 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 6c8de0160..5e1133c25 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. 
type: application version: 0.0.32 -appVersion: "sha-30995c2e" +appVersion: "sha-f213f217" icon: "https://example.com/icon.png" dependencies: [] From d1eafecc64df21a53655fa74d1c25b6fd0682c9e Mon Sep 17 00:00:00 2001 From: Malte <140147670+umswmayj@users.noreply.github.com> Date: Wed, 25 Mar 2026 17:24:24 +0100 Subject: [PATCH 28/30] Support concatenated aggregates in filter requested destination (#637) --- .../filters/filter_requested_destination.go | 35 +++++--- .../filter_requested_destination_test.go | 87 ++++++++++++++++++- 2 files changed, 108 insertions(+), 14 deletions(-) diff --git a/internal/scheduling/nova/plugins/filters/filter_requested_destination.go b/internal/scheduling/nova/plugins/filters/filter_requested_destination.go index c6d307b78..dd1804dc8 100644 --- a/internal/scheduling/nova/plugins/filters/filter_requested_destination.go +++ b/internal/scheduling/nova/plugins/filters/filter_requested_destination.go @@ -7,6 +7,7 @@ import ( "context" "log/slog" "slices" + "strings" api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" @@ -38,8 +39,11 @@ type FilterRequestedDestinationStep struct { } // processRequestedAggregates filters hosts based on the requested aggregates. -// It removes hosts that are not part of any of the requested aggregates, -// respecting the IgnoredAggregates option. Returns early without filtering +// The aggregates list uses AND logic between elements, meaning a host must match +// ALL elements to pass. Each element can contain comma-separated UUIDs which use +// OR logic, meaning the host only needs to match ONE of the UUIDs in that group. +// Example: ["agg1", "agg2,agg3"] means host must be in agg1 AND (agg2 OR agg3). +// Respects the IgnoredAggregates option and returns early without filtering // if all requested aggregates are in the ignored list. 
func (s *FilterRequestedDestinationStep) processRequestedAggregates( traceLog *slog.Logger, @@ -51,13 +55,12 @@ func (s *FilterRequestedDestinationStep) processRequestedAggregates( if len(aggregates) == 0 { return } + // Filter out ignored aggregates aggregatesToConsider := make([]string, 0, len(aggregates)) for _, agg := range aggregates { - if slices.Contains(s.Options.IgnoredAggregates, agg) { - traceLog.Info("ignoring aggregate in requested_destination as it is in the ignored list", "aggregate", agg) - continue + if !slices.Contains(s.Options.IgnoredAggregates, agg) { + aggregatesToConsider = append(aggregatesToConsider, agg) } - aggregatesToConsider = append(aggregatesToConsider, agg) } if len(aggregatesToConsider) == 0 { traceLog.Info("all aggregates in requested_destination are in the ignored list, skipping aggregate filtering") @@ -74,14 +77,24 @@ func (s *FilterRequestedDestinationStep) processRequestedAggregates( for _, agg := range hv.Status.Aggregates { hvAggregateUUIDs = append(hvAggregateUUIDs, agg.UUID) } - found := false - for _, reqAgg := range aggregatesToConsider { - if slices.Contains(hvAggregateUUIDs, reqAgg) { - found = true + // All outer elements must match (AND logic) + // Each element can be comma-separated UUIDs (OR logic within the group) + allMatch := true + for _, reqAggGroup := range aggregatesToConsider { + reqAggs := strings.Split(reqAggGroup, ",") + groupMatch := false + for _, reqAgg := range reqAggs { + if slices.Contains(hvAggregateUUIDs, reqAgg) { + groupMatch = true + break + } + } + if !groupMatch { + allMatch = false break } } - if !found { + if !allMatch { delete(activations, host) traceLog.Info( "filtered out host not in requested_destination aggregates", diff --git a/internal/scheduling/nova/plugins/filters/filter_requested_destination_test.go b/internal/scheduling/nova/plugins/filters/filter_requested_destination_test.go index b5307efd8..a2b7e1055 100644 --- 
a/internal/scheduling/nova/plugins/filters/filter_requested_destination_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_requested_destination_test.go @@ -200,13 +200,14 @@ func TestFilterRequestedDestinationStep_Run(t *testing.T) { expectErr: false, }, { - name: "Filter by multiple aggregates", + name: "Filter by multiple aggregates with OR syntax (comma-separated)", request: api.ExternalSchedulerRequest{ Spec: api.NovaObject[api.NovaSpec]{ Data: api.NovaSpec{ RequestedDestination: &api.NovaObject[api.NovaRequestedDestination]{ Data: api.NovaRequestedDestination{ - Aggregates: []string{"aggregate1", "aggregate3"}, + // "aggregate1,aggregate3" means host must be in aggregate1 OR aggregate3 + Aggregates: []string{"aggregate1,aggregate3"}, }, }, }, @@ -241,12 +242,13 @@ func TestFilterRequestedDestinationStep_Run(t *testing.T) { expectErr: false, }, { - name: "Filter by aggregates - hosts in both spec and status", + name: "Filter by multiple aggregates with AND logic", request: api.ExternalSchedulerRequest{ Spec: api.NovaObject[api.NovaSpec]{ Data: api.NovaSpec{ RequestedDestination: &api.NovaObject[api.NovaRequestedDestination]{ Data: api.NovaRequestedDestination{ + // ["aggregate1", "aggregate2"] means host must be in aggregate1 AND aggregate2 Aggregates: []string{"aggregate1", "aggregate2"}, }, }, @@ -258,6 +260,85 @@ func TestFilterRequestedDestinationStep_Run(t *testing.T) { {ComputeHost: "host3"}, }, }, + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: metav1.ObjectMeta{Name: "host1"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate1"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host2"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate1"}, {UUID: "aggregate2"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host3"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate2"}}}, + }, + }, + expectedHosts: []string{"host2"}, + filteredHosts: []string{"host1", 
"host3"}, + expectErr: false, + }, + { + name: "Filter by mixed AND/OR aggregates", + request: api.ExternalSchedulerRequest{ + Spec: api.NovaObject[api.NovaSpec]{ + Data: api.NovaSpec{ + RequestedDestination: &api.NovaObject[api.NovaRequestedDestination]{ + Data: api.NovaRequestedDestination{ + // Host must be in (aggregate1 OR aggregate2) AND aggregate3 + Aggregates: []string{"aggregate1,aggregate2", "aggregate3"}, + }, + }, + }, + }, + Hosts: []api.ExternalSchedulerHost{ + {ComputeHost: "host1"}, + {ComputeHost: "host2"}, + {ComputeHost: "host3"}, + {ComputeHost: "host4"}, + }, + }, + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: metav1.ObjectMeta{Name: "host1"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate1"}, {UUID: "aggregate3"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host2"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate2"}, {UUID: "aggregate3"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host3"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate1"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host4"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate3"}}}, + }, + }, + expectedHosts: []string{"host1", "host2"}, + filteredHosts: []string{"host3", "host4"}, + expectErr: false, + }, + { + name: "Filter by aggregates with OR syntax - hosts in both spec and status", + request: api.ExternalSchedulerRequest{ + Spec: api.NovaObject[api.NovaSpec]{ + Data: api.NovaSpec{ + RequestedDestination: &api.NovaObject[api.NovaRequestedDestination]{ + Data: api.NovaRequestedDestination{ + // "aggregate1,aggregate2" means host must be in aggregate1 OR aggregate2 + Aggregates: []string{"aggregate1,aggregate2"}, + }, + }, + }, + }, + Hosts: []api.ExternalSchedulerHost{ + {ComputeHost: "host1"}, + {ComputeHost: "host2"}, + {ComputeHost: "host3"}, + }, + }, hypervisors: []hv1.Hypervisor{ { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, From 
627009475bbb16739ac6e54eead5c083157bd2bc Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Wed, 25 Mar 2026 17:24:35 +0100 Subject: [PATCH 29/30] Committed resource watch fixed (#638) --- .../reservations/commitments/controller.go | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index 71530cb59..4a318d882 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -16,6 +16,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/predicate" @@ -502,10 +503,22 @@ func (r *CommitmentReservationController) SetupWithManager(mgr ctrl.Manager, mcl })); err != nil { return err } - return multicluster.BuildController(mcl, mgr). - For(&v1alpha1.Reservation{}). - WithEventFilter(commitmentReservationPredicate). - Named("commitment-reservation"). + + // Use WatchesMulticluster to watch Reservations across all configured clusters + // (home + remotes). This is required because Reservation CRDs may be stored + // in remote clusters, not just the home cluster. Without this, the controller + // would only see reservations in the home cluster's cache. + bldr := multicluster.BuildController(mcl, mgr) + bldr, err := bldr.WatchesMulticluster( + &v1alpha1.Reservation{}, + &handler.EnqueueRequestForObject{}, + commitmentReservationPredicate, + ) + if err != nil { + return err + } + + return bldr.Named("commitment-reservation"). WithOptions(controller.Options{ // We want to process reservations one at a time to avoid overbooking. 
MaxConcurrentReconciles: 1, From 97f76d5c937bb710437d9ec5e28657ce7ae24f7c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Mar 2026 16:35:30 +0000 Subject: [PATCH 30/30] Bump cortex chart appVersions to sha-62700947 [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 5e1133c25..c836985a0 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.32 -appVersion: "sha-f213f217" +appVersion: "sha-62700947" icon: "https://example.com/icon.png" dependencies: []