diff --git a/Tiltfile b/Tiltfile index a42fe43f4..6871d18b3 100644 --- a/Tiltfile +++ b/Tiltfile @@ -189,6 +189,13 @@ if 'nova' in ACTIVE_DEPLOYMENTS: trigger_mode=TRIGGER_MODE_MANUAL, auto_init=False, ) + local_resource( + 'Commitments E2E Tests', + '/bin/sh -c "kubectl exec deploy/cortex-nova-scheduling-controller-manager -- /manager e2e-commitments"', + labels=['Cortex-Nova'], + trigger_mode=TRIGGER_MODE_MANUAL, + auto_init=False, + ) if 'manila' in ACTIVE_DEPLOYMENTS: print("Activating Cortex Manila bundle") diff --git a/api/external/nova/messages.go b/api/external/nova/messages.go index 15c652ba4..a0a5a8616 100644 --- a/api/external/nova/messages.go +++ b/api/external/nova/messages.go @@ -138,6 +138,8 @@ const ( EvacuateIntent v1alpha1.SchedulingIntent = "evacuate" // CreateIntent indicates that the request is intended for creating a new VM. CreateIntent v1alpha1.SchedulingIntent = "create" + // ReserveForFailoverIntent indicates that the request is for failover reservation scheduling. + ReserveForFailoverIntent v1alpha1.SchedulingIntent = "reserve_for_failover" ) // GetIntent analyzes the request spec and determines the intent of the scheduling request. @@ -160,6 +162,9 @@ func (req ExternalSchedulerRequest) GetIntent() (v1alpha1.SchedulingIntent, erro // See: https://github.com/sapcc/nova/blob/c88393/nova/compute/api.py#L5770 case "evacuate": return EvacuateIntent, nil + // Used by cortex failover reservation controller + case "reserve_for_failover": + return ReserveForFailoverIntent, nil default: return CreateIntent, nil } diff --git a/api/v1alpha1/history_types.go b/api/v1alpha1/history_types.go index df5e3a65d..47a37dec6 100644 --- a/api/v1alpha1/history_types.go +++ b/api/v1alpha1/history_types.go @@ -93,6 +93,7 @@ type HistoryStatus struct { // +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" // +kubebuilder:printcolumn:name="Target Host",type="string",JSONPath=".status.current.targetHost" // +kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].reason" +// +kubebuilder:printcolumn:name="Last Scheduled",type="date",JSONPath=".status.current.timestamp" // +kubebuilder:printcolumn:name="Created",type="date",JSONPath=".metadata.creationTimestamp" // The history is a CRD that provides a record of past scheduling decisions for a given resource (e.g., a nova instance). diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index 97950a395..e0f141ebc 100644 --- a/api/v1alpha1/reservation_types.go +++ b/api/v1alpha1/reservation_types.go @@ -211,6 +211,16 @@ type ReservationStatus struct { // +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".metadata.labels['reservations\\.cortex\\.cloud/type']" // +kubebuilder:printcolumn:name="Host",type="string",JSONPath=".status.host" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" +// +kubebuilder:printcolumn:name="ResourceGroup",type="string",JSONPath=".spec.committedResourceReservation.resourceGroup" +// +kubebuilder:printcolumn:name="Project",type="string",JSONPath=".spec.committedResourceReservation.projectID" +// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" +// +kubebuilder:printcolumn:name="StartTime",type="string",JSONPath=".spec.startTime",priority=1 +// +kubebuilder:printcolumn:name="EndTime",type="string",JSONPath=".spec.endTime" +// +kubebuilder:printcolumn:name="Resources",type="string",JSONPath=".spec.resources",priority=1 +// +kubebuilder:printcolumn:name="LastChanged",type="date",JSONPath=".status.failoverReservation.lastChanged",priority=1 +// +kubebuilder:printcolumn:name="AcknowledgedAt",type="date",JSONPath=".status.failoverReservation.acknowledgedAt",priority=1 +// +kubebuilder:printcolumn:name="CR Allocations",type="string",JSONPath=".status.committedResourceReservation.allocations",priority=1 +// +kubebuilder:printcolumn:name="HA Allocations",type="string",JSONPath=".status.failoverReservation.allocations",priority=1 // Reservation is the Schema for the reservations API type Reservation struct { diff --git a/cmd/main.go b/cmd/main.go index 351bf29eb..5d8acf1e7 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -110,6 +110,10 @@ func main() { manilaChecksConfig := conf.GetConfigOrDie[manila.ChecksConfig]() manila.RunChecks(ctx, client, manilaChecksConfig) return + case "e2e-commitments": + commitmentsChecksConfig := conf.GetConfigOrDie[commitments.E2EChecksConfig]() + commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig) + return } } @@ -665,7 +669,9 @@ func main() { if slices.Contains(mainConfig.EnabledTasks, "commitments-sync-task") { setupLog.Info("starting commitments syncer") - syncer := commitments.NewSyncer(multiclusterClient) + syncerMonitor := commitments.NewSyncerMonitor() + must.Succeed(metrics.Registry.Register(syncerMonitor)) + syncer := commitments.NewSyncer(multiclusterClient, syncerMonitor) syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]() syncerDefaults := commitments.DefaultSyncerConfig() if syncerConfig.SyncInterval == 0 { diff --git a/docs/guides/multicluster/readme.md b/docs/guides/multicluster/readme.md index f65d19486..ab9c9e60a 100644 --- a/docs/guides/multicluster/readme.md +++ b/docs/guides/multicluster/readme.md @@ -1,7 +1,7 @@ # Cortex Multi-Cluster Testing > [!NOTE] -> If you want to skip the reading part, there's `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you. +> If you want to skip the reading part, there's `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you. If you want to test the multi-cluster setup you can run the `schedule.sh` script, which will create a scheduling request and show you how it gets processed across the clusters. Cortex provides support for multi-cluster deployments, where a "home" cluster hosts the cortex pods and one or more "remote" clusters are used to persist CRDs. A typical use case for this would be to offload the etcd storage for Cortex CRDs to a remote cluster, reducing the resource usage on the home cluster. Similarly, another use case is to have multiple remote clusters that maintain all the compute workloads and expose resources that Cortex needs to access, such as the `Hypervisor` resource. diff --git a/docs/guides/multicluster/run.sh b/docs/guides/multicluster/run.sh index 063de8ab6..f68956870 100755 --- a/docs/guides/multicluster/run.sh +++ b/docs/guides/multicluster/run.sh @@ -52,16 +52,20 @@ global: gvks: - kvm.cloud.sap/v1/Hypervisor - kvm.cloud.sap/v1/HypervisorList + - cortex.cloud/v1alpha1/History + - cortex.cloud/v1alpha1/HistoryList labels: - az: cortex-remote-az-a + availabilityZone: cortex-remote-az-a caCert: | $(cat /tmp/root-ca-remote-az-a.pem | sed 's/^/ /') - host: https://host.docker.internal:8445 gvks: - kvm.cloud.sap/v1/Hypervisor - kvm.cloud.sap/v1/HypervisorList + - cortex.cloud/v1alpha1/History + - cortex.cloud/v1alpha1/HistoryList labels: - az: cortex-remote-az-b + availabilityZone: cortex-remote-az-b caCert: | $(cat /tmp/root-ca-remote-az-b.pem | sed 's/^/ /') EOF diff --git a/docs/guides/multicluster/schedule.sh b/docs/guides/multicluster/schedule.sh new file mode 100755 index 000000000..a1caaf24d --- /dev/null +++ b/docs/guides/multicluster/schedule.sh @@ -0,0 +1,253 @@ +#!/bin/bash + +set -e + +API_URL="http://localhost:8001/scheduler/nova/external" +INSTANCE_UUID="cortex-test-instance-001" +HISTORY_NAME="nova-$INSTANCE_UUID" + +# --- Step 1: Apply the test pipeline ----------------------------------------- + +echo "=== Step 1: Apply test pipeline ===" +echo "" +echo "The test pipeline is a minimal filter-weigher pipeline with:" +echo " - createHistory: true (so a History CRD is created for each decision)" +echo " - filter_correct_az (filters hosts not matching the requested AZ)" +echo " - no weighers (hosts are returned in their original order)" +echo "" + +kubectl --context kind-cortex-home apply -f docs/guides/multicluster/test-pipeline.yaml + +echo "" +echo "Press enter to send a scheduling request..." +read -r + +# --- Step 2: Send scheduling request ----------------------------------------- + +echo "=== Step 2: Send scheduling request ===" +echo "" +echo "Sending a Nova external scheduler request to the cortex API." +echo "" +echo " Instance UUID: $INSTANCE_UUID" +echo " Availability Zone: cortex-remote-az-b" +echo " Pipeline: multicluster-test" +echo " Candidate hosts: hypervisor-{1,2}-az-{a,b} (4 hosts across 2 AZs)" +echo "" +echo "The pipeline's filter_correct_az step should filter out the az-a hosts," +echo "leaving only hypervisor-1-az-b and hypervisor-2-az-b." +echo "" + +RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$API_URL" \ + -H "Content-Type: application/json" \ + -d @- </dev/null || echo "$BODY" + +if [ "$HTTP_CODE" != "200" ]; then + echo "" + echo "ERROR: Scheduling request failed. Check the controller logs:" + echo " kubectl --context kind-cortex-home logs deploy/cortex-nova-scheduling-controller-manager" + exit 1 +fi + +echo "" +echo "Press enter to check History CRDs and events across all clusters..." +read -r + +# --- Step 3: Check History and Events ---------------------------------------- + +echo "=== Step 3: Check History CRDs and Events ===" +echo "" +echo "The pipeline has createHistory: true, so a History CRD named '$HISTORY_NAME'" +echo "should have been created. An event should also have been recorded on it." +echo "Based on the multicluster config, this should be on the remote cluster cortex-remote-az-b." +echo "" + +sleep 1 + +for CLUSTER in kind-cortex-home kind-cortex-remote-az-a kind-cortex-remote-az-b; do + echo "--- $CLUSTER ---" + echo "Histories:" + kubectl --context "$CLUSTER" get histories 2>/dev/null || echo " (none)" + echo "Events:" + kubectl --context "$CLUSTER" get events --field-selector reason=SchedulingSucceeded 2>/dev/null || echo " (none)" + echo "" +done + +echo "Press enter to describe the History CRD and see the full scheduling result..." +read -r + +# --- Step 4: Describe History ------------------------------------------------ + +echo "=== Step 4: Describe History CRD ===" +echo "" +echo "The History CRD contains the full scheduling decision context:" +echo " - Which pipeline was used" +echo " - The target host that was selected" +echo " - An explanation of each filter/weigher step" +echo " - The Ready condition (True = host selected, False = no host found)" +echo "" + +# Try all clusters to find where the History ended up. +for CLUSTER in kind-cortex-home kind-cortex-remote-az-a kind-cortex-remote-az-b; do + if kubectl --context "$CLUSTER" get history "$HISTORY_NAME" &>/dev/null; then + echo "Found History '$HISTORY_NAME' in $CLUSTER:" + echo "" + kubectl --context "$CLUSTER" describe history "$HISTORY_NAME" + exit 0 + fi +done + +echo "WARNING: History '$HISTORY_NAME' was not found in any cluster." +echo "Check the controller logs for errors:" +echo " kubectl --context kind-cortex-home logs deploy/cortex-nova-scheduling-controller-manager | grep -i history" diff --git a/docs/guides/multicluster/test-pipeline.yaml b/docs/guides/multicluster/test-pipeline.yaml new file mode 100644 index 000000000..667f4a7fe --- /dev/null +++ b/docs/guides/multicluster/test-pipeline.yaml @@ -0,0 +1,12 @@ +apiVersion: cortex.cloud/v1alpha1 +kind: Pipeline +metadata: + name: multicluster-test +spec: + schedulingDomain: nova + description: Minimal test pipeline for the multicluster guide. + type: filter-weigher + createHistory: true + filters: + - name: filter_correct_az + weighers: [] diff --git a/go.mod b/go.mod index cf74f39f1..a6c3b9ce7 100644 --- a/go.mod +++ b/go.mod @@ -72,7 +72,7 @@ require ( github.com/poy/onpar v0.3.5 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.17.0 // indirect - github.com/sapcc/go-api-declarations v1.20.2 + github.com/sapcc/go-api-declarations v1.21.0 github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/cobra v1.10.1 // indirect github.com/spf13/pflag v1.0.10 // indirect @@ -98,7 +98,7 @@ require ( golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/sys v0.42.0 // indirect - golang.org/x/term v0.41.0 // indirect + golang.org/x/term v0.41.0 golang.org/x/text v0.33.0 // indirect golang.org/x/time v0.14.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect @@ -108,7 +108,7 @@ require ( google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect + gopkg.in/yaml.v3 v3.0.1 gotest.tools v2.2.0+incompatible // indirect k8s.io/apiextensions-apiserver v0.35.0 // indirect k8s.io/apiserver v0.35.0 // indirect diff --git a/go.sum b/go.sum index 98a4f9477..6dba2b5cc 100644 --- a/go.sum +++ b/go.sum @@ -176,8 +176,8 @@ github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUO github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/sapcc/go-api-declarations v1.20.2 h1:GWqv8VgsF4k9id6N051AVTaEpcjT02APsOuz2yCvTPQ= -github.com/sapcc/go-api-declarations v1.20.2/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc= +github.com/sapcc/go-api-declarations v1.21.0 h1:Ag6GXgJLTFdBDKmrJU4QFllQbgGSenSGeHpLuvuxeDk= +github.com/sapcc/go-api-declarations v1.21.0/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc= github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e h1:4wgkrfAlnL6ffM7HTNoHn1HrBBurCRR71WNOszdiDNQ= github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e/go.mod h1:NZjMiGVm04U25vwR6ZWvMw0XOOnvS1jkmXpjiepOeUw= github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= @@ -251,12 +251,8 @@ golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= -golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= -golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= -golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index 4c4c5889b..428644f0a 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-cinder description: A Helm chart deploying Cortex for Cinder. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index 0d59d0740..034340d59 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-crds description: A Helm chart deploying Cortex CRDs. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index 5f975675d..b1fea5199 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-ironcore description: A Helm chart deploying Cortex for IronCore. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index c253c3da9..329533c08 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-manila description: A Helm chart deploying Cortex for Manila. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index e77dfe3b3..4f971cb64 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-nova description: A Helm chart deploying Cortex for Nova. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 9b80e079b..784830aac 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -276,7 +276,25 @@ groups: configuration. It is recommended to investigate the pipeline status and logs for more details. - # Committed Resource (Limes Integration) Alerts + # Committed Resource Info API Alerts + - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource info API HTTP 500 errors too high" + description: > + The committed resource info API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems building service info, + such as invalid flavor group data. Limes will not be able to discover available + resources until the issue is resolved. + + # Committed Resource Change API Alerts - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 for: 5m @@ -465,3 +483,115 @@ groups: The committed resource capacity API (Limes LIQUID integration) is experiencing high latency (p95 > 5s). This may indicate slow database queries or knowledge CRD retrieval. Limes scrapes may time out, affecting capacity reporting. + + # Committed Resource Syncer Alerts + - alert: CortexNovaCommittedResourceSyncerNotRunning + expr: increase(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}[2h]) == 0 + for: 5m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer not running" + description: > + The committed resource syncer has not run in the last 2 hours. This indicates + that the syncer may have stopped or is encountering errors. Check the syncer logs for errors. + + - alert: CortexNovaCommittedResourceSyncerErrorsHigh + expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 + for: 5m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer experiencing errors" + description: > + The committed resource syncer has encountered multiple errors in the last hour. + This may indicate connectivity issues with Limes. Check the syncer logs for error details. + + - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh + expr: | + rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0.05 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unit mismatch rate >5%" + description: > + More than 5% of commitments are being skipped due to unit mismatches between + Limes and Cortex flavor groups. This happens when Limes has not yet been + updated to use the new unit format after a flavor group change. The affected + commitments will keep their existing reservations until Limes notices the update. + Check the logs if this error persists for longer time. + + - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh + expr: | + rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unknown flavor group rate >0%" + description: > + Some commitments reference flavor groups that don't exist in + Cortex Knowledge (anymore). This may indicate that flavor group configuration is + out of sync between Limes and Cortex, or that Knowledge extraction is failing. + Check the flavor group Knowledge CRD and history to see what was changed. + + - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh + expr: | + ( + rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0.01 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer local change rate >1%" + description: > + More than 1% of synced commitments are requiring reservation changes + (creates, deletes, or repairs). This is higher than expected for steady-state + operation and may indicate data inconsistencies, external modifications to + reservations, or issues with the CRDs. Check Cortex logs for details. + + - alert: CortexNovaCommittedResourceSyncerRepairRateHigh + expr: | + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer repair rate >0%" + description: > + Some commitments have reservations that needed repair + (wrong metadata like project ID or flavor group). This may indicate data + corruption, bugs in reservation creation, or external modifications. + Reservations are automatically repaired, but the root cause should be + investigated if this alert persists. diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index 1913ff39a..445a7f4c1 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -291,7 +291,7 @@ spec: This is the pipeline used for KVM hypervisors (qemu and cloud-hypervisor). type: filter-weigher - createHistory: true + createHistory: false # Fetch all placement candidates, ignoring nova's preselection. ignorePreselection: true filters: diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index c44db2b25..a4077b180 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-pods description: A Helm chart deploying Cortex for Pods. type: application -version: 0.0.44 +version: 0.0.45 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.31 + version: 0.0.32 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index caf0030ea..c836985a0 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: cortex description: A Helm chart to distribute cortex. type: application -version: 0.0.31 -appVersion: "sha-88946405" +version: 0.0.32 +appVersion: "sha-62700947" icon: "https://example.com/icon.png" dependencies: [] diff --git a/helm/library/cortex/files/crds/cortex.cloud_histories.yaml b/helm/library/cortex/files/crds/cortex.cloud_histories.yaml index f91ad0cfc..cba52fa9b 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_histories.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_histories.yaml @@ -30,6 +30,9 @@ spec: - jsonPath: .status.conditions[?(@.type=='Ready')].reason name: Status type: string + - jsonPath: .status.current.timestamp + name: Last Scheduled + type: date - jsonPath: .metadata.creationTimestamp name: Created type: date diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml index ab622c1c0..64931bc4d 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml @@ -24,6 +24,42 @@ spec: - jsonPath: .status.conditions[?(@.type=='Ready')].status name: Ready type: string + - jsonPath: .spec.committedResourceReservation.resourceGroup + name: ResourceGroup + type: string + - jsonPath: .spec.committedResourceReservation.projectID + name: Project + type: string + - jsonPath: .spec.availabilityZone + name: AZ + type: string + - jsonPath: .spec.startTime + name: StartTime + priority: 1 + type: string + - jsonPath: .spec.endTime + name: EndTime + type: string + - jsonPath: .spec.resources + name: Resources + priority: 1 + type: string + - jsonPath: .status.failoverReservation.lastChanged + name: LastChanged + priority: 1 + type: date + - jsonPath: .status.failoverReservation.acknowledgedAt + name: AcknowledgedAt + priority: 1 + type: date + - jsonPath: .status.committedResourceReservation.allocations + name: CR Allocations + priority: 1 + type: string + - jsonPath: .status.failoverReservation.allocations + name: HA Allocations + priority: 1 + type: string name: v1alpha1 schema: openAPIV3Schema: diff --git a/internal/scheduling/cinder/filter_weigher_pipeline_controller.go b/internal/scheduling/cinder/filter_weigher_pipeline_controller.go index 7163cc5a6..52ec37306 100644 --- a/internal/scheduling/cinder/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/cinder/filter_weigher_pipeline_controller.go @@ -148,7 +148,7 @@ func (c *FilterWeigherPipelineController) InitPipeline( func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainCinder - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-cinder-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-cinder-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/lib/history_client_test.go b/internal/scheduling/lib/history_client_test.go index d9e919cb7..56572f3eb 100644 --- a/internal/scheduling/lib/history_client_test.go +++ b/internal/scheduling/lib/history_client_test.go @@ -8,9 +8,7 @@ import ( "errors" "fmt" "strings" - "sync" "testing" - "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" testlib "github.com/cobaltcore-dev/cortex/pkg/testing" @@ -207,7 +205,7 @@ func newTestScheme(t *testing.T) *runtime.Scheme { return scheme } -func TestHistoryManager_Upsert(t *testing.T) { +func TestHistoryClient_CreateOrUpdateHistory(t *testing.T) { tests := []struct { name string // setup returns a fake client and any pre-existing objects. @@ -563,62 +561,7 @@ func TestHistoryManager_Upsert(t *testing.T) { } } -func TestHistoryManager_UpsertFromGoroutine(t *testing.T) { - c := fake.NewClientBuilder(). - WithScheme(newTestScheme(t)). - WithStatusSubresource(&v1alpha1.History{}). - Build() - hm := HistoryClient{Client: c} - - decision := &v1alpha1.Decision{ - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "async-uuid", - PipelineRef: corev1.ObjectReference{Name: "nova-pipeline"}, - Intent: v1alpha1.SchedulingIntentUnknown, - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: testlib.Ptr("compute-async"), - }, - }, - } - - // Mirrors the pattern used in pipeline controllers. - var wg sync.WaitGroup - var upsertErr error - wg.Go(func() { - upsertErr = hm.CreateOrUpdateHistory(context.Background(), decision, nil, nil) - }) - - // Poll for history creation. - var histories v1alpha1.HistoryList - deadline := time.Now().Add(2 * time.Second) - for { - if err := c.List(context.Background(), &histories); err != nil { - t.Fatalf("Failed to list histories: %v", err) - } - if len(histories.Items) > 0 { - break - } - if time.Now().After(deadline) { - t.Fatal("timed out waiting for async history creation") - } - time.Sleep(5 * time.Millisecond) - } - - wg.Wait() - if upsertErr != nil { - t.Fatalf("Upsert() returned error: %v", upsertErr) - } - - got := histories.Items[0].Status.Current.TargetHost - if got == nil || *got != "compute-async" { - t.Errorf("target host = %v, want %q", got, "compute-async") - } -} - -func TestHistoryManager_Delete(t *testing.T) { +func TestHistoryClient_Delete(t *testing.T) { tests := []struct { name string domain v1alpha1.SchedulingDomain diff --git a/internal/scheduling/machines/filter_weigher_pipeline_controller.go b/internal/scheduling/machines/filter_weigher_pipeline_controller.go index d76203812..35d51708a 100644 --- a/internal/scheduling/machines/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/machines/filter_weigher_pipeline_controller.go @@ -222,7 +222,7 @@ func (c *FilterWeigherPipelineController) handleMachine() handler.EventHandler { func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainMachines - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-machines-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-machines-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/manila/filter_weigher_pipeline_controller.go b/internal/scheduling/manila/filter_weigher_pipeline_controller.go index 6ab938511..128b7d719 100644 --- a/internal/scheduling/manila/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/manila/filter_weigher_pipeline_controller.go @@ -148,7 +148,7 @@ func (c *FilterWeigherPipelineController) InitPipeline( func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainManila - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-manila-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-manila-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/nova/filter_weigher_pipeline_controller.go b/internal/scheduling/nova/filter_weigher_pipeline_controller.go index bb9c5bb07..279ac1c3e 100644 --- a/internal/scheduling/nova/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/nova/filter_weigher_pipeline_controller.go @@ -199,7 +199,7 @@ func (c *FilterWeigherPipelineController) InitPipeline( func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainNova - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-nova-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-nova-scheduler")} c.gatherer = &candidateGatherer{Client: mcl} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err diff --git a/internal/scheduling/nova/plugins/filters/filter_requested_destination.go b/internal/scheduling/nova/plugins/filters/filter_requested_destination.go index c6d307b78..dd1804dc8 100644 --- a/internal/scheduling/nova/plugins/filters/filter_requested_destination.go +++ b/internal/scheduling/nova/plugins/filters/filter_requested_destination.go @@ -7,6 +7,7 @@ import ( "context" "log/slog" "slices" + "strings" api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" @@ -38,8 +39,11 @@ type FilterRequestedDestinationStep struct { } // processRequestedAggregates filters hosts based on the requested aggregates. -// It removes hosts that are not part of any of the requested aggregates, -// respecting the IgnoredAggregates option. Returns early without filtering +// The aggregates list uses AND logic between elements, meaning a host must match +// ALL elements to pass. Each element can contain comma-separated UUIDs which use +// OR logic, meaning the host only needs to match ONE of the UUIDs in that group. +// Example: ["agg1", "agg2,agg3"] means host must be in agg1 AND (agg2 OR agg3). +// Respects the IgnoredAggregates option and returns early without filtering // if all requested aggregates are in the ignored list. func (s *FilterRequestedDestinationStep) processRequestedAggregates( traceLog *slog.Logger, @@ -51,13 +55,12 @@ func (s *FilterRequestedDestinationStep) processRequestedAggregates( if len(aggregates) == 0 { return } + // Filter out ignored aggregates aggregatesToConsider := make([]string, 0, len(aggregates)) for _, agg := range aggregates { - if slices.Contains(s.Options.IgnoredAggregates, agg) { - traceLog.Info("ignoring aggregate in requested_destination as it is in the ignored list", "aggregate", agg) - continue + if !slices.Contains(s.Options.IgnoredAggregates, agg) { + aggregatesToConsider = append(aggregatesToConsider, agg) } - aggregatesToConsider = append(aggregatesToConsider, agg) } if len(aggregatesToConsider) == 0 { traceLog.Info("all aggregates in requested_destination are in the ignored list, skipping aggregate filtering") @@ -74,14 +77,24 @@ func (s *FilterRequestedDestinationStep) processRequestedAggregates( for _, agg := range hv.Status.Aggregates { hvAggregateUUIDs = append(hvAggregateUUIDs, agg.UUID) } - found := false - for _, reqAgg := range aggregatesToConsider { - if slices.Contains(hvAggregateUUIDs, reqAgg) { - found = true + // All outer elements must match (AND logic) + // Each element can be comma-separated UUIDs (OR logic within the group) + allMatch := true + for _, reqAggGroup := range aggregatesToConsider { + reqAggs := strings.Split(reqAggGroup, ",") + groupMatch := false + for _, reqAgg := range reqAggs { + if slices.Contains(hvAggregateUUIDs, reqAgg) { + groupMatch = true + break + } + } + if !groupMatch { + allMatch = false break } } - if !found { + if !allMatch { delete(activations, host) traceLog.Info( "filtered out host not in requested_destination aggregates", diff --git a/internal/scheduling/nova/plugins/filters/filter_requested_destination_test.go b/internal/scheduling/nova/plugins/filters/filter_requested_destination_test.go index b5307efd8..a2b7e1055 100644 --- a/internal/scheduling/nova/plugins/filters/filter_requested_destination_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_requested_destination_test.go @@ -200,13 +200,14 @@ func TestFilterRequestedDestinationStep_Run(t *testing.T) { expectErr: false, }, { - name: "Filter by multiple aggregates", + name: "Filter by multiple aggregates with OR syntax (comma-separated)", request: api.ExternalSchedulerRequest{ Spec: api.NovaObject[api.NovaSpec]{ Data: api.NovaSpec{ RequestedDestination: &api.NovaObject[api.NovaRequestedDestination]{ Data: api.NovaRequestedDestination{ - Aggregates: []string{"aggregate1", "aggregate3"}, + // "aggregate1,aggregate3" means host must be in aggregate1 OR aggregate3 + Aggregates: []string{"aggregate1,aggregate3"}, }, }, }, @@ -241,12 +242,13 @@ func TestFilterRequestedDestinationStep_Run(t *testing.T) { expectErr: false, }, { - name: "Filter by aggregates - hosts in both spec and status", + name: "Filter by multiple aggregates with AND logic", request: api.ExternalSchedulerRequest{ Spec: api.NovaObject[api.NovaSpec]{ Data: api.NovaSpec{ RequestedDestination: &api.NovaObject[api.NovaRequestedDestination]{ Data: api.NovaRequestedDestination{ + // ["aggregate1", "aggregate2"] means host must be in aggregate1 AND aggregate2 Aggregates: []string{"aggregate1", "aggregate2"}, }, }, @@ -258,6 +260,85 @@ func TestFilterRequestedDestinationStep_Run(t *testing.T) { {ComputeHost: "host3"}, }, }, + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: metav1.ObjectMeta{Name: "host1"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate1"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host2"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate1"}, {UUID: "aggregate2"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host3"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate2"}}}, + }, + }, + expectedHosts: []string{"host2"}, + filteredHosts: []string{"host1", "host3"}, + expectErr: false, + }, + { + name: "Filter by mixed AND/OR aggregates", + request: api.ExternalSchedulerRequest{ + Spec: api.NovaObject[api.NovaSpec]{ + Data: api.NovaSpec{ + RequestedDestination: &api.NovaObject[api.NovaRequestedDestination]{ + Data: api.NovaRequestedDestination{ + // Host must be in (aggregate1 OR aggregate2) AND aggregate3 + Aggregates: []string{"aggregate1,aggregate2", "aggregate3"}, + }, + }, + }, + }, + Hosts: []api.ExternalSchedulerHost{ + {ComputeHost: "host1"}, + {ComputeHost: "host2"}, + {ComputeHost: "host3"}, + {ComputeHost: "host4"}, + }, + }, + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: metav1.ObjectMeta{Name: "host1"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate1"}, {UUID: "aggregate3"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host2"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate2"}, {UUID: "aggregate3"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host3"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate1"}}}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "host4"}, + Status: hv1.HypervisorStatus{Aggregates: []hv1.Aggregate{{UUID: "aggregate3"}}}, + }, + }, + expectedHosts: []string{"host1", "host2"}, + filteredHosts: []string{"host3", "host4"}, + expectErr: false, + }, + { + name: "Filter by aggregates with OR syntax - hosts in both spec and status", + request: api.ExternalSchedulerRequest{ + Spec: api.NovaObject[api.NovaSpec]{ + Data: api.NovaSpec{ + RequestedDestination: &api.NovaObject[api.NovaRequestedDestination]{ + Data: api.NovaRequestedDestination{ + // "aggregate1,aggregate2" means host must be in aggregate1 OR aggregate2 + Aggregates: []string{"aggregate1,aggregate2"}, + }, + }, + }, + }, + Hosts: []api.ExternalSchedulerHost{ + {ComputeHost: "host1"}, + {ComputeHost: "host2"}, + {ComputeHost: "host3"}, + }, + }, hypervisors: []hv1.Hypervisor{ { ObjectMeta: metav1.ObjectMeta{Name: "host1"}, diff --git a/internal/scheduling/pods/filter_weigher_pipeline_controller.go b/internal/scheduling/pods/filter_weigher_pipeline_controller.go index ceba977b8..0ceee6485 100644 --- a/internal/scheduling/pods/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/pods/filter_weigher_pipeline_controller.go @@ -234,7 +234,7 @@ func (c *FilterWeigherPipelineController) handlePod() handler.EventHandler { func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainPods - c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-pods-scheduler")} + c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-pods-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go index daedb8a2f..06fb97be1 100644 --- a/internal/scheduling/reservations/commitments/api.go +++ b/internal/scheduling/reservations/commitments/api.go @@ -6,6 +6,7 @@ package commitments import ( "context" "net/http" + "strings" "sync" "github.com/cobaltcore-dev/cortex/internal/scheduling/nova" @@ -28,6 +29,7 @@ type HTTPAPI struct { monitor ChangeCommitmentsAPIMonitor usageMonitor ReportUsageAPIMonitor capacityMonitor ReportCapacityAPIMonitor + infoMonitor InfoAPIMonitor // Mutex to serialize change-commitments requests changeMutex sync.Mutex } @@ -44,6 +46,7 @@ func NewAPIWithConfig(client client.Client, config Config, novaClient UsageNovaC monitor: NewChangeCommitmentsAPIMonitor(), usageMonitor: NewReportUsageAPIMonitor(), capacityMonitor: NewReportCapacityAPIMonitor(), + infoMonitor: NewInfoAPIMonitor(), } } @@ -51,13 +54,27 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log registry.MustRegister(&api.monitor) registry.MustRegister(&api.usageMonitor) registry.MustRegister(&api.capacityMonitor) - mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments) - mux.HandleFunc("/v1/commitments/report-capacity", api.HandleReportCapacity) - mux.HandleFunc("/v1/commitments/info", api.HandleInfo) - mux.HandleFunc("/v1/commitments/projects/", api.HandleReportUsage) // matches /v1/commitments/projects/:project_id/report-usage + registry.MustRegister(&api.infoMonitor) + mux.HandleFunc("/commitments/v1/change-commitments", api.HandleChangeCommitments) + mux.HandleFunc("/commitments/v1/report-capacity", api.HandleReportCapacity) + mux.HandleFunc("/commitments/v1/info", api.HandleInfo) + mux.HandleFunc("/commitments/v1/projects/", api.handleProjectEndpoint) // routes to report-usage or quota log.Info("commitments API initialized", "changeCommitmentsEnabled", api.config.EnableChangeCommitmentsAPI, "reportUsageEnabled", api.config.EnableReportUsageAPI, "reportCapacityEnabled", api.config.EnableReportCapacityAPI) } + +// handleProjectEndpoint routes /commitments/v1/projects/:project_id/... requests to the appropriate handler. +func (api *HTTPAPI) handleProjectEndpoint(w http.ResponseWriter, r *http.Request) { + path := r.URL.Path + switch { + case strings.HasSuffix(path, "/report-usage"): + api.HandleReportUsage(w, r) + case strings.HasSuffix(path, "/quota"): + api.HandleQuota(w, r) + default: + http.Error(w, "Not found", http.StatusNotFound) + } +} diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 953319dbe..5c2436267 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -36,7 +36,7 @@ func sortedKeys[K ~string, V any](m map[K]V) []K { return keys } -// implements POST /v1/change-commitments from Limes LIQUID API: +// implements POST /commitments/v1/change-commitments from Limes LIQUID API: // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // @@ -69,7 +69,7 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque defer api.changeMutex.Unlock() ctx := reservations.WithGlobalRequestID(context.Background(), "committed-resource-"+requestID) - logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/change-commitments") + logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/commitments/v1/change-commitments") // Only accept POST method if r.Method != http.MethodPost { @@ -253,15 +253,15 @@ ProcessLoop: logger.V(1).Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldMemory", stateBefore.TotalMemoryBytes, "desiredMemory", stateDesired.TotalMemoryBytes) - touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi") + applyResult, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi") if err != nil { failedCommitments[string(commitment.UUID)] = "failed to apply commitment state" logger.Info("failed to apply commitment state for commitment", "commitmentUUID", commitment.UUID, "error", err) requireRollback = true break ProcessLoop } - logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) - reservationsToWatch = append(reservationsToWatch, touchedReservations...) + logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(applyResult.TouchedReservations), "deletedReservations", len(applyResult.RemovedReservations)) + reservationsToWatch = append(reservationsToWatch, applyResult.TouchedReservations...) } } } @@ -305,7 +305,7 @@ ProcessLoop: for commitmentUUID, state := range statesBefore { // Rollback to statesBefore for this commitment logger.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state) - _, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback") + _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback") if err != nil { logger.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err) // continue with best effort rollback for other projects diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go index 255f05bd8..74fc0c938 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_monitor_test.go @@ -172,24 +172,24 @@ func TestCountCommitments(t *testing.T) { { name: "Single commitment", request: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-1", "confirmed", 2), ), expected: 1, }, { name: "Multiple commitments same project", request: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2), - createCommitment("ram_hana_2", "project-A", "uuid-2", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-1", "confirmed", 2), + createCommitment("hw_version_hana_2_ram", "project-A", "uuid-2", "confirmed", 2), ), expected: 2, }, { name: "Multiple commitments multiple projects", request: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-1", "confirmed", 2), - createCommitment("ram_hana_1", "project-B", "uuid-2", "confirmed", 3), - createCommitment("ram_gp_1", "project-C", "uuid-3", "confirmed", 1), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-1", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-2", "confirmed", 3), + createCommitment("hw_version_gp_1_ram", "project-C", "uuid-3", "confirmed", 1), ), expected: 3, }, diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go index 6ceb8b65f..c304d9e5a 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -55,7 +55,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, }, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("ram_hana_1", "project-A", "uuid-123", "confirmed", 2)), + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("hw_version_hana_1_ram", "project-A", "uuid-123", "confirmed", 2)), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}}, {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, @@ -67,7 +67,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { VMs: []*TestVM{}, Flavors: []*TestFlavor{m1Small}, ExistingReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 3)), + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 3)), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, ExpectedReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"), @@ -78,7 +78,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Flavors: []*TestFlavor{m1Small}, ExistingReservations: []*TestReservation{}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", strings.Repeat("long-", 13), "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", strings.Repeat("long-", 13), "confirmed", 3), ), AvailableResources: &AvailableResources{}, ExpectedReservations: []*TestReservation{}, @@ -90,7 +90,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Flavors: []*TestFlavor{m1Small}, ExistingReservations: []*TestReservation{}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid with space", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid with space", "confirmed", 3), ), AvailableResources: &AvailableResources{}, ExpectedReservations: []*TestReservation{}, @@ -103,8 +103,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, {CommitmentID: "uuid-456", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 0), - createCommitment("ram_hana_1", "project-B", "uuid-123", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-123", "confirmed", 2), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, ExpectedReservations: []*TestReservation{ @@ -119,8 +119,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 2), - createCommitment("ram_hana_1", "project-B", "uuid-123", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-123", "confirmed", 0), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, ExpectedReservations: []*TestReservation{ @@ -133,7 +133,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { // Greedy selection: 10GB request with 8/4/1GB flavors → picks 1×8GB + 2×1GB Flavors: []*TestFlavor{m1XL, m1Large, m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-binpack", "confirmed", 10), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-binpack", "confirmed", 10), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-binpack", Flavor: m1XL, ProjectID: "project-A"}, @@ -147,7 +147,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { // InfoVersion validation prevents stale requests (1233 vs 1234) Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1233, - createCommitment("ram_hana_1", "project-A", "uuid-version", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-version", "confirmed", 2), ), EnvInfoVersion: 1234, ExpectedReservations: []*TestReservation{}, @@ -161,8 +161,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-project-a", "confirmed", 2), - createCommitment("ram_hana_1", "project-B", "uuid-project-b", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-project-a", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-project-b", "confirmed", 2), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, ExpectedReservations: []*TestReservation{ @@ -180,8 +180,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "commitment-A", "confirmed", 0), - createCommitment("ram_hana_1", "project-B", "commitment-B", "confirmed", 6), + createCommitment("hw_version_hana_1_ram", "project-A", "commitment-A", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-B", "commitment-B", "confirmed", 6), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0}}, ExpectedReservations: []*TestReservation{ @@ -195,7 +195,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "New commitment creation - from zero to N reservations", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-new", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 3), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, @@ -208,7 +208,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "New commitment creation - large batch", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-new", "confirmed", 200), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 200), ), ExpectedReservations: func() []*TestReservation { var reservations []*TestReservation @@ -232,7 +232,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 4), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 4), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, @@ -249,7 +249,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 6), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 6), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, @@ -268,7 +268,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-custom", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 3), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, @@ -287,7 +287,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-delete", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-delete", "confirmed", 0), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, @@ -304,7 +304,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-growth", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-growth", "confirmed", 3), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}}, @@ -317,8 +317,8 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Multi-project success - both projects succeed", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-a", "confirmed", 2), - createCommitment("ram_hana_1", "project-B", "uuid-b", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", "confirmed", 2), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"}, @@ -329,15 +329,15 @@ func TestCommitmentChangeIntegration(t *testing.T) { ExpectedAPIResponse: newAPIResponse(), }, { - Name: "Multiple flavor groups - ram_hana_1 and ram_hana_2", + Name: "Multiple flavor groups - hw_version_hana_1_ram and hw_version_hana_2_ram", // Amount in multiples of smallest flavor: hana_1 (2×1GB), hana_2 (2×2GB) Flavors: []*TestFlavor{ m1Small, {Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-hana1", "confirmed", 2), - createCommitment("ram_hana_2", "project-A", "uuid-hana2", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-hana1", "confirmed", 2), + createCommitment("hw_version_hana_2_ram", "project-A", "uuid-hana2", "confirmed", 2), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"}, @@ -351,7 +351,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Unknown flavor group - clear rejection message", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_nonexistent", "project-A", "uuid-unknown", "confirmed", 2), + createCommitment("hw_version_nonexistent_ram", "project-A", "uuid-unknown", "confirmed", 2), ), ExpectedReservations: []*TestReservation{}, ExpectedAPIResponse: newAPIResponse("flavor group not found"), @@ -366,9 +366,9 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-b", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-a", "confirmed", 0), - createCommitment("ram_hana_1", "project-B", "uuid-b", "confirmed", 0), - createCommitment("ram_hana_1", "project-C", "uuid-c", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", "confirmed", 0), + createCommitment("hw_version_hana_1_ram", "project-C", "uuid-c", "confirmed", 3), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0, "host-3": 0}}, ExpectedReservations: []*TestReservation{ @@ -388,7 +388,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "uuid-repair", Host: "host-4", Flavor: m1Small, ProjectID: "project-A", AZ: "wrong-az"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-repair", "confirmed", 8, "az-a"), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-repair", "confirmed", 8, "az-a"), ), ExpectedReservations: []*TestReservation{ {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, @@ -410,7 +410,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Dry run request - feature not yet implemented", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", true, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-dryrun", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-dryrun", "confirmed", 2), ), ExpectedReservations: []*TestReservation{}, ExpectedAPIResponse: newAPIResponse("Dry run not supported"), @@ -419,7 +419,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Knowledge not ready - clear rejection with RetryAt", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-knowledge", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-knowledge", "confirmed", 2), ), ExpectedReservations: []*TestReservation{}, ExpectedAPIResponse: APIResponseExpectation{ @@ -432,7 +432,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "API disabled - returns 503 Service Unavailable", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-disabled", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-disabled", "confirmed", 2), ), CustomConfig: func() *Config { cfg := DefaultConfig() @@ -449,9 +449,9 @@ func TestCommitmentChangeIntegration(t *testing.T) { // Tests that multiple failed commitments are all mentioned in the rejection reason Flavors: []*TestFlavor{m1Small, m1Tiny}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-multi-fail-1", "confirmed", 3), - createCommitment("ram_hana_1", "project-B", "uuid-multi-fail-2", "confirmed", 3), - createCommitment("ram_gp_1", "project-C", "uuid-would-not-fail", "confirmed", 1), // would be rolled back, but not part of the reject reason + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-multi-fail-1", "confirmed", 3), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-multi-fail-2", "confirmed", 3), + createCommitment("hw_version_gp_1_ram", "project-C", "uuid-would-not-fail", "confirmed", 1), // would be rolled back, but not part of the reject reason ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 256}}, ExpectedReservations: []*TestReservation{}, @@ -470,7 +470,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { {CommitmentID: "commitment-1", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "commitment-1", "confirmed", 4), + createCommitment("hw_version_hana_1_ram", "project-A", "commitment-1", "confirmed", 4), ), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 1024}}, ExpectedReservations: []*TestReservation{ @@ -485,7 +485,7 @@ func TestCommitmentChangeIntegration(t *testing.T) { Name: "Watch timeout with custom config - triggers rollback with timeout error", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("ram_hana_1", "project-A", "uuid-timeout", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-timeout", "confirmed", 2), ), // With 0ms timeout, the watch will timeout immediately before reservations become ready CustomConfig: func() *Config { @@ -615,8 +615,16 @@ func (tfg TestFlavorGroup) ToFlavorGroupsKnowledge() FlavorGroupsKnowledge { groupMap[groupName] = append(groupMap[groupName], flavor) } + // Sort group names for deterministic iteration + sortedGroupNames := make([]string, 0, len(groupMap)) + for groupName := range groupMap { + sortedGroupNames = append(sortedGroupNames, groupName) + } + sort.Strings(sortedGroupNames) + var groups []compute.FlavorGroupFeature - for groupName, groupFlavors := range groupMap { + for _, groupName := range sortedGroupNames { + groupFlavors := groupMap[groupName] if len(groupFlavors) == 0 { continue } @@ -1085,7 +1093,7 @@ func (env *CommitmentTestEnv) CallChangeCommitmentsAPI(reqJSON string) (resp liq }() // Make HTTP request - url := env.HTTPServer.URL + "/v1/commitments/change-commitments" + url := env.HTTPServer.URL + "/commitments/v1/change-commitments" httpResp, err := http.Post(url, "application/json", bytes.NewReader([]byte(reqJSON))) //nolint:gosec,noctx // test server URL, not user input if err != nil { cancel() @@ -1765,6 +1773,7 @@ func newAPIResponse(rejectReasonSubstrings ...string) APIResponseExpectation { // buildRequestJSON converts a test CommitmentChangeRequest to JSON string. // Builds the nested JSON structure directly for simplicity. +// Uses sorted iteration to ensure deterministic JSON output. func buildRequestJSON(req CommitmentChangeRequest) string { // Group commitments by project and resource for nested structure type projectResources map[liquid.ResourceName][]TestCommitment @@ -1780,11 +1789,30 @@ func buildRequestJSON(req CommitmentChangeRequest) string { ) } - // Build nested JSON structure + // Sort projects for deterministic iteration + sortedProjects := make([]string, 0, len(byProject)) + for projectID := range byProject { + sortedProjects = append(sortedProjects, projectID) + } + sort.Strings(sortedProjects) + + // Build nested JSON structure with sorted iteration var projectParts []string - for projectID, resources := range byProject { + for _, projectID := range sortedProjects { + resources := byProject[projectID] + + // Sort resource names for deterministic iteration + sortedResources := make([]liquid.ResourceName, 0, len(resources)) + for resourceName := range resources { + sortedResources = append(sortedResources, resourceName) + } + sort.Slice(sortedResources, func(i, j int) bool { + return string(sortedResources[i]) < string(sortedResources[j]) + }) + var resourceParts []string - for resourceName, commits := range resources { + for _, resourceName := range sortedResources { + commits := resources[resourceName] var commitParts []string for _, c := range commits { expiryTime := time.Now().Add(time.Duration(defaultCommitmentExpiryYears) * 365 * 24 * time.Hour) diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go index c189c859a..9b2b8eb4a 100644 --- a/internal/scheduling/reservations/commitments/api_info.go +++ b/internal/scheduling/reservations/commitments/api_info.go @@ -6,9 +6,12 @@ package commitments import ( "context" "encoding/json" + "errors" "fmt" "net/http" + "strconv" "strings" + "time" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/go-logr/logr" @@ -16,10 +19,16 @@ import ( liquid "github.com/sapcc/go-api-declarations/liquid" ) -// handles GET /v1/info requests from Limes: +// errInternalServiceInfo indicates an internal error while building service info (e.g., invalid unit configuration) +var errInternalServiceInfo = errors.New("internal error building service info") + +// handles GET /commitments/v1/info requests from Limes: // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { + startTime := time.Now() + statusCode := http.StatusOK + // Extract or generate request ID for tracing requestID := r.Header.Get("X-Request-ID") if requestID == "" { @@ -29,11 +38,13 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { w.Header().Set("X-Request-ID", requestID) ctx := reservations.WithGlobalRequestID(r.Context(), "committed-resource-"+requestID) - logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/info") + logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/commitments/v1/info") // Only accept GET method if r.Method != http.MethodGet { - http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + statusCode = http.StatusMethodNotAllowed + http.Error(w, "Method not allowed", statusCode) + api.recordInfoMetrics(statusCode, startTime) return } @@ -42,20 +53,35 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { // Build info response info, err := api.buildServiceInfo(ctx, logger) if err != nil { - // Use Info level for expected conditions like knowledge not being ready yet - logger.Info("service info not available yet", "error", err.Error()) - http.Error(w, "Service temporarily unavailable: "+err.Error(), - http.StatusServiceUnavailable) + if errors.Is(err, errInternalServiceInfo) { + logger.Error(err, "internal error building service info") + statusCode = http.StatusInternalServerError + http.Error(w, "Internal server error: "+err.Error(), statusCode) + } else { + // Use Info level for expected conditions like knowledge not being ready yet + logger.Info("service info not available yet", "error", err.Error()) + statusCode = http.StatusServiceUnavailable + http.Error(w, "Service temporarily unavailable: "+err.Error(), statusCode) + } + api.recordInfoMetrics(statusCode, startTime) return } // Return response w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusOK) + w.WriteHeader(statusCode) if err := json.NewEncoder(w).Encode(info); err != nil { logger.Error(err, "failed to encode service info") - return } + api.recordInfoMetrics(statusCode, startTime) +} + +// recordInfoMetrics records Prometheus metrics for an info API request. +func (api *HTTPAPI) recordInfoMetrics(statusCode int, startTime time.Time) { + duration := time.Since(startTime).Seconds() + statusCodeStr := strconv.Itoa(statusCode) + api.infoMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc() + api.infoMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration) } // resourceAttributes holds the custom attributes for a resource in the info API response. @@ -81,7 +107,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l // Build resources map resources := make(map[liquid.ResourceName]liquid.ResourceInfo) for groupName, groupData := range flavorGroups { - resourceName := liquid.ResourceName(commitmentResourceNamePrefix + groupName) + resourceName := liquid.ResourceName(ResourceNameFromFlavorGroup(groupName)) flavorNames := make([]string, 0, len(groupData.Flavors)) for _, flavor := range groupData.Flavors { @@ -108,12 +134,25 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l attrsJSON = nil } + // Build unit from smallest flavor memory (e.g., "131072 MiB" for 128 GiB) + // Validate memory is positive to avoid panic in MultiplyBy (which panics on factor=0) + if groupData.SmallestFlavor.MemoryMB == 0 { + return liquid.ServiceInfo{}, fmt.Errorf("%w: flavor group %q has invalid smallest flavor with memoryMB=0", + errInternalServiceInfo, groupName) + } + unit, err := liquid.UnitMebibytes.MultiplyBy(groupData.SmallestFlavor.MemoryMB) + if err != nil { + // Note: This error only occurs on uint64 overflow, which is unrealistic for memory values + return liquid.ServiceInfo{}, fmt.Errorf("%w: failed to create unit for flavor group %q: %w", + errInternalServiceInfo, groupName, err) + } + resources[resourceName] = liquid.ResourceInfo{ DisplayName: displayName, - Unit: liquid.UnitNone, // Countable: multiples of smallest flavor instances + Unit: unit, // Non-standard unit: multiples of smallest flavor RAM Topology: liquid.AZAwareTopology, // Commitments are per-AZ NeedsResourceDemand: false, // Capacity planning out of scope for now - HasCapacity: handlesCommitments, // We report capacity via /v1/report-capacity only for groups that accept commitments + HasCapacity: handlesCommitments, // We report capacity via /commitments/v1/report-capacity only for groups that accept commitments HasQuota: false, // No quota enforcement as of now HandlesCommitments: handlesCommitments, // Only for groups with fixed RAM/core ratio Attributes: attrsJSON, diff --git a/internal/scheduling/reservations/commitments/api_info_monitor.go b/internal/scheduling/reservations/commitments/api_info_monitor.go new file mode 100644 index 000000000..aaae937ef --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_info_monitor.go @@ -0,0 +1,53 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// InfoAPIMonitor provides metrics for the CR info API. +type InfoAPIMonitor struct { + requestCounter *prometheus.CounterVec + requestDuration *prometheus.HistogramVec +} + +// NewInfoAPIMonitor creates a new monitor with Prometheus metrics. +// Metrics are pre-initialized with zero values for common HTTP status codes +// to ensure they appear in Prometheus before the first request. +func NewInfoAPIMonitor() InfoAPIMonitor { + m := InfoAPIMonitor{ + requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_info_api_requests_total", + Help: "Total number of committed resource info API requests by HTTP status code", + }, []string{"status_code"}), + requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "cortex_committed_resource_info_api_request_duration_seconds", + Help: "Duration of committed resource info API requests in seconds by HTTP status code", + Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10}, + }, []string{"status_code"}), + } + + // Pre-initialize metrics with zero values for common HTTP status codes. + // This ensures metrics exist in Prometheus before the first request, + // preventing "metric missing" warnings in alerting rules. + for _, statusCode := range []string{"200", "405", "500", "503"} { + m.requestCounter.WithLabelValues(statusCode) + m.requestDuration.WithLabelValues(statusCode) + } + + return m +} + +// Describe implements prometheus.Collector. +func (m *InfoAPIMonitor) Describe(ch chan<- *prometheus.Desc) { + m.requestCounter.Describe(ch) + m.requestDuration.Describe(ch) +} + +// Collect implements prometheus.Collector. +func (m *InfoAPIMonitor) Collect(ch chan<- prometheus.Metric) { + m.requestCounter.Collect(ch) + m.requestDuration.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/api_info_test.go b/internal/scheduling/reservations/commitments/api_info_test.go index 828a255ff..256709957 100644 --- a/internal/scheduling/reservations/commitments/api_info_test.go +++ b/internal/scheduling/reservations/commitments/api_info_test.go @@ -28,11 +28,9 @@ func TestHandleInfo_KnowledgeNotReady(t *testing.T) { WithScheme(scheme). Build() - api := &HTTPAPI{ - client: k8sClient, - } + api := NewAPI(k8sClient) - req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) + req := httptest.NewRequest(http.MethodGet, "/commitments/v1/info", http.NoBody) w := httptest.NewRecorder() api.HandleInfo(w, req) @@ -62,12 +60,10 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) { WithScheme(scheme). Build() - api := &HTTPAPI{ - client: k8sClient, - } + api := NewAPI(k8sClient) // Use POST instead of GET - req := httptest.NewRequest(http.MethodPost, "/v1/info", http.NoBody) + req := httptest.NewRequest(http.MethodPost, "/commitments/v1/info", http.NoBody) w := httptest.NewRecorder() api.HandleInfo(w, req) @@ -80,6 +76,67 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) { } } +func TestHandleInfo_InvalidFlavorMemory(t *testing.T) { + // Test that a 500 Internal Server Error is returned when a flavor group has invalid data. + // + // A flavor with memoryMB=0 is invalid and should trigger an HTTP 500 error. + // Such data could occur from a bug in the flavor groups extractor. + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + // Create flavor group with memoryMB=0 (invalid data that could come from a buggy extractor) + features := []map[string]interface{}{ + { + "name": "invalid_group", + "flavors": []map[string]interface{}{ + {"name": "zero_memory_flavor", "vcpus": 4, "memoryMB": 0, "diskGB": 50}, + }, + "largestFlavor": map[string]interface{}{"name": "zero_memory_flavor", "vcpus": 4, "memoryMB": 0, "diskGB": 50}, + "smallestFlavor": map[string]interface{}{"name": "zero_memory_flavor", "vcpus": 4, "memoryMB": 0, "diskGB": 50}, + "ramCoreRatio": 4096, + }, + } + + raw, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatalf("failed to box features: %v", err) + } + + knowledge := &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{Name: "flavor-groups"}, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"}, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []v1.Condition{{Type: v1alpha1.KnowledgeConditionReady, Status: "True"}}, + Raw: raw, + LastContentChange: v1.Now(), + }, + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). + Build() + + api := NewAPI(k8sClient) + + req := httptest.NewRequest(http.MethodGet, "/commitments/v1/info", http.NoBody) + w := httptest.NewRecorder() + api.HandleInfo(w, req) + + resp := w.Result() + defer resp.Body.Close() + + // Should return 500 Internal Server Error when unit creation fails + if resp.StatusCode != http.StatusInternalServerError { + t.Errorf("expected status code %d (Internal Server Error), got %d", http.StatusInternalServerError, resp.StatusCode) + } +} + func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { // Test that HasCapacity == HandlesCommitments for all resources // Both should be true only for groups with fixed RAM/core ratio @@ -138,9 +195,9 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { WithObjects(knowledge). Build() - api := &HTTPAPI{client: k8sClient} + api := NewAPI(k8sClient) - req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) + req := httptest.NewRequest(http.MethodGet, "/commitments/v1/info", http.NoBody) w := httptest.NewRecorder() api.HandleInfo(w, req) @@ -161,38 +218,38 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { t.Fatalf("expected 2 resources, got %d", len(serviceInfo.Resources)) } - // Test fixed ratio group: ram_hana_fixed - fixedResource, ok := serviceInfo.Resources["ram_hana_fixed"] + // Test fixed ratio group: hw_version_hana_fixed_ram + fixedResource, ok := serviceInfo.Resources["hw_version_hana_fixed_ram"] if !ok { - t.Fatal("expected ram_hana_fixed resource to exist") + t.Fatal("expected hw_version_hana_fixed_ram resource to exist") } if !fixedResource.HasCapacity { - t.Error("ram_hana_fixed: expected HasCapacity=true") + t.Error("hw_version_hana_fixed_ram: expected HasCapacity=true") } if !fixedResource.HandlesCommitments { - t.Error("ram_hana_fixed: expected HandlesCommitments=true (fixed ratio group)") + t.Error("hw_version_hana_fixed_ram: expected HandlesCommitments=true (fixed ratio group)") } if fixedResource.HasCapacity != fixedResource.HandlesCommitments { - t.Errorf("ram_hana_fixed: HasCapacity (%v) should equal HandlesCommitments (%v)", + t.Errorf("hw_version_hana_fixed_ram: HasCapacity (%v) should equal HandlesCommitments (%v)", fixedResource.HasCapacity, fixedResource.HandlesCommitments) } - // Test variable ratio group: ram_v2_variable - variableResource, ok := serviceInfo.Resources["ram_v2_variable"] + // Test variable ratio group: hw_version_v2_variable_ram + variableResource, ok := serviceInfo.Resources["hw_version_v2_variable_ram"] if !ok { - t.Fatal("expected ram_v2_variable resource to exist") + t.Fatal("expected hw_version_v2_variable_ram resource to exist") } // Variable ratio groups don't accept commitments, and we only report capacity for groups // that accept commitments, so both HasCapacity and HandlesCommitments should be false if variableResource.HasCapacity { - t.Error("ram_v2_variable: expected HasCapacity=false (variable ratio groups don't report capacity)") + t.Error("hw_version_v2_variable_ram: expected HasCapacity=false (variable ratio groups don't report capacity)") } if variableResource.HandlesCommitments { - t.Error("ram_v2_variable: expected HandlesCommitments=false (variable ratio group)") + t.Error("hw_version_v2_variable_ram: expected HandlesCommitments=false (variable ratio group)") } // Verify HasCapacity == HandlesCommitments for consistency if variableResource.HasCapacity != variableResource.HandlesCommitments { - t.Errorf("ram_v2_variable: HasCapacity (%v) should equal HandlesCommitments (%v)", + t.Errorf("hw_version_v2_variable_ram: HasCapacity (%v) should equal HandlesCommitments (%v)", variableResource.HasCapacity, variableResource.HandlesCommitments) } } diff --git a/internal/scheduling/reservations/commitments/api_quota.go b/internal/scheduling/reservations/commitments/api_quota.go new file mode 100644 index 000000000..184728f39 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_quota.go @@ -0,0 +1,40 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "net/http" + + "github.com/google/uuid" +) + +// HandleQuota implements PUT /commitments/v1/projects/:project_id/quota from Limes LIQUID API. +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +// +// This is a no-op endpoint that accepts quota requests but doesn't store them. +// Cortex does not enforce quotas for committed resources - quota enforcement +// happens through commitment validation at change-commitments time. +// The endpoint exists for API compatibility with the LIQUID specification. +func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = uuid.New().String() + } + w.Header().Set("X-Request-ID", requestID) + + log := baseLog.WithValues("requestID", requestID, "endpoint", "quota") + + if r.Method != http.MethodPut { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + // No-op: Accept the quota request but don't store it + // Cortex handles capacity through commitments, not quotas + log.V(1).Info("received quota request (no-op)", "path", r.URL.Path) + + // Return 204 No Content as expected by the LIQUID API + w.WriteHeader(http.StatusNoContent) +} diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go index 19b7fb24c..194106d92 100644 --- a/internal/scheduling/reservations/commitments/api_report_capacity.go +++ b/internal/scheduling/reservations/commitments/api_report_capacity.go @@ -14,7 +14,7 @@ import ( "github.com/sapcc/go-api-declarations/liquid" ) -// handles POST /v1/commitments/report-capacity requests from Limes: +// handles POST /commitments/v1/report-capacity requests from Limes: // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // Reports available capacity across all flavor group resources. Note, unit is specified in the Info API response with multiple of the smallest memory resource unit within a flavor group. @@ -38,7 +38,7 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) } ctx := reservations.WithGlobalRequestID(r.Context(), "committed-resource-"+requestID) - logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/commitments/report-capacity") + logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/commitments/v1/report-capacity") // Only accept POST method if r.Method != http.MethodPost { diff --git a/internal/scheduling/reservations/commitments/api_report_capacity_test.go b/internal/scheduling/reservations/commitments/api_report_capacity_test.go index beb08c0b4..4dd642c0e 100644 --- a/internal/scheduling/reservations/commitments/api_report_capacity_test.go +++ b/internal/scheduling/reservations/commitments/api_report_capacity_test.go @@ -93,9 +93,9 @@ func TestHandleReportCapacity(t *testing.T) { if err != nil { t.Fatal(err) } - req = httptest.NewRequest(tt.method, "/v1/report-capacity", bytes.NewReader(bodyBytes)) + req = httptest.NewRequest(tt.method, "/commitments/v1/report-capacity", bytes.NewReader(bodyBytes)) } else { - req = httptest.NewRequest(tt.method, "/v1/report-capacity", http.NoBody) + req = httptest.NewRequest(tt.method, "/commitments/v1/report-capacity", http.NoBody) } req = req.WithContext(context.Background()) @@ -187,9 +187,9 @@ func TestCapacityCalculator(t *testing.T) { t.Fatalf("Expected 1 resource, got %d", len(report.Resources)) } - resource := report.Resources[liquid.ResourceName("ram_test-group")] + resource := report.Resources[liquid.ResourceName("hw_version_test-group_ram")] if resource == nil { - t.Fatal("Expected ram_test-group resource to exist") + t.Fatal("Expected hw_version_test-group_ram resource to exist") } // Should have empty perAZ map when no host details diff --git a/internal/scheduling/reservations/commitments/api_report_usage.go b/internal/scheduling/reservations/commitments/api_report_usage.go index 558758bab..f54917e1a 100644 --- a/internal/scheduling/reservations/commitments/api_report_usage.go +++ b/internal/scheduling/reservations/commitments/api_report_usage.go @@ -15,7 +15,7 @@ import ( "github.com/sapcc/go-api-declarations/liquid" ) -// HandleReportUsage implements POST /v1/commitments/projects/:project_id/report-usage from Limes LIQUID API. +// HandleReportUsage implements POST /commitments/v1/projects/:project_id/report-usage from Limes LIQUID API. // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/report_usage.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // @@ -51,7 +51,7 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) { } // Extract project UUID from URL path - // URL pattern: /v1/commitments/projects/:project_id/report-usage + // URL pattern: /commitments/v1/projects/:project_id/report-usage projectID, err := extractProjectIDFromPath(r.URL.Path) if err != nil { log.Error(err, "failed to extract project ID from path") @@ -99,15 +99,16 @@ func (api *HTTPAPI) recordUsageMetrics(statusCode int, startTime time.Time) { } // extractProjectIDFromPath extracts the project UUID from the URL path. -// Expected path format: /v1/commitments/projects/:project_id/report-usage +// Expected path format: /commitments/v1/projects/:project_id/ +// where is "report-usage" or "quota" func extractProjectIDFromPath(path string) (string, error) { - // Path: /v1/commitments/projects//report-usage + // Path: /commitments/v1/projects// parts := strings.Split(strings.Trim(path, "/"), "/") - // Expected: ["v1", "commitments", "projects", "", "report-usage"] + // Expected: ["commitments", "v1", "projects", "", ""] if len(parts) < 5 { return "", fmt.Errorf("path too short: %s", path) } - if parts[2] != "projects" || parts[4] != "report-usage" { + if parts[0] != "commitments" || parts[1] != "v1" || parts[2] != "projects" { return "", fmt.Errorf("unexpected path format: %s", path) } projectID := parts[3] diff --git a/internal/scheduling/reservations/commitments/api_report_usage_test.go b/internal/scheduling/reservations/commitments/api_report_usage_test.go index 2a81fa468..26a55332d 100644 --- a/internal/scheduling/reservations/commitments/api_report_usage_test.go +++ b/internal/scheduling/reservations/commitments/api_report_usage_test.go @@ -57,7 +57,7 @@ func TestReportUsageIntegration(t *testing.T) { Reservations: []*UsageTestReservation{}, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": {Usage: 0, VMs: []ExpectedVMUsage{}}, }, @@ -77,7 +77,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, // 4096 MB / 1024 MB = 4 units @@ -99,7 +99,7 @@ func TestReportUsageIntegration(t *testing.T) { Reservations: []*UsageTestReservation{}, // No commitments AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, @@ -127,7 +127,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 12, // 12 units total @@ -157,7 +157,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 12, @@ -187,7 +187,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 13, // 1 + 4 + 8 = 13 units @@ -216,7 +216,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 8, @@ -243,7 +243,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a", "az-b"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, @@ -275,7 +275,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, // 4096 MB / 1024 MB = 4 units @@ -285,7 +285,7 @@ func TestReportUsageIntegration(t *testing.T) { }, }, }, - "ram_gp_1": { + "hw_version_gp_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, // 2048 MB / 512 MB = 4 units @@ -332,7 +332,7 @@ func TestReportUsageIntegration(t *testing.T) { }, AllAZs: []string{"az-a"}, Expected: map[string]ExpectedResourceUsage{ - "ram_hana_1": { + "hw_version_hana_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { Usage: 4, @@ -576,7 +576,7 @@ func (env *UsageTestEnv) CallReportUsageAPI(projectID string, allAZs []string, u } // Build URL - url := env.HTTPServer.URL + "/v1/commitments/projects/" + projectID + "/report-usage" + url := env.HTTPServer.URL + "/commitments/v1/projects/" + projectID + "/report-usage" method := http.MethodPost if useGET { diff --git a/internal/scheduling/reservations/commitments/capacity.go b/internal/scheduling/reservations/commitments/capacity.go index 1dff26b71..7726b1dbe 100644 --- a/internal/scheduling/reservations/commitments/capacity.go +++ b/internal/scheduling/reservations/commitments/capacity.go @@ -53,8 +53,8 @@ func (c *CapacityCalculator) CalculateCapacity(ctx context.Context) (liquid.Serv continue } - // Resource name follows pattern: ram_ - resourceName := liquid.ResourceName(commitmentResourceNamePrefix + groupName) + // Resource name follows pattern: hw_version__ram + resourceName := liquid.ResourceName(ResourceNameFromFlavorGroup(groupName)) // Calculate per-AZ capacity and usage azCapacity, err := c.calculateAZCapacity(ctx, groupName, groupData) diff --git a/internal/scheduling/reservations/commitments/context.go b/internal/scheduling/reservations/commitments/context.go index 64257fcae..aabee0241 100644 --- a/internal/scheduling/reservations/commitments/context.go +++ b/internal/scheduling/reservations/commitments/context.go @@ -21,6 +21,12 @@ func WithNewGlobalRequestID(ctx context.Context) context.Context { return reservations.WithGlobalRequestID(ctx, "committed-resource-"+uuid.New().String()) } +// WithGlobalRequestID creates a new context with the specified global request ID. +// This is used to propagate existing request IDs (e.g., from the creator annotation). +func WithGlobalRequestID(ctx context.Context, greq string) context.Context { + return reservations.WithGlobalRequestID(ctx, greq) +} + // LoggerFromContext returns a logger with greq and req values from the context. // This creates a child logger with the request tracking values pre-attached, // so you don't need to repeat them in every log call. diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index e03958151..4a318d882 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -16,6 +16,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/predicate" @@ -49,20 +50,21 @@ type CommitmentReservationController struct { // move the current state of the cluster closer to the desired state. // Note: This controller only handles commitment reservations, as filtered by the predicate. func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - ctx = WithNewGlobalRequestID(ctx) - logger := LoggerFromContext(ctx).WithValues("component", "controller", "reservation", req.Name) - - // Fetch the reservation object. + // Fetch the reservation object first to check for creator request ID. var res v1alpha1.Reservation if err := r.Get(ctx, req.NamespacedName, &res); err != nil { // Ignore not-found errors, since they can't be fixed by an immediate requeue return ctrl.Result{}, client.IgnoreNotFound(err) } - // Extract creator request ID from annotation for end-to-end traceability + // Use creator request ID from annotation for end-to-end traceability if available, + // otherwise generate a new one for this reconcile loop. if creatorReq := res.Annotations[v1alpha1.AnnotationCreatorRequestID]; creatorReq != "" { - logger = logger.WithValues("creatorReq", creatorReq) + ctx = WithGlobalRequestID(ctx, creatorReq) + } else { + ctx = WithNewGlobalRequestID(ctx) } + logger := LoggerFromContext(ctx).WithValues("component", "controller", "reservation", req.Name) // filter for CR reservations resourceName := "" @@ -501,10 +503,22 @@ func (r *CommitmentReservationController) SetupWithManager(mgr ctrl.Manager, mcl })); err != nil { return err } - return multicluster.BuildController(mcl, mgr). - For(&v1alpha1.Reservation{}). - WithEventFilter(commitmentReservationPredicate). - Named("commitment-reservation"). + + // Use WatchesMulticluster to watch Reservations across all configured clusters + // (home + remotes). This is required because Reservation CRDs may be stored + // in remote clusters, not just the home cluster. Without this, the controller + // would only see reservations in the home cluster's cache. + bldr := multicluster.BuildController(mcl, mgr) + bldr, err := bldr.WatchesMulticluster( + &v1alpha1.Reservation{}, + &handler.EnqueueRequestForObject{}, + commitmentReservationPredicate, + ) + if err != nil { + return err + } + + return bldr.Named("commitment-reservation"). WithOptions(controller.Options{ // We want to process reservations one at a time to avoid overbooking. MaxConcurrentReconciles: 1, diff --git a/internal/scheduling/reservations/commitments/e2e_checks.go b/internal/scheduling/reservations/commitments/e2e_checks.go new file mode 100644 index 000000000..2292bcaa1 --- /dev/null +++ b/internal/scheduling/reservations/commitments/e2e_checks.go @@ -0,0 +1,77 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + + liquid "github.com/sapcc/go-api-declarations/liquid" + "github.com/sapcc/go-bits/must" +) + +const ( + // Default URL for the commitments API endpoint. + // This should match the service name in the helm chart. + defaultCommitmentsAPIURL = "http://cortex-nova-scheduler:8080" +) + +// E2EChecksConfig holds the configuration for CR e2e checks. +type E2EChecksConfig struct { + // Base URL for the commitments API. If empty, defaults to defaultCommitmentsAPIURL. + BaseURL string `json:"baseURL"` +} + +// CheckCommitmentsInfoEndpoint sends a GET request to the /commitments/v1/info endpoint +// and verifies that it returns HTTP 200 with a valid ServiceInfo response. +func CheckCommitmentsInfoEndpoint(ctx context.Context, config E2EChecksConfig) { + baseURL := config.BaseURL + if baseURL == "" { + baseURL = defaultCommitmentsAPIURL + } + apiURL := baseURL + "/commitments/v1/info" + slog.Info("checking commitments info endpoint", "apiURL", apiURL) + + httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodGet, apiURL, http.NoBody)) + httpReq.Header.Set("Accept", "application/json") + + //nolint:bodyclose // Body is closed in the deferred function below. + resp := must.Return(http.DefaultClient.Do(httpReq)) + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + bodyBytes := must.Return(io.ReadAll(resp.Body)) + slog.Error("commitments info API returned non-200 status code", + "statusCode", resp.StatusCode, + "responseBody", string(bodyBytes), + ) + panic(fmt.Sprintf("commitments info API returned status %d, expected 200", resp.StatusCode)) + } + + var serviceInfo liquid.ServiceInfo + if err := json.NewDecoder(resp.Body).Decode(&serviceInfo); err != nil { + panic(fmt.Sprintf("failed to decode ServiceInfo response: %v", err)) + } + + // Basic validation of the response + if serviceInfo.Version < 0 { + slog.Warn("commitments info returned version -1, knowledge may not be ready yet") + } + + slog.Info("commitments info endpoint check passed", + "version", serviceInfo.Version, + "resourceCount", len(serviceInfo.Resources), + ) +} + +// RunCommitmentsE2EChecks runs all e2e checks for the commitments API. +func RunCommitmentsE2EChecks(ctx context.Context, config E2EChecksConfig) { + slog.Info("running commitments e2e checks") + CheckCommitmentsInfoEndpoint(ctx, config) + slog.Info("all commitments e2e checks passed") +} diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index 40cfa08ef..6db72d21a 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -17,6 +17,20 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +// ApplyResult contains the result of applying a commitment state. +type ApplyResult struct { + // Created is the number of reservations created + Created int + // Deleted is the number of reservations deleted + Deleted int + // Repaired is the number of reservations repaired (metadata sync or recreated due to wrong config) + Repaired int + // TouchedReservations are reservations that were created or updated + TouchedReservations []v1alpha1.Reservation + // RemovedReservations are reservations that were deleted + RemovedReservations []v1alpha1.Reservation +} + // ReservationManager handles CRUD operations for Reservation CRDs. type ReservationManager struct { client.Client @@ -42,14 +56,16 @@ func NewReservationManager(k8sClient client.Client) *ReservationManager { // - Deleting unused/excess slots when capacity decreases // - Syncing reservation metadata for all remaining slots // -// Returns touched reservations (created/updated) and removed reservations for caller tracking. +// Returns ApplyResult containing touched/removed reservations and counts for metrics. func (m *ReservationManager) ApplyCommitmentState( ctx context.Context, log logr.Logger, desiredState *CommitmentState, flavorGroups map[string]compute.FlavorGroupFeature, creator string, -) (touchedReservations, removedReservations []v1alpha1.Reservation, err error) { +) (*ApplyResult, error) { + + result := &ApplyResult{} log = log.WithName("ReservationManager") @@ -58,7 +74,7 @@ func (m *ReservationManager) ApplyCommitmentState( if err := m.List(ctx, &allReservations, client.MatchingLabels{ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, }); err != nil { - return nil, nil, fmt.Errorf("failed to list reservations: %w", err) + return nil, fmt.Errorf("failed to list reservations: %w", err) } // Filter by name prefix to find reservations for this commitment @@ -74,7 +90,7 @@ func (m *ReservationManager) ApplyCommitmentState( flavorGroup, exists := flavorGroups[desiredState.FlavorGroupName] if !exists { - return nil, nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) + return nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) } deltaMemoryBytes := desiredState.TotalMemoryBytes for _, res := range existing { @@ -90,7 +106,6 @@ func (m *ReservationManager) ApplyCommitmentState( // Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project) // They will be recreated with correct metadata in subsequent phases. var validReservations []v1alpha1.Reservation - var repairedCount int for _, res := range existing { if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName || res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID { @@ -101,13 +116,13 @@ func (m *ReservationManager) ApplyCommitmentState( "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, "expectedProjectID", desiredState.ProjectID, "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID) - repairedCount++ - removedReservations = append(removedReservations, res) + result.Repaired++ + result.RemovedReservations = append(result.RemovedReservations, res) memValue := res.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() if err := m.Delete(ctx, &res); err != nil { - return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) + return result, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) } } else { validReservations = append(validReservations, res) @@ -139,33 +154,33 @@ func (m *ReservationManager) ApplyCommitmentState( reservationToDelete = &existing[len(existing)-1] existing = existing[:len(existing)-1] // remove from existing list } - removedReservations = append(removedReservations, *reservationToDelete) + result.RemovedReservations = append(result.RemovedReservations, *reservationToDelete) + result.Deleted++ memValue := reservationToDelete.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() if err := m.Delete(ctx, reservationToDelete); err != nil { - return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) + return result, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) } } // Phase 5 (CREATE): Create new reservations (capacity increased) - var createdCount int for deltaMemoryBytes > 0 { // Need to create new reservation slots, always prefer largest flavor within the group // TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator) - touchedReservations = append(touchedReservations, *reservation) + result.TouchedReservations = append(result.TouchedReservations, *reservation) memValue := reservation.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes -= memValue.Value() - createdCount++ + result.Created++ if err := m.Create(ctx, reservation); err != nil { if apierrors.IsAlreadyExists(err) { - return touchedReservations, removedReservations, fmt.Errorf( + return result, fmt.Errorf( "reservation %s already exists (collision detected): %w", reservation.Name, err) } - return touchedReservations, removedReservations, fmt.Errorf( + return result, fmt.Errorf( "failed to create reservation slot %d: %w", nextSlotIndex, err) } @@ -177,24 +192,25 @@ func (m *ReservationManager) ApplyCommitmentState( for i := range existing { updated, err := m.syncReservationMetadata(ctx, log, &existing[i], desiredState) if err != nil { - return touchedReservations, removedReservations, err + return result, err } if updated != nil { - touchedReservations = append(touchedReservations, *updated) + result.TouchedReservations = append(result.TouchedReservations, *updated) + result.Repaired++ } } // Only log if there were actual changes - if hasChanges || createdCount > 0 || len(removedReservations) > 0 || repairedCount > 0 { + if hasChanges || result.Created > 0 || len(result.RemovedReservations) > 0 || result.Repaired > 0 { log.Info("commitment state sync completed", "commitmentUUID", desiredState.CommitmentUUID, - "created", createdCount, - "deleted", len(removedReservations), - "repaired", repairedCount, - "total", len(existing)+createdCount) + "created", result.Created, + "deleted", result.Deleted, + "repaired", result.Repaired, + "total", len(existing)+result.Created) } - return touchedReservations, removedReservations, nil + return result, nil } // syncReservationMetadata updates reservation metadata if it differs from desired state. diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go index 2f20f33cc..7733cb6c2 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager_test.go +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -42,7 +42,7 @@ func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { TotalMemoryBytes: 3 * 8192 * 1024 * 1024, } - touched, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -54,18 +54,18 @@ func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(removed) != 0 { - t.Errorf("expected 0 removed reservations, got %d", len(removed)) + if len(applyResult.RemovedReservations) != 0 { + t.Errorf("expected 0 applyResult.RemovedReservations reservations, got %d", len(applyResult.RemovedReservations)) } // Should create reservations to fulfill the commitment - if len(touched) == 0 { + if len(applyResult.TouchedReservations) == 0 { t.Fatal("expected at least one reservation to be created") } // Verify created reservations sum to desired state totalMemory := int64(0) - for _, res := range touched { + for _, res := range applyResult.TouchedReservations { memQuantity := res.Spec.Resources[hv1.ResourceMemory] totalMemory += memQuantity.Value() } @@ -142,7 +142,7 @@ func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { TotalMemoryBytes: 8 * 1024 * 1024 * 1024, } - _, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -158,7 +158,7 @@ func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { // This is expected behavior based on the slot sizing algorithm // Should remove excess reservations - if len(removed) == 0 { + if len(applyResult.RemovedReservations) == 0 { t.Fatal("expected reservations to be removed") } @@ -481,7 +481,7 @@ func TestApplyCommitmentState_DeletionPriority(t *testing.T) { TotalMemoryBytes: tt.desiredMemoryBytes, } - _, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -493,12 +493,12 @@ func TestApplyCommitmentState_DeletionPriority(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(removed) != tt.expectedRemovedCount { - t.Fatalf("expected %d removed reservations, got %d", tt.expectedRemovedCount, len(removed)) + if len(applyResult.RemovedReservations) != tt.expectedRemovedCount { + t.Fatalf("expected %d removed reservations, got %d", tt.expectedRemovedCount, len(applyResult.RemovedReservations)) } if tt.validateRemoved != nil { - tt.validateRemoved(t, removed) + tt.validateRemoved(t, applyResult.RemovedReservations) } // Get remaining reservations @@ -560,7 +560,7 @@ func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { TotalMemoryBytes: 0, } - touched, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -572,13 +572,13 @@ func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(touched) != 0 { - t.Errorf("expected 0 new reservations, got %d", len(touched)) + if len(applyResult.TouchedReservations) != 0 { + t.Errorf("expected 0 new reservations, got %d", len(applyResult.TouchedReservations)) } // Should remove all reservations - if len(removed) != 1 { - t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + if len(applyResult.RemovedReservations) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(applyResult.RemovedReservations)) } // Verify no reservations remain @@ -638,7 +638,7 @@ func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { TotalMemoryBytes: 8 * 1024 * 1024 * 1024, } - touched, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -651,18 +651,18 @@ func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { } // Should remove wrong reservation and create new one - if len(removed) != 1 { - t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + if len(applyResult.RemovedReservations) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(applyResult.RemovedReservations)) } - if len(touched) != 1 { - t.Fatalf("expected 1 new reservation, got %d", len(touched)) + if len(applyResult.TouchedReservations) != 1 { + t.Fatalf("expected 1 new reservation, got %d", len(applyResult.TouchedReservations)) } // Verify new reservation has correct flavor group - if touched[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" { + if applyResult.TouchedReservations[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" { t.Errorf("expected flavor group test-group, got %s", - touched[0].Spec.CommittedResourceReservation.ResourceGroup) + applyResult.TouchedReservations[0].Spec.CommittedResourceReservation.ResourceGroup) } } @@ -686,7 +686,7 @@ func TestApplyCommitmentState_UnknownFlavorGroup(t *testing.T) { TotalMemoryBytes: 8 * 1024 * 1024 * 1024, } - _, _, err := manager.ApplyCommitmentState( + _, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go index c8e75ff83..4ed288d16 100644 --- a/internal/scheduling/reservations/commitments/state.go +++ b/internal/scheduling/reservations/commitments/state.go @@ -18,14 +18,30 @@ import ( // commitmentUUIDPattern validates commitment UUID format. var commitmentUUIDPattern = regexp.MustCompile(`^[a-zA-Z0-9-]{6,40}$`) -// Limes LIQUID resource naming convention: ram_ -const commitmentResourceNamePrefix = "ram_" +// Limes LIQUID resource naming convention: hw_version__ram +const ( + resourceNamePrefix = "hw_version_" + resourceNameSuffix = "_ram" +) + +// ResourceNameFromFlavorGroup creates a LIQUID resource name from a flavor group name. +// Format: hw_version__ram +func ResourceNameFromFlavorGroup(flavorGroup string) string { + return resourceNamePrefix + flavorGroup + resourceNameSuffix +} func getFlavorGroupNameFromResource(resourceName string) (string, error) { - if !strings.HasPrefix(resourceName, commitmentResourceNamePrefix) { + if !strings.HasPrefix(resourceName, resourceNamePrefix) || !strings.HasSuffix(resourceName, resourceNameSuffix) { return "", fmt.Errorf("invalid resource name: %s", resourceName) } - return strings.TrimPrefix(resourceName, commitmentResourceNamePrefix), nil + // Remove prefix and suffix + name := strings.TrimPrefix(resourceName, resourceNamePrefix) + name = strings.TrimSuffix(name, resourceNameSuffix) + // Validate that the extracted group name is not empty + if name == "" { + return "", fmt.Errorf("invalid resource name: %s (empty group name)", resourceName) + } + return name, nil } // CommitmentState represents desired or current commitment resource allocation. diff --git a/internal/scheduling/reservations/commitments/state_test.go b/internal/scheduling/reservations/commitments/state_test.go index 7060300db..717069708 100644 --- a/internal/scheduling/reservations/commitments/state_test.go +++ b/internal/scheduling/reservations/commitments/state_test.go @@ -36,7 +36,7 @@ func TestFromCommitment_CalculatesMemoryCorrectly(t *testing.T) { commitment := Commitment{ UUID: "test-uuid", ProjectID: "project-1", - ResourceName: "ram_test-group", + ResourceName: "hw_version_test-group_ram", Amount: 5, // 5 multiples of smallest flavor } @@ -236,7 +236,8 @@ func TestFromReservations_NonCommittedResourceType(t *testing.T) { } func TestGetFlavorGroupNameFromResource_Valid(t *testing.T) { - name, err := getFlavorGroupNameFromResource("ram_hana_medium_v2") + // Test valid resource names with underscores in flavor group + name, err := getFlavorGroupNameFromResource("hw_version_hana_medium_v2_ram") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -246,8 +247,29 @@ func TestGetFlavorGroupNameFromResource_Valid(t *testing.T) { } func TestGetFlavorGroupNameFromResource_Invalid(t *testing.T) { - _, err := getFlavorGroupNameFromResource("invalid_resource") - if err == nil { - t.Fatal("expected error for invalid resource name, got nil") + invalidCases := []string{ + "ram_2101", // old format + "invalid", // completely wrong + "hw_version__ram", // empty group name + "hw_version_2101", // missing suffix + } + for _, input := range invalidCases { + if _, err := getFlavorGroupNameFromResource(input); err == nil { + t.Errorf("expected error for %q, got nil", input) + } + } +} + +func TestResourceNameRoundTrip(t *testing.T) { + // Test that ResourceNameFromFlavorGroup and getFlavorGroupNameFromResource are inverses + for _, groupName := range []string{"2101", "hana_1", "hana_medium_v2"} { + resourceName := ResourceNameFromFlavorGroup(groupName) + recovered, err := getFlavorGroupNameFromResource(resourceName) + if err != nil { + t.Fatalf("round-trip failed for %q: %v", groupName, err) + } + if recovered != groupName { + t.Errorf("round-trip mismatch: %q -> %q -> %q", groupName, resourceName, recovered) + } } } diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index b7db11351..cb3d2fdb4 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -6,7 +6,6 @@ package commitments import ( "context" "fmt" - "strings" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -42,12 +41,15 @@ type Syncer struct { CommitmentsClient // Kubernetes client for CRD operations client.Client + // Monitor for metrics + monitor *SyncerMonitor } -func NewSyncer(k8sClient client.Client) *Syncer { +func NewSyncer(k8sClient client.Client, monitor *SyncerMonitor) *Syncer { return &Syncer{ CommitmentsClient: NewCommitmentsClient(), Client: k8sClient, + monitor: monitor, } } @@ -58,7 +60,16 @@ func (s *Syncer) Init(ctx context.Context, config SyncerConfig) error { return nil } -func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavorGroups map[string]compute.FlavorGroupFeature) ([]*CommitmentState, error) { +// getCommitmentStatesResult holds both processed and skipped commitment UUIDs +type getCommitmentStatesResult struct { + // states are the commitments that were successfully processed + states []*CommitmentState + // skippedUUIDs are commitment UUIDs that were skipped (e.g., due to unit mismatch) + // but should NOT have their existing CRDs deleted + skippedUUIDs map[string]bool +} + +func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavorGroups map[string]compute.FlavorGroupFeature) (*getCommitmentStatesResult, error) { allProjects, err := s.ListProjects(ctx) if err != nil { return nil, err @@ -69,24 +80,34 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo } // Filter for compute commitments with RAM flavor group resources - var commitmentStates []*CommitmentState + result := &getCommitmentStatesResult{ + states: []*CommitmentState{}, + skippedUUIDs: make(map[string]bool), + } for id, commitment := range commitments { + // Record each commitment seen from Limes + if s.monitor != nil { + s.monitor.RecordCommitmentSeen() + } + if commitment.ServiceType != "compute" { log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) - continue - } - if !strings.HasPrefix(commitment.ResourceName, commitmentResourceNamePrefix) { - log.Info("skipping non-RAM-flavor-group commitment", "id", id, "resourceName", commitment.ResourceName) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonNonCompute) + } continue } - // Extract flavor group name from resource name + // Extract flavor group name from resource name (validates format: hw_version__ram) flavorGroupName, err := getFlavorGroupNameFromResource(commitment.ResourceName) if err != nil { log.Info("skipping commitment with invalid resource name", "id", id, "resourceName", commitment.ResourceName, "error", err) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonInvalidResource) + } continue } @@ -96,6 +117,32 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo log.Info("skipping commitment with unknown flavor group", "id", id, "flavorGroup", flavorGroupName) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonUnknownFlavorGroup) + } + continue + } + + // Validate unit matches between Limes commitment and Cortex flavor group + // Expected format: " MiB" e.g. "131072 MiB" for 128 GiB + expectedUnit := fmt.Sprintf("%d MiB", flavorGroup.SmallestFlavor.MemoryMB) + if commitment.Unit != "" && commitment.Unit != expectedUnit { + // Unit mismatch: Limes has not yet updated this commitment to the new unit. + // Skip this commitment - trust what Cortex already has stored in CRDs. + // On the next sync cycle after Limes updates, this will be processed. + log.V(0).Info("WARNING: skipping commitment due to unit mismatch - Limes unit differs from Cortex flavor group, waiting for Limes to update", + "commitmentUUID", commitment.UUID, + "flavorGroup", flavorGroupName, + "limesUnit", commitment.Unit, + "expectedUnit", expectedUnit, + "smallestFlavorMemoryMB", flavorGroup.SmallestFlavor.MemoryMB) + if s.monitor != nil { + s.monitor.RecordUnitMismatch(flavorGroupName) + } + // Track skipped commitment so its existing CRDs won't be deleted + if commitment.UUID != "" { + result.skippedUUIDs[commitment.UUID] = true + } continue } @@ -103,6 +150,9 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo if commitment.UUID == "" { log.Info("skipping commitment with empty UUID", "id", id) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonEmptyUUID) + } continue } @@ -121,10 +171,15 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo "amount", commitment.Amount, "totalMemoryBytes", state.TotalMemoryBytes) - commitmentStates = append(commitmentStates, state) + result.states = append(result.states, state) + + // Record successfully processed commitment + if s.monitor != nil { + s.monitor.RecordCommitmentProcessed() + } } - return commitmentStates, nil + return result, nil } // SyncReservations fetches commitments from Limes and synchronizes Reservation CRDs. @@ -138,6 +193,11 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { logger.Info("starting commitment sync with sync interval", "interval", DefaultSyncerConfig().SyncInterval) + // Record sync run + if s.monitor != nil { + s.monitor.RecordSyncRun() + } + // Check if flavor group knowledge is ready knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client} knowledgeCRD, err := knowledge.Get(ctx) @@ -158,7 +218,7 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { } // Get all commitments as states - commitmentStates, err := s.getCommitmentStates(ctx, logger, flavorGroups) + commitmentResult, err := s.getCommitmentStates(ctx, logger, flavorGroups) if err != nil { logger.Error(err, "failed to get compute commitments") return err @@ -168,20 +228,25 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { manager := NewReservationManager(s.Client) // Apply each commitment state using the manager - for _, state := range commitmentStates { + var totalCreated, totalDeleted, totalRepaired int + for _, state := range commitmentResult.states { logger.Info("applying commitment state", "commitmentUUID", state.CommitmentUUID, "projectID", state.ProjectID, "flavorGroup", state.FlavorGroupName, "totalMemoryBytes", state.TotalMemoryBytes) - _, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue) + applyResult, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue) if err != nil { logger.Error(err, "failed to apply commitment state", "commitmentUUID", state.CommitmentUUID) // Continue with other commitments even if one fails continue } + + totalCreated += applyResult.Created + totalDeleted += applyResult.Deleted + totalRepaired += applyResult.Repaired } // Delete reservations that are no longer in commitments @@ -194,11 +259,15 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { return err } - // Build set of commitment UUIDs we should have + // Build set of commitment UUIDs we should have (processed + skipped) activeCommitments := make(map[string]bool) - for _, state := range commitmentStates { + for _, state := range commitmentResult.states { activeCommitments[state.CommitmentUUID] = true } + // Also include skipped commitments - don't delete their CRDs + for uuid := range commitmentResult.skippedUUIDs { + activeCommitments[uuid] = true + } // Delete reservations for commitments that no longer exist for _, existing := range existingReservations.Items { @@ -218,9 +287,28 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { logger.Info("deleted reservation for expired commitment", "name", existing.Name, "commitmentUUID", commitmentUUID) + totalDeleted++ + } + } + + // Record reservation change metrics + if s.monitor != nil { + if totalCreated > 0 { + s.monitor.RecordReservationsCreated(totalCreated) + } + if totalDeleted > 0 { + s.monitor.RecordReservationsDeleted(totalDeleted) + } + if totalRepaired > 0 { + s.monitor.RecordReservationsRepaired(totalRepaired) } } - logger.Info("synced reservations", "commitmentCount", len(commitmentStates)) + logger.Info("synced reservations", + "processedCount", len(commitmentResult.states), + "skippedCount", len(commitmentResult.skippedUUIDs), + "created", totalCreated, + "deleted", totalDeleted, + "repaired", totalRepaired) return nil } diff --git a/internal/scheduling/reservations/commitments/syncer_monitor.go b/internal/scheduling/reservations/commitments/syncer_monitor.go new file mode 100644 index 000000000..efa25b0a4 --- /dev/null +++ b/internal/scheduling/reservations/commitments/syncer_monitor.go @@ -0,0 +1,154 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// Skip reason labels for commitment processing +const ( + SkipReasonUnitMismatch = "unit_mismatch" + SkipReasonUnknownFlavorGroup = "unknown_flavor_group" + SkipReasonInvalidResource = "invalid_resource_name" + SkipReasonEmptyUUID = "empty_uuid" + SkipReasonNonCompute = "non_compute" +) + +// SyncerMonitor provides metrics for the commitment syncer. +type SyncerMonitor struct { + // Sync lifecycle + syncRuns prometheus.Counter + syncErrors prometheus.Counter + + // Commitment processing + commitmentsTotal prometheus.Counter // all commitments seen from Limes + commitmentsProcessed prometheus.Counter // successfully processed + commitmentsSkipped *prometheus.CounterVec // skipped with reason label + + // Reservation changes + reservationsCreated prometheus.Counter + reservationsDeleted prometheus.Counter + reservationsRepaired prometheus.Counter +} + +// NewSyncerMonitor creates a new monitor with Prometheus metrics. +func NewSyncerMonitor() *SyncerMonitor { + m := &SyncerMonitor{ + syncRuns: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_runs_total", + Help: "Total number of commitment syncer runs", + }), + syncErrors: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_errors_total", + Help: "Total number of commitment syncer errors", + }), + commitmentsTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_commitments_total", + Help: "Total number of commitments seen from Limes", + }), + commitmentsProcessed: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_commitments_processed_total", + Help: "Total number of commitments successfully processed", + }), + commitmentsSkipped: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_commitments_skipped_total", + Help: "Total number of commitments skipped during sync", + }, []string{"reason"}), + reservationsCreated: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_reservations_created_total", + Help: "Total number of reservations created during sync", + }), + reservationsDeleted: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_reservations_deleted_total", + Help: "Total number of reservations deleted during sync", + }), + reservationsRepaired: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_reservations_repaired_total", + Help: "Total number of reservations repaired during sync (wrong metadata)", + }), + } + + // Pre-initialize skip reason labels + for _, reason := range []string{ + SkipReasonUnitMismatch, + SkipReasonUnknownFlavorGroup, + SkipReasonInvalidResource, + SkipReasonEmptyUUID, + SkipReasonNonCompute, + } { + m.commitmentsSkipped.WithLabelValues(reason) + } + + return m +} + +// RecordSyncRun records a syncer run. +func (m *SyncerMonitor) RecordSyncRun() { + m.syncRuns.Inc() +} + +// RecordSyncError records a syncer error. +func (m *SyncerMonitor) RecordSyncError() { + m.syncErrors.Inc() +} + +// RecordCommitmentSeen records a commitment seen from Limes. +func (m *SyncerMonitor) RecordCommitmentSeen() { + m.commitmentsTotal.Inc() +} + +// RecordCommitmentProcessed records a commitment successfully processed. +func (m *SyncerMonitor) RecordCommitmentProcessed() { + m.commitmentsProcessed.Inc() +} + +// RecordCommitmentSkipped records a commitment skipped with a reason. +func (m *SyncerMonitor) RecordCommitmentSkipped(reason string) { + m.commitmentsSkipped.WithLabelValues(reason).Inc() +} + +// RecordUnitMismatch records a unit mismatch skip (convenience method). +func (m *SyncerMonitor) RecordUnitMismatch(_ string) { + m.commitmentsSkipped.WithLabelValues(SkipReasonUnitMismatch).Inc() +} + +// RecordReservationsCreated records reservations created. +func (m *SyncerMonitor) RecordReservationsCreated(count int) { + m.reservationsCreated.Add(float64(count)) +} + +// RecordReservationsDeleted records reservations deleted. +func (m *SyncerMonitor) RecordReservationsDeleted(count int) { + m.reservationsDeleted.Add(float64(count)) +} + +// RecordReservationsRepaired records reservations repaired. +func (m *SyncerMonitor) RecordReservationsRepaired(count int) { + m.reservationsRepaired.Add(float64(count)) +} + +// Describe implements prometheus.Collector. +func (m *SyncerMonitor) Describe(ch chan<- *prometheus.Desc) { + m.syncRuns.Describe(ch) + m.syncErrors.Describe(ch) + m.commitmentsTotal.Describe(ch) + m.commitmentsProcessed.Describe(ch) + m.commitmentsSkipped.Describe(ch) + m.reservationsCreated.Describe(ch) + m.reservationsDeleted.Describe(ch) + m.reservationsRepaired.Describe(ch) +} + +// Collect implements prometheus.Collector. +func (m *SyncerMonitor) Collect(ch chan<- prometheus.Metric) { + m.syncRuns.Collect(ch) + m.syncErrors.Collect(ch) + m.commitmentsTotal.Collect(ch) + m.commitmentsProcessed.Collect(ch) + m.commitmentsSkipped.Collect(ch) + m.reservationsCreated.Collect(ch) + m.reservationsDeleted.Collect(ch) + m.reservationsRepaired.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index 75512299a..b09115453 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -5,6 +5,7 @@ package commitments import ( "context" + "sort" "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -32,9 +33,17 @@ type FlavorGroupData struct { func createFlavorGroupKnowledge(t *testing.T, groups map[string]FlavorGroupData) *v1alpha1.Knowledge { t.Helper() - // Build flavor group features + // Sort group names for deterministic iteration + sortedGroupNames := make([]string, 0, len(groups)) + for groupName := range groups { + sortedGroupNames = append(sortedGroupNames, groupName) + } + sort.Strings(sortedGroupNames) + + // Build flavor group features with sorted iteration features := make([]compute.FlavorGroupFeature, 0, len(groups)) - for groupName, data := range groups { + for _, groupName := range sortedGroupNames { + data := groups[groupName] features = append(features, compute.FlavorGroupFeature{ Name: groupName, Flavors: []compute.FlavorInGroup{ @@ -221,7 +230,7 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "ram_test_group_v1", + ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", Amount: 2, // 2 multiples of smallest flavor (2 * 1024MB = 2048MB total) Unit: "", @@ -357,7 +366,7 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "ram_new_group_v1", + ResourceName: "hw_version_new_group_v1_ram", AvailabilityZone: "az1", Amount: 1, Unit: "", @@ -429,6 +438,171 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { } } +func TestSyncer_SyncReservations_UnitMismatch(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + + // Create flavor group knowledge CRD with smallest flavor of 1024MB + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor-large", + LargestFlavorVCPUs: 8, + LargestFlavorMemoryMB: 8192, + SmallestFlavorName: "test-flavor-small", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). + Build() + + // Create mock commitment with a unit that doesn't match Cortex's understanding + // Limes says "2048 MiB" but Cortex's smallest flavor is 1024 MB + mockCommitments := []Commitment{ + { + ID: 1, + UUID: "unit-mismatch-test-uuid", + ServiceType: "compute", + ResourceName: "hw_version_test_group_v1_ram", + AvailabilityZone: "az1", + Amount: 2, + Unit: "2048 MiB", // Mismatched unit - should be "1024 MiB" + ProjectID: "test-project", + DomainID: "test-domain", + }, + } + + // Create monitor to capture the mismatch metric + monitor := NewSyncerMonitor() + + mockClient := &mockCommitmentsClient{ + listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) { + result := make(map[string]Commitment) + for _, c := range mockCommitments { + result[c.UUID] = c + } + return result, nil + }, + listProjectsFunc: func(ctx context.Context) ([]Project, error) { + return []Project{ + {ID: "test-project", DomainID: "test-domain", Name: "Test Project"}, + }, nil + }, + } + + syncer := &Syncer{ + CommitmentsClient: mockClient, + Client: k8sClient, + monitor: monitor, + } + + err := syncer.SyncReservations(context.Background()) + if err != nil { + t.Errorf("SyncReservations() error = %v", err) + return + } + + // Verify that NO reservations were created due to unit mismatch + // The commitment is skipped and Cortex trusts existing CRDs + var reservations v1alpha1.ReservationList + err = k8sClient.List(context.Background(), &reservations) + if err != nil { + t.Errorf("Failed to list reservations: %v", err) + return + } + + // Should have 0 reservations - commitment is skipped due to unit mismatch + // Cortex waits for Limes to update the unit before processing + if len(reservations.Items) != 0 { + t.Errorf("Expected 0 reservations (commitment skipped due to unit mismatch), got %d", len(reservations.Items)) + } +} + +func TestSyncer_SyncReservations_UnitMatch(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + + // Create flavor group knowledge CRD with smallest flavor of 1024MB + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor-large", + LargestFlavorVCPUs: 8, + LargestFlavorMemoryMB: 8192, + SmallestFlavorName: "test-flavor-small", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). + Build() + + // Create mock commitment with correct unit matching Cortex's smallest flavor + mockCommitments := []Commitment{ + { + ID: 1, + UUID: "unit-match-test-uuid", + ServiceType: "compute", + ResourceName: "hw_version_test_group_v1_ram", + AvailabilityZone: "az1", + Amount: 2, + Unit: "1024 MiB", // Correct unit matching smallest flavor + ProjectID: "test-project", + DomainID: "test-domain", + }, + } + + monitor := NewSyncerMonitor() + + mockClient := &mockCommitmentsClient{ + listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) { + result := make(map[string]Commitment) + for _, c := range mockCommitments { + result[c.UUID] = c + } + return result, nil + }, + listProjectsFunc: func(ctx context.Context) ([]Project, error) { + return []Project{ + {ID: "test-project", DomainID: "test-domain", Name: "Test Project"}, + }, nil + }, + } + + syncer := &Syncer{ + CommitmentsClient: mockClient, + Client: k8sClient, + monitor: monitor, + } + + err := syncer.SyncReservations(context.Background()) + if err != nil { + t.Errorf("SyncReservations() error = %v", err) + return + } + + // Verify that reservations were created + var reservations v1alpha1.ReservationList + err = k8sClient.List(context.Background(), &reservations) + if err != nil { + t.Errorf("Failed to list reservations: %v", err) + return + } + + if len(reservations.Items) != 2 { + t.Errorf("Expected 2 reservations, got %d", len(reservations.Items)) + } +} + func TestSyncer_SyncReservations_EmptyUUID(t *testing.T) { scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { @@ -458,7 +632,7 @@ func TestSyncer_SyncReservations_EmptyUUID(t *testing.T) { ID: 1, UUID: "", // Empty UUID ServiceType: "compute", - ResourceName: "ram_test_group_v1", + ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", Amount: 1, Unit: "", diff --git a/internal/scheduling/reservations/commitments/usage.go b/internal/scheduling/reservations/commitments/usage.go index b71d4e752..536440209 100644 --- a/internal/scheduling/reservations/commitments/usage.go +++ b/internal/scheduling/reservations/commitments/usage.go @@ -396,7 +396,7 @@ func (c *UsageCalculator) buildUsageResponse( if !FlavorGroupAcceptsCommitments(&groupData) { continue } - resourceName := liquid.ResourceName(commitmentResourceNamePrefix + flavorGroupName) + resourceName := liquid.ResourceName(ResourceNameFromFlavorGroup(flavorGroupName)) perAZ := make(map[liquid.AvailabilityZone]*liquid.AZResourceUsageReport) diff --git a/internal/scheduling/reservations/commitments/usage_test.go b/internal/scheduling/reservations/commitments/usage_test.go index 371667913..04b32fffa 100644 --- a/internal/scheduling/reservations/commitments/usage_test.go +++ b/internal/scheduling/reservations/commitments/usage_test.go @@ -53,7 +53,7 @@ func TestUsageCalculator_CalculateUsage(t *testing.T) { reservations: []*v1alpha1.Reservation{}, allAZs: []liquid.AvailabilityZone{"az-a"}, expectedUsage: map[string]uint64{ - "ram_hana_1": 0, + "hw_version_hana_1_ram": 0, }, }, { @@ -75,7 +75,7 @@ func TestUsageCalculator_CalculateUsage(t *testing.T) { }, allAZs: []liquid.AvailabilityZone{"az-a"}, expectedUsage: map[string]uint64{ - "ram_hana_1": 4, // 4096 MB / 1024 MB = 4 units + "hw_version_hana_1_ram": 4, // 4096 MB / 1024 MB = 4 units }, }, { @@ -92,7 +92,7 @@ func TestUsageCalculator_CalculateUsage(t *testing.T) { reservations: []*v1alpha1.Reservation{}, // No commitments allAZs: []liquid.AvailabilityZone{"az-a"}, expectedUsage: map[string]uint64{ - "ram_hana_1": 4, + "hw_version_hana_1_ram": 4, }, }, } @@ -584,9 +584,9 @@ func TestUsageCalculator_ExpiredAndFutureCommitments(t *testing.T) { } // Find the VM in subresources and check its commitment assignment - res, ok := report.Resources["ram_hana_1"] + res, ok := report.Resources["hw_version_hana_1_ram"] if !ok { - t.Fatal("Resource ram_hana_1 not found") + t.Fatal("Resource hw_version_hana_1_ram not found") } var foundCommitment any diff --git a/internal/scheduling/reservations/failover/controller.go b/internal/scheduling/reservations/failover/controller.go index fbb4f9c01..e31a4aa1c 100644 --- a/internal/scheduling/reservations/failover/controller.go +++ b/internal/scheduling/reservations/failover/controller.go @@ -684,7 +684,7 @@ func (c *FailoverReservationController) calculateVMsMissingFailover( needed := requiredCount - currentCount totalReservationsNeeded += needed - logger.V(2).Info("VM needs more failover reservations", + logger.V(1).Info("VM needs more failover reservations", "vmUUID", vm.UUID, "flavorName", vm.FlavorName, "currentCount", currentCount, @@ -766,7 +766,7 @@ func (c *FailoverReservationController) patchReservationStatus(ctx context.Conte // SetupWithManager sets up the watch-based reconciler with the Manager. // This handles per-reservation reconciliation triggered by CRD changes. func (c *FailoverReservationController) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error { - c.Recorder = mgr.GetEventRecorder("failover-reservation-controller") + c.Recorder = mcl.GetEventRecorder("failover-reservation-controller") bldr := multicluster.BuildController(mcl, mgr) bldr, err := bldr.WatchesMulticluster( diff --git a/internal/scheduling/reservations/failover/reservation_scheduling.go b/internal/scheduling/reservations/failover/reservation_scheduling.go index 873e5dbca..6859d2cc4 100644 --- a/internal/scheduling/reservations/failover/reservation_scheduling.go +++ b/internal/scheduling/reservations/failover/reservation_scheduling.go @@ -79,7 +79,7 @@ func (c *FailoverReservationController) queryHypervisorsFromScheduler(ctx contex // Note: We pass all hypervisors (from all AZs) in EligibleHosts. The scheduler pipeline's // filter_correct_az filter will exclude hosts that are not in the VM's availability zone. scheduleReq := reservations.ScheduleReservationRequest{ - InstanceUUID: "failover-" + vm.UUID, + InstanceUUID: vm.UUID, ProjectID: vm.ProjectID, FlavorName: vm.FlavorName, FlavorExtraSpecs: flavorExtraSpecs, @@ -89,6 +89,7 @@ func (c *FailoverReservationController) queryHypervisorsFromScheduler(ctx contex IgnoreHosts: ignoreHypervisors, Pipeline: pipeline, AvailabilityZone: vm.AvailabilityZone, + SchedulerHints: map[string]any{"_nova_check_type": string(api.ReserveForFailoverIntent)}, } logger.V(1).Info("scheduling failover reservation", @@ -218,7 +219,7 @@ func (c *FailoverReservationController) validateVMViaSchedulerEvacuation( IgnoreHosts: []string{vm.CurrentHypervisor}, Pipeline: PipelineAcknowledgeFailoverReservation, AvailabilityZone: vm.AvailabilityZone, - SchedulerHints: map[string]any{"_nova_check_type": "evacuate"}, + SchedulerHints: map[string]any{"_nova_check_type": string(api.EvacuateIntent)}, } logger.V(1).Info("validating VM via scheduler evacuation", diff --git a/internal/scheduling/reservations/scheduler_client.go b/internal/scheduling/reservations/scheduler_client.go index e89628b34..db7829e23 100644 --- a/internal/scheduling/reservations/scheduler_client.go +++ b/internal/scheduling/reservations/scheduler_client.go @@ -118,6 +118,7 @@ func (c *SchedulerClient) ScheduleReservation(ctx context.Context, req ScheduleR Context: api.NovaRequestContext{ RequestID: RequestIDFromContext(ctx), GlobalRequestID: globalReqID, + ProjectID: req.ProjectID, }, Spec: api.NovaObject[api.NovaSpec]{ Data: api.NovaSpec{ diff --git a/pkg/multicluster/client_test.go b/pkg/multicluster/client_test.go index 64b0ae94c..4ea3900cb 100644 --- a/pkg/multicluster/client_test.go +++ b/pkg/multicluster/client_test.go @@ -13,6 +13,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/tools/events" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" @@ -74,8 +76,9 @@ func (f *fakeCache) getIndexFieldCalls() []indexFieldCall { // fakeCluster implements cluster.Cluster interface for testing. type fakeCluster struct { cluster.Cluster - fakeClient client.Client - fakeCache *fakeCache + fakeClient client.Client + fakeCache *fakeCache + fakeRecorder events.EventRecorder } func (f *fakeCluster) GetClient() client.Client { @@ -86,6 +89,17 @@ func (f *fakeCluster) GetCache() cache.Cache { return f.fakeCache } +func (f *fakeCluster) GetEventRecorder(_ string) events.EventRecorder { + if f.fakeRecorder != nil { + return f.fakeRecorder + } + return &fakeEventRecorder{} +} + +func (f *fakeCluster) GetEventRecorderFor(_ string) record.EventRecorder { + return record.NewFakeRecorder(100) +} + func newFakeCluster(scheme *runtime.Scheme, objs ...client.Object) *fakeCluster { return &fakeCluster{ fakeClient: fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build(), diff --git a/pkg/multicluster/recorder.go b/pkg/multicluster/recorder.go new file mode 100644 index 000000000..8c7c78d3d --- /dev/null +++ b/pkg/multicluster/recorder.go @@ -0,0 +1,83 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package multicluster + +import ( + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/events" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cluster" +) + +// MultiClusterRecorder implements events.EventRecorder and routes events to the +// correct cluster based on the GVK of the "regarding" object. It uses the same +// routing logic as the multicluster Client's write path. +type MultiClusterRecorder struct { + client *Client + homeRecorder events.EventRecorder + recorders map[cluster.Cluster]events.EventRecorder +} + +// GetEventRecorder creates a multi-cluster-aware EventRecorder. It pre-creates +// a per-cluster recorder for the home cluster and every remote cluster currently +// registered in the client. The name parameter is passed through to each +// cluster's GetEventRecorder method (it becomes the reportingController in the +// Kubernetes Event). +func (c *Client) GetEventRecorder(name string) events.EventRecorder { + homeRecorder := c.HomeCluster.GetEventRecorder(name) + + recorders := make(map[cluster.Cluster]events.EventRecorder) + recorders[c.HomeCluster] = homeRecorder + + c.remoteClustersMu.RLock() + defer c.remoteClustersMu.RUnlock() + + for _, remotes := range c.remoteClusters { + for _, r := range remotes { + if _, exists := recorders[r.cluster]; !exists { + recorders[r.cluster] = r.cluster.GetEventRecorder(name) + } + } + } + + return &MultiClusterRecorder{ + client: c, + homeRecorder: homeRecorder, + recorders: recorders, + } +} + +// Eventf routes the event to the cluster that owns the "regarding" object. +// Falls back to the home cluster recorder if routing fails. +func (r *MultiClusterRecorder) Eventf(regarding, related runtime.Object, eventtype, reason, action, note string, args ...any) { + recorder := r.recorderFor(regarding) + recorder.Eventf(regarding, related, eventtype, reason, action, note, args...) +} + +// recorderFor resolves which per-cluster recorder to use for the given object. +func (r *MultiClusterRecorder) recorderFor(obj runtime.Object) events.EventRecorder { + if obj == nil { + return r.homeRecorder + } + + gvk, err := r.client.GVKFromHomeScheme(obj) + if err != nil { + ctrl.Log.V(1).Info("multi-cluster recorder: failed to resolve GVK, using home recorder", "error", err) + return r.homeRecorder + } + + cl, err := r.client.clusterForWrite(gvk, obj) + if err != nil { + ctrl.Log.V(1).Info("multi-cluster recorder: no cluster matched, using home recorder", "gvk", gvk, "error", err) + return r.homeRecorder + } + + recorder, ok := r.recorders[cl] + if !ok { + ctrl.Log.V(1).Info("multi-cluster recorder: no pre-created recorder for cluster, using home recorder", "gvk", gvk) + return r.homeRecorder + } + + return recorder +} diff --git a/pkg/multicluster/recorder_test.go b/pkg/multicluster/recorder_test.go new file mode 100644 index 000000000..90bdd71dc --- /dev/null +++ b/pkg/multicluster/recorder_test.go @@ -0,0 +1,259 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package multicluster + +import ( + "fmt" + "sync" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +// fakeEventRecorder captures Eventf calls for assertions. +type fakeEventRecorder struct { + mu sync.Mutex + calls []eventfCall +} + +type eventfCall struct { + regarding runtime.Object + eventtype string + reason string + action string + note string +} + +func (f *fakeEventRecorder) Eventf(regarding, _ runtime.Object, eventtype, reason, action, note string, args ...any) { + f.mu.Lock() + defer f.mu.Unlock() + f.calls = append(f.calls, eventfCall{ + regarding: regarding, + eventtype: eventtype, + reason: reason, + action: action, + note: fmt.Sprintf(note, args...), + }) +} + +func (f *fakeEventRecorder) getCalls() []eventfCall { + f.mu.Lock() + defer f.mu.Unlock() + out := make([]eventfCall, len(f.calls)) + copy(out, f.calls) + return out +} + +func TestMultiClusterRecorder_HomeGVK(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + homeCluster := &fakeCluster{ + fakeClient: nil, + fakeCache: &fakeCache{}, + fakeRecorder: homeRecorder, + } + + historyGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "History"} + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + homeGVKs: map[schema.GroupVersionKind]bool{historyGVK: true}, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + history := &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{Name: "nova-uuid-1"}, + } + recorder.Eventf(history, nil, corev1.EventTypeNormal, "SchedulingSucceeded", "Scheduled", "selected host: %s", "compute-1") + + calls := homeRecorder.getCalls() + if len(calls) != 1 { + t.Fatalf("expected 1 call, got %d", len(calls)) + } + if calls[0].eventtype != corev1.EventTypeNormal { + t.Errorf("expected event type %q, got %q", corev1.EventTypeNormal, calls[0].eventtype) + } + if calls[0].reason != "SchedulingSucceeded" { + t.Errorf("expected reason %q, got %q", "SchedulingSucceeded", calls[0].reason) + } + if calls[0].note != "selected host: compute-1" { + t.Errorf("expected note %q, got %q", "selected host: compute-1", calls[0].note) + } +} + +func TestMultiClusterRecorder_RemoteGVK(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + remoteRecorder := &fakeEventRecorder{} + + homeCluster := &fakeCluster{ + fakeClient: nil, + fakeCache: &fakeCache{}, + fakeRecorder: homeRecorder, + } + remote := &fakeCluster{ + fakeClient: nil, + fakeCache: &fakeCache{}, + fakeRecorder: remoteRecorder, + } + + reservationGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "Reservation"} + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + ResourceRouters: map[schema.GroupVersionKind]ResourceRouter{ + reservationGVK: ReservationsResourceRouter{}, + }, + remoteClusters: map[schema.GroupVersionKind][]remoteCluster{ + reservationGVK: {{cluster: remote, labels: map[string]string{"availabilityZone": "az-a"}}}, + }, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + res := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-1"}, + Spec: v1alpha1.ReservationSpec{AvailabilityZone: "az-a"}, + } + recorder.Eventf(res, nil, corev1.EventTypeNormal, "ValidationPassed", "Validated", "reservation validated") + + // Event should go to the remote recorder, not home. + homeCalls := homeRecorder.getCalls() + if len(homeCalls) != 0 { + t.Errorf("expected 0 home calls, got %d", len(homeCalls)) + } + remoteCalls := remoteRecorder.getCalls() + if len(remoteCalls) != 1 { + t.Fatalf("expected 1 remote call, got %d", len(remoteCalls)) + } + if remoteCalls[0].action != "Validated" { + t.Errorf("expected action %q, got %q", "Validated", remoteCalls[0].action) + } +} + +func TestMultiClusterRecorder_MultipleRemotes(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + remoteARecorder := &fakeEventRecorder{} + remoteBRecorder := &fakeEventRecorder{} + + homeCluster := &fakeCluster{fakeRecorder: homeRecorder, fakeCache: &fakeCache{}} + remoteA := &fakeCluster{fakeRecorder: remoteARecorder, fakeCache: &fakeCache{}} + remoteB := &fakeCluster{fakeRecorder: remoteBRecorder, fakeCache: &fakeCache{}} + + reservationGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "Reservation"} + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + ResourceRouters: map[schema.GroupVersionKind]ResourceRouter{ + reservationGVK: ReservationsResourceRouter{}, + }, + remoteClusters: map[schema.GroupVersionKind][]remoteCluster{ + reservationGVK: { + {cluster: remoteA, labels: map[string]string{"availabilityZone": "az-a"}}, + {cluster: remoteB, labels: map[string]string{"availabilityZone": "az-b"}}, + }, + }, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + // Event for az-b should go to remoteB. + res := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-b"}, + Spec: v1alpha1.ReservationSpec{AvailabilityZone: "az-b"}, + } + recorder.Eventf(res, nil, corev1.EventTypeWarning, "SchedulingFailed", "FailedScheduling", "no host found") + + remoteCallsA := remoteARecorder.getCalls() + if len(remoteCallsA) != 0 { + t.Errorf("expected 0 calls to remote-a, got %d", len(remoteCallsA)) + } + remoteCallsB := remoteBRecorder.getCalls() + if len(remoteCallsB) != 1 { + t.Fatalf("expected 1 call to remote-b, got %d", len(remoteCallsB)) + } + if remoteCallsB[0].reason != "SchedulingFailed" { + t.Errorf("expected reason %q, got %q", "SchedulingFailed", remoteCallsB[0].reason) + } +} + +func TestMultiClusterRecorder_FallbackOnUnknownGVK(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + homeCluster := &fakeCluster{fakeRecorder: homeRecorder, fakeCache: &fakeCache{}} + + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + homeGVKs: map[schema.GroupVersionKind]bool{}, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + // unknownType is not in the scheme — should fall back to home recorder. + obj := &unknownType{ObjectMeta: metav1.ObjectMeta{Name: "unknown-1"}} + recorder.Eventf(obj, nil, corev1.EventTypeNormal, "Test", "Test", "test message") + + if len(homeRecorder.getCalls()) != 1 { + t.Fatalf("expected 1 home call on fallback, got %d", len(homeRecorder.getCalls())) + } +} + +func TestMultiClusterRecorder_FallbackOnNilRegarding(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + homeCluster := &fakeCluster{fakeRecorder: homeRecorder, fakeCache: &fakeCache{}} + + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + } + + recorder := mcl.GetEventRecorder("test-recorder") + recorder.Eventf(nil, nil, corev1.EventTypeNormal, "Test", "Test", "nil regarding") + + if len(homeRecorder.getCalls()) != 1 { + t.Fatalf("expected 1 home call for nil regarding, got %d", len(homeRecorder.getCalls())) + } +} + +func TestMultiClusterRecorder_ConcurrentEventf(t *testing.T) { + scheme := newTestScheme(t) + homeRecorder := &fakeEventRecorder{} + homeCluster := &fakeCluster{fakeRecorder: homeRecorder, fakeCache: &fakeCache{}} + + historyGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "History"} + mcl := &Client{ + HomeCluster: homeCluster, + HomeScheme: scheme, + homeGVKs: map[schema.GroupVersionKind]bool{historyGVK: true}, + } + + recorder := mcl.GetEventRecorder("test-recorder") + + const goroutines = 20 + var wg sync.WaitGroup + wg.Add(goroutines) + for i := range goroutines { + go func(n int) { + defer wg.Done() + history := &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("history-%d", n)}, + } + recorder.Eventf(history, nil, corev1.EventTypeNormal, "Test", "Test", "event %d", n) + }(i) + } + wg.Wait() + + homeCalls := homeRecorder.getCalls() + if len(homeCalls) != goroutines { + t.Errorf("expected %d calls, got %d", goroutines, len(homeCalls)) + } +}