Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
74b78a6
Set SchedulerHints in failover request and removed incorrect prefix (…
umswmayj Mar 25, 2026
39d1957
Bump cortex chart appVersions to sha-74b78a6a [skip ci]
github-actions[bot] Mar 25, 2026
72e6a96
Add multicluster support for the event recorder (#619)
SoWieMarkus Mar 25, 2026
c8d2bac
Bump cortex chart appVersions to sha-72e6a960 [skip ci]
github-actions[bot] Mar 25, 2026
7632137
Update commitments with support for non-standard units (#624)
mblos Mar 25, 2026
b19510a
Bump cortex chart appVersions to sha-76321376 [skip ci]
github-actions[bot] Mar 25, 2026
aa037ae
Commitment resources use new naming schema (#629)
mblos Mar 25, 2026
16f76e8
Add "Last Scheduled" column to history crd (#630)
SoWieMarkus Mar 25, 2026
8f8fa6a
Bump cortex chart appVersions to sha-16f76e87 [skip ci]
github-actions[bot] Mar 25, 2026
8ed86c7
Disable history for kvm-new-failover-reservation reservation (#628)
umswmayj Mar 25, 2026
1b0dddb
Bump cortex chart appVersions to sha-8ed86c79 [skip ci]
github-actions[bot] Mar 25, 2026
5bbde50
Committed resources syncer with more alerts (#631)
mblos Mar 25, 2026
60c1407
Bump cortex chart appVersions to sha-5bbde504 [skip ci]
github-actions[bot] Mar 25, 2026
c5930f2
Add attr to wide view of failover reservations (#627)
umswmayj Mar 25, 2026
364c4a1
Bump cortex chart appVersions to sha-c5930f28 [skip ci]
github-actions[bot] Mar 25, 2026
b350110
Commitments base URL updated (#632)
mblos Mar 25, 2026
079c9ca
Bump cortex chart appVersions to sha-b3501102 [skip ci]
github-actions[bot] Mar 25, 2026
f7d4b9a
Added commitments quota endpoint (no-op) (#633)
mblos Mar 25, 2026
73c5d14
Bump cortex chart appVersions to sha-f7d4b9a6 [skip ci]
github-actions[bot] Mar 25, 2026
18591fa
Bump core to 0.0.32 and bundles to 0.0.45
mblos Mar 25, 2026
12a55fd
Bump cortex chart appVersions to sha-18591fa9 [skip ci]
github-actions[bot] Mar 25, 2026
3721350
Change log level of VM missing failover log
umswmayj Mar 25, 2026
e4a0431
Bump cortex chart appVersions to sha-3721350c [skip ci]
github-actions[bot] Mar 25, 2026
30995c2
fix test flakiness (#636)
mblos Mar 25, 2026
ecd97fb
Bump cortex chart appVersions to sha-30995c2e [skip ci]
github-actions[bot] Mar 25, 2026
f213f21
Fix history client tests (#635)
SoWieMarkus Mar 25, 2026
4d30ee7
Bump cortex chart appVersions to sha-f213f217 [skip ci]
github-actions[bot] Mar 25, 2026
d1eafec
Support concatenated aggregates in filter requested destination (#637)
umswmayj Mar 25, 2026
6270094
Committed resource watch fixed (#638)
mblos Mar 25, 2026
97f76d5
Bump cortex chart appVersions to sha-62700947 [skip ci]
github-actions[bot] Mar 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Tiltfile
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,13 @@ if 'nova' in ACTIVE_DEPLOYMENTS:
trigger_mode=TRIGGER_MODE_MANUAL,
auto_init=False,
)
# Tilt resource that runs the commitments end-to-end checks by exec'ing
# `/manager e2e-commitments` inside the cortex-nova scheduling controller
# deployment. It is configured with a manual trigger and auto_init=False.
local_resource(
'Commitments E2E Tests',
'/bin/sh -c "kubectl exec deploy/cortex-nova-scheduling-controller-manager -- /manager e2e-commitments"',
labels=['Cortex-Nova'],
trigger_mode=TRIGGER_MODE_MANUAL,
auto_init=False,
)

if 'manila' in ACTIVE_DEPLOYMENTS:
print("Activating Cortex Manila bundle")
Expand Down
5 changes: 5 additions & 0 deletions api/external/nova/messages.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ const (
EvacuateIntent v1alpha1.SchedulingIntent = "evacuate"
// CreateIntent indicates that the request is intended for creating a new VM.
CreateIntent v1alpha1.SchedulingIntent = "create"
// ReserveForFailoverIntent indicates that the request is for failover reservation scheduling.
ReserveForFailoverIntent v1alpha1.SchedulingIntent = "reserve_for_failover"
)

// GetIntent analyzes the request spec and determines the intent of the scheduling request.
Expand All @@ -160,6 +162,9 @@ func (req ExternalSchedulerRequest) GetIntent() (v1alpha1.SchedulingIntent, erro
// See: https://github.com/sapcc/nova/blob/c88393/nova/compute/api.py#L5770
case "evacuate":
return EvacuateIntent, nil
// Used by cortex failover reservation controller
case "reserve_for_failover":
return ReserveForFailoverIntent, nil
default:
return CreateIntent, nil
}
Expand Down
1 change: 1 addition & 0 deletions api/v1alpha1/history_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ type HistoryStatus struct {
// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone"
// +kubebuilder:printcolumn:name="Target Host",type="string",JSONPath=".status.current.targetHost"
// +kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].reason"
// +kubebuilder:printcolumn:name="Last Scheduled",type="date",JSONPath=".status.current.timestamp"
// +kubebuilder:printcolumn:name="Created",type="date",JSONPath=".metadata.creationTimestamp"

// The history is a CRD that provides a record of past scheduling decisions for a given resource (e.g., a nova instance).
Expand Down
10 changes: 10 additions & 0 deletions api/v1alpha1/reservation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,16 @@ type ReservationStatus struct {
// +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".metadata.labels['reservations\\.cortex\\.cloud/type']"
// +kubebuilder:printcolumn:name="Host",type="string",JSONPath=".status.host"
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status"
// +kubebuilder:printcolumn:name="ResourceGroup",type="string",JSONPath=".spec.committedResourceReservation.resourceGroup"
// +kubebuilder:printcolumn:name="Project",type="string",JSONPath=".spec.committedResourceReservation.projectID"
// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone"
// +kubebuilder:printcolumn:name="StartTime",type="string",JSONPath=".spec.startTime",priority=1
// +kubebuilder:printcolumn:name="EndTime",type="string",JSONPath=".spec.endTime"
// +kubebuilder:printcolumn:name="Resources",type="string",JSONPath=".spec.resources",priority=1
// +kubebuilder:printcolumn:name="LastChanged",type="date",JSONPath=".status.failoverReservation.lastChanged",priority=1
// +kubebuilder:printcolumn:name="AcknowledgedAt",type="date",JSONPath=".status.failoverReservation.acknowledgedAt",priority=1
// +kubebuilder:printcolumn:name="CR Allocations",type="string",JSONPath=".status.committedResourceReservation.allocations",priority=1
// +kubebuilder:printcolumn:name="HA Allocations",type="string",JSONPath=".status.failoverReservation.allocations",priority=1

// Reservation is the Schema for the reservations API
type Reservation struct {
Expand Down
8 changes: 7 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ func main() {
manilaChecksConfig := conf.GetConfigOrDie[manila.ChecksConfig]()
manila.RunChecks(ctx, client, manilaChecksConfig)
return
case "e2e-commitments":
commitmentsChecksConfig := conf.GetConfigOrDie[commitments.E2EChecksConfig]()
commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig)
return
}
}

Expand Down Expand Up @@ -665,7 +669,9 @@ func main() {

if slices.Contains(mainConfig.EnabledTasks, "commitments-sync-task") {
setupLog.Info("starting commitments syncer")
syncer := commitments.NewSyncer(multiclusterClient)
syncerMonitor := commitments.NewSyncerMonitor()
must.Succeed(metrics.Registry.Register(syncerMonitor))
syncer := commitments.NewSyncer(multiclusterClient, syncerMonitor)
syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]()
syncerDefaults := commitments.DefaultSyncerConfig()
if syncerConfig.SyncInterval == 0 {
Expand Down
2 changes: 1 addition & 1 deletion docs/guides/multicluster/readme.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Cortex Multi-Cluster Testing

> [!NOTE]
> If you want to skip the reading part, there's `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you.
> If you want to skip the reading part, there are `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you. If you want to test the multi-cluster setup, you can run the `schedule.sh` script, which will create a scheduling request and show you how it gets processed across the clusters.

Cortex provides support for multi-cluster deployments, where a "home" cluster hosts the cortex pods and one or more "remote" clusters are used to persist CRDs. A typical use case for this would be to offload the etcd storage for Cortex CRDs to a remote cluster, reducing the resource usage on the home cluster. Similarly, another use case is to have multiple remote clusters that maintain all the compute workloads and expose resources that Cortex needs to access, such as the `Hypervisor` resource.

Expand Down
8 changes: 6 additions & 2 deletions docs/guides/multicluster/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,20 @@ global:
gvks:
- kvm.cloud.sap/v1/Hypervisor
- kvm.cloud.sap/v1/HypervisorList
- cortex.cloud/v1alpha1/History
- cortex.cloud/v1alpha1/HistoryList
labels:
az: cortex-remote-az-a
availabilityZone: cortex-remote-az-a
caCert: |
$(cat /tmp/root-ca-remote-az-a.pem | sed 's/^/ /')
- host: https://host.docker.internal:8445
gvks:
- kvm.cloud.sap/v1/Hypervisor
- kvm.cloud.sap/v1/HypervisorList
- cortex.cloud/v1alpha1/History
- cortex.cloud/v1alpha1/HistoryList
labels:
az: cortex-remote-az-b
availabilityZone: cortex-remote-az-b
caCert: |
$(cat /tmp/root-ca-remote-az-b.pem | sed 's/^/ /')
EOF
Expand Down
253 changes: 253 additions & 0 deletions docs/guides/multicluster/schedule.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
#!/bin/bash
#
# Interactive walkthrough for the multi-cluster guide. It applies the test
# pipeline, sends a Nova external scheduler request to the cortex API, lists
# History CRDs and events in every cluster, and finally describes the History
# created for the request.
#
# Optional environment overrides (defaults match the guide):
#   API_URL        Nova external scheduler endpoint of the cortex API.
#   INSTANCE_UUID  Instance UUID sent in the request; the History is looked up
#                  as "nova-$INSTANCE_UUID".

set -e

API_URL="${API_URL:-http://localhost:8001/scheduler/nova/external}"
INSTANCE_UUID="${INSTANCE_UUID:-cortex-test-instance-001}"
HISTORY_NAME="nova-$INSTANCE_UUID"

# --- Step 1: Apply the test pipeline -----------------------------------------

# Resolve the manifest next to this script so the walkthrough works from any
# working directory, not only from the repository root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "=== Step 1: Apply test pipeline ==="
echo ""
echo "The test pipeline is a minimal filter-weigher pipeline with:"
echo " - createHistory: true (so a History CRD is created for each decision)"
echo " - filter_correct_az (filters hosts not matching the requested AZ)"
echo " - no weighers (hosts are returned in their original order)"
echo ""

kubectl --context kind-cortex-home apply -f "$SCRIPT_DIR/test-pipeline.yaml"

# Pause so the reader can inspect the output before the request is sent.
echo ""
echo "Press enter to send a scheduling request..."
read -r

# --- Step 2: Send scheduling request -----------------------------------------

echo "=== Step 2: Send scheduling request ==="
echo ""
echo "Sending a Nova external scheduler request to the cortex API."
echo ""
echo " Instance UUID: $INSTANCE_UUID"
echo " Availability Zone: cortex-remote-az-b"
echo " Pipeline: multicluster-test"
echo " Candidate hosts: hypervisor-{1,2}-az-{a,b} (4 hosts across 2 AZs)"
echo ""
echo "The pipeline's filter_correct_az step should filter out the az-a hosts,"
echo "leaving only hypervisor-1-az-b and hypervisor-2-az-b."
echo ""

# POST the request body from the heredoc below. -w appends the HTTP status code
# on its own final line so it can be split from the body afterwards.
#
# The call sits in an `if` condition because, under `set -e`, a failing command
# substitution in a plain assignment would abort the script on the spot. With
# `curl -s` alone that abort would be completely silent, so -S is added to let
# curl report transport errors (e.g. connection refused) on stderr.
if ! RESPONSE=$(curl -sS -w "\n%{http_code}" -X POST "$API_URL" \
  -H "Content-Type: application/json" \
  -d @- <<EOF
{
  "spec": {
    "nova_object.name": "RequestSpec",
    "nova_object.namespace": "nova",
    "nova_object.version": "1.14",
    "nova_object.changes": [],
    "nova_object.data": {
      "project_id": "test-project",
      "user_id": "test-user",
      "instance_uuid": "$INSTANCE_UUID",
      "availability_zone": "cortex-remote-az-b",
      "num_instances": 1,
      "is_bfv": false,
      "scheduler_hints": {},
      "ignore_hosts": null,
      "force_hosts": null,
      "force_nodes": null,
      "image": {
        "nova_object.name": "ImageMeta",
        "nova_object.namespace": "nova",
        "nova_object.version": "1.8",
        "nova_object.changes": [],
        "nova_object.data": {
          "id": "00000000-0000-0000-0000-000000000001",
          "name": "test-image",
          "status": "active",
          "checksum": "0000000000000000",
          "owner": "test-project",
          "size": 1024,
          "container_format": "bare",
          "disk_format": "raw",
          "created_at": "2025-01-01T00:00:00Z",
          "updated_at": "2025-01-01T00:00:00Z",
          "min_ram": 0,
          "min_disk": 0,
          "properties": {
            "nova_object.name": "ImageMetaProps",
            "nova_object.namespace": "nova",
            "nova_object.version": "1.36",
            "nova_object.changes": [],
            "nova_object.data": {}
          }
        }
      },
      "flavor": {
        "nova_object.name": "Flavor",
        "nova_object.namespace": "nova",
        "nova_object.version": "1.2",
        "nova_object.changes": [],
        "nova_object.data": {
          "id": 1,
          "name": "m1.small",
          "memory_mb": 2048,
          "vcpus": 1,
          "root_gb": 20,
          "ephemeral_gb": 0,
          "flavorid": "1",
          "swap": 0,
          "rxtx_factor": 1.0,
          "vcpu_weight": 0,
          "disabled": false,
          "is_public": true,
          "extra_specs": {
            "capabilities:hypervisor_type": "qemu"
          },
          "description": null,
          "created_at": "2025-01-01T00:00:00Z",
          "updated_at": null
        }
      },
      "request_level_params": {
        "nova_object.name": "RequestLevelParams",
        "nova_object.namespace": "nova",
        "nova_object.version": "1.1",
        "nova_object.changes": [],
        "nova_object.data": {
          "root_required": [],
          "root_forbidden": [],
          "same_subtree": []
        }
      },
      "network_metadata": {
        "nova_object.name": "NetworkMetadata",
        "nova_object.namespace": "nova",
        "nova_object.version": "1.0",
        "nova_object.changes": [],
        "nova_object.data": {
          "physnets": [],
          "tunneled": false
        }
      },
      "limits": {
        "nova_object.name": "SchedulerLimits",
        "nova_object.namespace": "nova",
        "nova_object.version": "1.0",
        "nova_object.changes": [],
        "nova_object.data": {}
      },
      "requested_networks": {
        "objects": null
      },
      "security_groups": {
        "objects": null
      }
    }
  },
  "context": {
    "user": "test-user",
    "project_id": "test-project",
    "system_scope": null,
    "project": "test-project",
    "domain": null,
    "user_domain": "Default",
    "project_domain": "Default",
    "is_admin": false,
    "read_only": false,
    "show_deleted": false,
    "request_id": "req-test-001",
    "global_request_id": null,
    "resource_uuid": null,
    "roles": [],
    "user_identity": "test-user test-project - Default -",
    "is_admin_project": false,
    "read_deleted": "no",
    "remote_address": "127.0.0.1",
    "timestamp": "2025-01-01T00:00:00.000000",
    "quota_class": null,
    "user_name": "test-user",
    "project_name": "test-project"
  },
  "hosts": [
    {"host": "hypervisor-1-az-a", "hypervisor_hostname": "hypervisor-1-az-a"},
    {"host": "hypervisor-2-az-a", "hypervisor_hostname": "hypervisor-2-az-a"},
    {"host": "hypervisor-1-az-b", "hypervisor_hostname": "hypervisor-1-az-b"},
    {"host": "hypervisor-2-az-b", "hypervisor_hostname": "hypervisor-2-az-b"}
  ],
  "weights": {
    "hypervisor-1-az-a": 1.0,
    "hypervisor-2-az-a": 2.0,
    "hypervisor-1-az-b": 3.0,
    "hypervisor-2-az-b": 4.0
  },
  "pipeline": "multicluster-test"
}
EOF
); then
  echo ""
  echo "ERROR: Could not send the scheduling request to $API_URL."
  echo "Make sure the cortex API is reachable at that address, then check the controller logs:"
  echo " kubectl --context kind-cortex-home logs deploy/cortex-nova-scheduling-controller-manager"
  exit 1
fi

# The curl call appended the status code as the final line of RESPONSE
# (via -w "\n%{http_code}"); everything before that line is the body.
HTTP_CODE=$(tail -n 1 <<<"$RESPONSE")
BODY=$(sed '$d' <<<"$RESPONSE")

echo "Response (HTTP $HTTP_CODE):"
# Pretty-print when the body is valid JSON and python3 is available;
# otherwise fall back to printing the raw body.
if ! python3 -m json.tool <<<"$BODY" 2>/dev/null; then
  echo "$BODY"
fi

# Anything other than HTTP 200 ends the walkthrough here.
case "$HTTP_CODE" in
  200)
    ;;
  *)
    echo ""
    echo "ERROR: Scheduling request failed. Check the controller logs:"
    echo " kubectl --context kind-cortex-home logs deploy/cortex-nova-scheduling-controller-manager"
    exit 1
    ;;
esac

echo ""
echo "Press enter to check History CRDs and events across all clusters..."
read -r

# --- Step 3: Check History and Events ----------------------------------------

# Runs the given command with stderr suppressed and prints its stdout, or
# " (none)" when the command fails or prints nothing. Checking the exit status
# alone is not enough: for an empty list kubectl exits 0 and writes its
# "No resources found" notice to stderr, which would leave the section blank.
print_or_none() {
  local output
  if output=$("$@" 2>/dev/null) && [ -n "$output" ]; then
    echo "$output"
  else
    echo " (none)"
  fi
}

echo "=== Step 3: Check History CRDs and Events ==="
echo ""
echo "The pipeline has createHistory: true, so a History CRD named '$HISTORY_NAME'"
echo "should have been created. An event should also have been recorded on it."
echo "Based on the multicluster config, this should be on the remote cluster cortex-remote-az-b."
echo ""

# Short pause before querying the clusters for the History and its event.
sleep 1

for CLUSTER in kind-cortex-home kind-cortex-remote-az-a kind-cortex-remote-az-b; do
  echo "--- $CLUSTER ---"
  echo "Histories:"
  print_or_none kubectl --context "$CLUSTER" get histories
  echo "Events:"
  print_or_none kubectl --context "$CLUSTER" get events --field-selector reason=SchedulingSucceeded
  echo ""
done

echo "Press enter to describe the History CRD and see the full scheduling result..."
read -r

# --- Step 4: Describe History ------------------------------------------------

# Static explanation text; the quoted delimiter keeps it literal.
cat <<'BANNER'
=== Step 4: Describe History CRD ===

The History CRD contains the full scheduling decision context:
 - Which pipeline was used
 - The target host that was selected
 - An explanation of each filter/weigher step
 - The Ready condition (True = host selected, False = no host found)

BANNER

# Probe each cluster in turn; describe the History in the first one that has
# it and finish successfully.
for CLUSTER in kind-cortex-home kind-cortex-remote-az-a kind-cortex-remote-az-b; do
  kubectl --context "$CLUSTER" get history "$HISTORY_NAME" >/dev/null 2>&1 || continue
  echo "Found History '$HISTORY_NAME' in $CLUSTER:"
  echo ""
  kubectl --context "$CLUSTER" describe history "$HISTORY_NAME"
  exit 0
done

# Reached only when no cluster had the History.
echo "WARNING: History '$HISTORY_NAME' was not found in any cluster."
echo "Check the controller logs for errors:"
echo " kubectl --context kind-cortex-home logs deploy/cortex-nova-scheduling-controller-manager | grep -i history"
12 changes: 12 additions & 0 deletions docs/guides/multicluster/test-pipeline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Minimal Pipeline used by the multicluster guide's schedule.sh walkthrough,
# which selects it by name ("multicluster-test") in the scheduling request.
apiVersion: cortex.cloud/v1alpha1
kind: Pipeline
metadata:
name: multicluster-test
spec:
schedulingDomain: nova
description: Minimal test pipeline for the multicluster guide.
type: filter-weigher
# Per the guide: a History CRD is created for each scheduling decision.
createHistory: true
# Per the guide: filter_correct_az filters hosts not matching the requested AZ.
filters:
- name: filter_correct_az
# No weighers: per the guide, hosts are returned in their original order.
weighers: []
Loading