From 8dbc1455d96326e2dfd4aa7fcfe3e192a847b470 Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Thu, 19 Mar 2026 15:55:00 +0100 Subject: [PATCH 1/8] Allow deploy multiple clusters in same ns --- api/v1alpha1/nodeset_types.go | 6 + cmd/sconfigcontroller/main.go | 2 +- .../crd/bases/slurm.nebius.ai_nodesets.yaml | 5 + config/webhook/manifests.yaml | 26 ----- helm/nodesets/templates/nodeset.yaml | 5 + helm/nodesets/values.yaml | 1 + helm/slurm-cluster/templates/_helpers.tpl | 7 +- .../templates/slurm-cluster-cr.yaml | 2 +- .../templates/slurm-scripts-cm.yaml | 2 +- .../templates/slurmcluster-crd.yaml | 5 + helm/soperator/crds/slurmcluster-crd.yaml | 5 + .../clustercontroller/accounting.go | 3 +- .../controller/clustercontroller/reconcile.go | 7 +- .../clustercontroller/sconfigcontroller.go | 4 +- .../clustercontroller/soperator_exporter.go | 3 +- .../controller/nodesetcontroller/reconcile.go | 24 ++-- internal/naming/naming.go | 11 +- internal/render/accounting/deployment.go | 3 +- internal/render/accounting/deployment_test.go | 3 +- internal/render/exporter/deployment.go | 2 +- internal/render/exporter/names.go | 5 - internal/render/rest/rest.go | 3 +- .../render/sconfigcontroller/deployment.go | 3 +- internal/utils/resourcegetter/nodeset.go | 9 +- .../resourcegetter/workload_name_prefix.go | 52 +++++++++ internal/values/slurm_accounting.go | 4 +- internal/values/slurm_accounting_test.go | 4 +- internal/values/slurm_cluster.go | 32 +++--- internal/values/slurm_controller.go | 6 +- internal/values/slurm_exporter.go | 10 +- internal/values/slurm_exporter_test.go | 6 +- internal/values/slurm_login.go | 7 +- internal/values/slurm_rest.go | 4 +- internal/values/slurm_sconfigcontroller.go | 4 + internal/webhook/v1alpha1/nodeset_webhook.go | 53 --------- .../webhook/v1alpha1/nodeset_webhook_test.go | 104 ------------------ 36 files changed, 171 insertions(+), 261 deletions(-) delete mode 100644 internal/render/exporter/names.go create mode 100644 internal/utils/resourcegetter/workload_name_prefix.go delete mode 100644 internal/webhook/v1alpha1/nodeset_webhook_test.go diff --git a/api/v1alpha1/nodeset_types.go b/api/v1alpha1/nodeset_types.go index 55dae395e..f52325272 100644 --- a/api/v1alpha1/nodeset_types.go +++ b/api/v1alpha1/nodeset_types.go @@ -113,6 +113,12 @@ const ( // NodeSetSpec defines the desired state of NodeSet type NodeSetSpec struct { + // ClusterName is the name of the SlurmCluster this NodeSet belongs to. + // Must be in the same namespace as the NodeSet. + // + // +kubebuilder:validation:Optional + ClusterName string `json:"clusterName,omitempty"` + // Replicas specifies the number of worker nodes in the NodeSet. // // Defaults to 1 if not specified. 
diff --git a/cmd/sconfigcontroller/main.go b/cmd/sconfigcontroller/main.go index fca7d7344..0a5e4b4e0 100644 --- a/cmd/sconfigcontroller/main.go +++ b/cmd/sconfigcontroller/main.go @@ -176,7 +176,7 @@ func main() { WebhookServer: webhookServer, HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, - LeaderElectionID: "vqeyz6ae.nebius.ai", + LeaderElectionID: clusterName + ".vqeyz6ae.nebius.ai", LeaderElectionReleaseOnCancel: true, Cache: cache.Options{ DefaultNamespaces: map[string]cache.Config{ diff --git a/config/crd/bases/slurm.nebius.ai_nodesets.yaml b/config/crd/bases/slurm.nebius.ai_nodesets.yaml index f9bf92fd6..5977cfd8c 100644 --- a/config/crd/bases/slurm.nebius.ai_nodesets.yaml +++ b/config/crd/bases/slurm.nebius.ai_nodesets.yaml @@ -970,6 +970,11 @@ spec: x-kubernetes-list-type: atomic type: object type: object + clusterName: + description: |- + ClusterName is the name of the SlurmCluster this NodeSet belongs to. + Must be in the same namespace as the NodeSet. + type: string configMapRefSshd: description: |- ConfigMapRefSSHD defines the config name of Slurm SSHD. diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index 8711d789c..86654b057 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -1,31 +1,5 @@ --- apiVersion: admissionregistration.k8s.io/v1 -kind: MutatingWebhookConfiguration -metadata: - name: mutating-webhook-configuration -webhooks: -- admissionReviewVersions: - - v1 - clientConfig: - service: - name: webhook-service - namespace: system - path: /mutate-slurm-nebius-ai-v1alpha1-nodeset - failurePolicy: Fail - name: mnodeset-v1alpha1.kb.io - rules: - - apiGroups: - - slurm.nebius.ai - apiVersions: - - v1alpha1 - operations: - - CREATE - - UPDATE - resources: - - nodesets - sideEffects: None ---- -apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: name: validating-webhook-configuration diff --git a/helm/nodesets/templates/nodeset.yaml b/helm/nodesets/templates/nodeset.yaml index b5d34d7a3..e2e4f4b27 100644 --- a/helm/nodesets/templates/nodeset.yaml +++ b/helm/nodesets/templates/nodeset.yaml @@ -19,6 +19,10 @@ metadata: {{- end }} spec: + {{- with $.Values.clusterName }} + clusterName: {{ . | quote }} + {{- end }} + {{- with (.replicas | default 1) }} replicas: {{ . }} {{- end }} @@ -134,6 +138,7 @@ spec: security: limitsConfig: {{ get (.security | default dict) "limitsConfig" | quote }} appArmorProfile: {{ get (.security | default dict) "appArmorProfile" | default "unconfined" | quote }} + procMount: {{ get (.security | default dict) "procMount" | default "Default" | quote }} {{- end }} {{- with (required ".Values.nodesets[*].munge is required." .munge) }} diff --git a/helm/nodesets/values.yaml b/helm/nodesets/values.yaml index eef7a23e9..46270e74c 100644 --- a/helm/nodesets/values.yaml +++ b/helm/nodesets/values.yaml @@ -5,6 +5,7 @@ # Global settings nameOverride: "" fullnameOverride: "" +clusterName: "" # Priority Classes configuration # Define priority classes that can be used by NodeSets priorityClasses: diff --git a/helm/slurm-cluster/templates/_helpers.tpl b/helm/slurm-cluster/templates/_helpers.tpl index 7f3d5d1b8..5694c6e48 100644 --- a/helm/slurm-cluster/templates/_helpers.tpl +++ b/helm/slurm-cluster/templates/_helpers.tpl @@ -24,6 +24,11 @@ {{- include "slurm-cluster.selectorLabels" . 
| trim | nindent 0 }} {{- end }} +{{/* Name of the slurm-scripts ConfigMap */}} +{{- define "slurm-cluster.slurmScriptsCMName" -}} +{{- printf "%s-slurm-scripts" (include "slurm-cluster.name" .) -}} +{{- end -}} + {{- define "validateAccountingConfig" -}} {{- if .Values.slurmNodes.accounting.enabled -}} {{- if not (or .Values.slurmNodes.accounting.externalDB.enabled .Values.slurmNodes.accounting.mariadbOperator.enabled) -}} @@ -62,7 +67,7 @@ Create the name of the service account to use for exporter */}} {{- define "slurm-cluster.exporter.serviceAccountName" -}} {{- if .Values.slurmNodes.exporter.serviceAccount.create -}} - {{- default "slurm-exporter-sa" .Values.slurmNodes.exporter.serviceAccount.name }} + {{- default (printf "%s-exporter-sa" (include "slurm-cluster.name" .)) .Values.slurmNodes.exporter.serviceAccount.name }} {{- else -}} {{- default "default" .Values.slurmNodes.exporter.serviceAccount.name }} {{- end -}} diff --git a/helm/slurm-cluster/templates/slurm-cluster-cr.yaml b/helm/slurm-cluster/templates/slurm-cluster-cr.yaml index 76f9cf3ec..c4409f383 100644 --- a/helm/slurm-cluster/templates/slurm-cluster-cr.yaml +++ b/helm/slurm-cluster/templates/slurm-cluster-cr.yaml @@ -93,7 +93,7 @@ spec: volumeSources: - name: slurm-scripts configMap: - name: slurm-scripts + name: {{ include "slurm-cluster.slurmScriptsCMName" . }} defaultMode: 0755 {{- range .Values.volumeSources }} - name: {{ .name | quote }} diff --git a/helm/slurm-cluster/templates/slurm-scripts-cm.yaml b/helm/slurm-cluster/templates/slurm-scripts-cm.yaml index 6ebe29bf7..8a28efbce 100644 --- a/helm/slurm-cluster/templates/slurm-scripts-cm.yaml +++ b/helm/slurm-cluster/templates/slurm-scripts-cm.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: namespace: {{ .Release.Namespace }} - name: slurm-scripts + name: {{ include "slurm-cluster.slurmScriptsCMName" . }} labels: app: {{ .Chart.Name }} release: {{ .Release.Name }} diff --git a/helm/soperator-crds/templates/slurmcluster-crd.yaml b/helm/soperator-crds/templates/slurmcluster-crd.yaml index 3f481152b..6093ab3d3 100644 --- a/helm/soperator-crds/templates/slurmcluster-crd.yaml +++ b/helm/soperator-crds/templates/slurmcluster-crd.yaml @@ -16064,6 +16064,11 @@ spec: x-kubernetes-list-type: atomic type: object type: object + clusterName: + description: |- + ClusterName is the name of the SlurmCluster this NodeSet belongs to. + Must be in the same namespace as the NodeSet. + type: string configMapRefSshd: description: |- ConfigMapRefSSHD defines the config name of Slurm SSHD. diff --git a/helm/soperator/crds/slurmcluster-crd.yaml b/helm/soperator/crds/slurmcluster-crd.yaml index 3f481152b..6093ab3d3 100644 --- a/helm/soperator/crds/slurmcluster-crd.yaml +++ b/helm/soperator/crds/slurmcluster-crd.yaml @@ -16064,6 +16064,11 @@ spec: x-kubernetes-list-type: atomic type: object type: object + clusterName: + description: |- + ClusterName is the name of the SlurmCluster this NodeSet belongs to. + Must be in the same namespace as the NodeSet. + type: string configMapRefSshd: description: |- ConfigMapRefSSHD defines the config name of Slurm SSHD. 
diff --git a/internal/controller/clustercontroller/accounting.go b/internal/controller/clustercontroller/accounting.go index 5b2362bab..bf5558c46 100644 --- a/internal/controller/clustercontroller/accounting.go +++ b/internal/controller/clustercontroller/accounting.go @@ -340,8 +340,7 @@ func (r SlurmClusterReconciler) ReconcileAccounting( if !isAccountingEnabled { stepLogger.V(1).Info("Removing") - deploymentName := naming.BuildDeploymentName(consts.ComponentTypeAccounting) - if err = r.Deployment.Cleanup(stepCtx, cluster, deploymentName); err != nil { + if err = r.Deployment.Cleanup(stepCtx, cluster, clusterValues.NodeAccounting.Deployment.Name); err != nil { return fmt.Errorf("cleanup accounting Deployment: %w", err) } stepLogger.V(1).Info("Reconciled") diff --git a/internal/controller/clustercontroller/reconcile.go b/internal/controller/clustercontroller/reconcile.go index 36377214b..3a4f343b8 100644 --- a/internal/controller/clustercontroller/reconcile.go +++ b/internal/controller/clustercontroller/reconcile.go @@ -224,7 +224,12 @@ func (r *SlurmClusterReconciler) reconcile(ctx context.Context, cluster *slurmv1 var clusterValues *values.SlurmCluster { var errReconciliation error - clusterValues, errReconciliation = values.BuildSlurmClusterFrom(ctx, cluster) + var namePrefix string + namePrefix, errReconciliation = resourcegetter.ResolveWorkloadNamePrefix(ctx, r.Client, cluster.Namespace, cluster.Name) + if errReconciliation != nil { + return ctrl.Result{}, errReconciliation + } + clusterValues, errReconciliation = values.BuildSlurmClusterFrom(ctx, cluster, namePrefix) if errReconciliation != nil { return ctrl.Result{}, errReconciliation } diff --git a/internal/controller/clustercontroller/sconfigcontroller.go b/internal/controller/clustercontroller/sconfigcontroller.go index 0a9ea11c8..95991e5dd 100644 --- a/internal/controller/clustercontroller/sconfigcontroller.go +++ b/internal/controller/clustercontroller/sconfigcontroller.go @@ -13,9 +13,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" slurmv1 "nebius.ai/slurm-operator/api/v1" - "nebius.ai/slurm-operator/internal/consts" "nebius.ai/slurm-operator/internal/logfield" - "nebius.ai/slurm-operator/internal/naming" "nebius.ai/slurm-operator/internal/render/rest" "nebius.ai/slurm-operator/internal/render/sconfigcontroller" "nebius.ai/slurm-operator/internal/utils" @@ -93,7 +91,7 @@ func (r SlurmClusterReconciler) ValidateSConfigController( ctx, types.NamespacedName{ Namespace: clusterValues.Namespace, - Name: naming.BuildDeploymentName(consts.ComponentTypeSConfigController), + Name: clusterValues.SConfigController.Deployment.Name, }, existing, ) diff --git a/internal/controller/clustercontroller/soperator_exporter.go b/internal/controller/clustercontroller/soperator_exporter.go index b6d2f9a38..801f6a446 100644 --- a/internal/controller/clustercontroller/soperator_exporter.go +++ b/internal/controller/clustercontroller/soperator_exporter.go @@ -75,8 +75,7 @@ func (r SlurmClusterReconciler) ReconcileSoperatorExporter( } } else { debugLogger.Info("Exporter disabled, will delete Deployment if exists") - exporterDeploymentName := exporter.DeploymentName - if err := r.Deployment.Cleanup(stepCtx, cluster, exporterDeploymentName); err != nil { + if err := r.Deployment.Cleanup(stepCtx, cluster, clusterValues.SlurmExporter.Deployment.Name); err != nil { return fmt.Errorf("cleanup soperator exporter deployment: %w", err) } } diff --git a/internal/controller/nodesetcontroller/reconcile.go b/internal/controller/nodesetcontroller/reconcile.go 
index 2bca660be..ab6d04484 100644 --- a/internal/controller/nodesetcontroller/reconcile.go +++ b/internal/controller/nodesetcontroller/reconcile.go @@ -68,16 +68,26 @@ func (r *NodeSetReconciler) reconcile(ctx context.Context, nodeSet *slurmv1alpha err error ) { - clusterName, hasClusterRef := nodeSet.GetAnnotations()[consts.AnnotationParentalClusterRefName] - if !hasClusterRef { - err = fmt.Errorf("getting parental cluster ref from annotations") - logger.Error(err, "No parent cluster ref found") - return ctrl.Result{}, err + // Migrate legacy annotation to spec.ClusterName if needed. + if nodeSet.Spec.ClusterName == "" { + clusterName, hasAnnotation := nodeSet.GetAnnotations()[consts.AnnotationParentalClusterRefName] + if !hasAnnotation || clusterName == "" { + err = fmt.Errorf("spec.clusterName must be set") + logger.Error(err, "No cluster name found") + return ctrl.Result{}, err + } + logger.Info("Migrating cluster name from annotation to spec.clusterName") + patch := client.MergeFrom(nodeSet.DeepCopy()) + nodeSet.Spec.ClusterName = clusterName + delete(nodeSet.Annotations, consts.AnnotationParentalClusterRefName) + if err = r.Patch(ctx, nodeSet, patch); err != nil { + return ctrl.Result{}, fmt.Errorf("migrating cluster name from annotation: %w", err) + } } cluster, err = resourcegetter.GetCluster(ctx, r.Client, types.NamespacedName{ Namespace: nodeSet.Namespace, - Name: clusterName, + Name: nodeSet.Spec.ClusterName, }, ) if err != nil { @@ -273,7 +283,7 @@ func (r NodeSetReconciler) executeReconciliation( stepLogger := log.FromContext(stepCtx) stepLogger.V(1).Info("Reconciling") - clusterValues, err := values.BuildSlurmClusterFrom(stepCtx, cluster) + clusterValues, err := values.BuildSlurmClusterFrom(stepCtx, cluster, cluster.Name) if err != nil { stepLogger.Error(err, "Failed to build cluster values") return fmt.Errorf("building cluster values: %w", err) diff --git a/internal/naming/naming.go b/internal/naming/naming.go index d8bc428a6..ba040edeb 100644 --- a/internal/naming/naming.go +++ b/internal/naming/naming.go @@ -83,10 +83,10 @@ func BuildAppArmorProfileName(clusterName, namespace string) string { }.String() } -func BuildStatefulSetName(componentType consts.ComponentType) string { +func BuildStatefulSetName(componentType consts.ComponentType, clusterName string) string { return namedEntity{ componentType: &componentType, - clusterName: "", + clusterName: clusterName, entity: "", }.String() } @@ -99,17 +99,18 @@ func BuildNodeSetStatefulSetName(nodeSetName string) string { }.String() } -func BuildDaemonSetName(componentType consts.ComponentType) string { +func BuildDaemonSetName(componentType consts.ComponentType, clusterName string) string { return namedEntity{ componentType: &componentType, - clusterName: "", + clusterName: clusterName, entity: "", }.String() } -func BuildDeploymentName(componentType consts.ComponentType) string { +func BuildDeploymentName(componentType consts.ComponentType, clusterName string) string { return namedEntity{ componentType: &componentType, + clusterName: clusterName, entity: "", }.String() } diff --git a/internal/render/accounting/deployment.go b/internal/render/accounting/deployment.go index af4dd85ad..46179960e 100644 --- a/internal/render/accounting/deployment.go +++ b/internal/render/accounting/deployment.go @@ -8,7 +8,6 @@ import ( slurmv1 "nebius.ai/slurm-operator/api/v1" "nebius.ai/slurm-operator/internal/check" "nebius.ai/slurm-operator/internal/consts" - "nebius.ai/slurm-operator/internal/naming" 
"nebius.ai/slurm-operator/internal/render/common" "nebius.ai/slurm-operator/internal/values" ) @@ -42,7 +41,7 @@ func RenderDeployment( return &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: naming.BuildDeploymentName(consts.ComponentTypeAccounting), + Name: accounting.Deployment.Name, Namespace: namespace, Labels: labels, }, diff --git a/internal/render/accounting/deployment_test.go b/internal/render/accounting/deployment_test.go index 9b96242d6..28d2093a8 100644 --- a/internal/render/accounting/deployment_test.go +++ b/internal/render/accounting/deployment_test.go @@ -7,7 +7,6 @@ import ( appsv1 "k8s.io/api/apps/v1" "nebius.ai/slurm-operator/internal/consts" - "nebius.ai/slurm-operator/internal/naming" "nebius.ai/slurm-operator/internal/render/accounting" "nebius.ai/slurm-operator/internal/render/common" ) @@ -17,7 +16,7 @@ func Test_RenderDeployment(t *testing.T) { deployment, err := accounting.RenderDeployment(defaultNamespace, defaultNameCluster, acc, defaultNodeFilter, defaultVolumeSources) assert.NoError(t, err) - assert.Equal(t, naming.BuildDeploymentName(consts.ComponentTypeAccounting), deployment.Name) + assert.Equal(t, acc.Deployment.Name, deployment.Name) assert.Equal(t, defaultNamespace, deployment.Namespace) assert.Equal(t, common.RenderLabels(consts.ComponentTypeAccounting, defaultNameCluster), deployment.Labels) diff --git a/internal/render/exporter/deployment.go b/internal/render/exporter/deployment.go index 15efe9236..b5ead9072 100644 --- a/internal/render/exporter/deployment.go +++ b/internal/render/exporter/deployment.go @@ -48,7 +48,7 @@ func RenderDeploymentExporter(clusterValues *values.SlurmCluster) (*appsv1.Deplo return &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: DeploymentName, + Name: clusterValues.SlurmExporter.Deployment.Name, Namespace: clusterValues.Namespace, Labels: labels, }, diff --git a/internal/render/exporter/names.go b/internal/render/exporter/names.go deleted file mode 100644 index 6e4737670..000000000 --- a/internal/render/exporter/names.go +++ /dev/null @@ -1,5 +0,0 @@ -package exporter - -const ( - DeploymentName = "slurm-exporter" -) diff --git a/internal/render/rest/rest.go b/internal/render/rest/rest.go index b7d5573ce..1910cf513 100644 --- a/internal/render/rest/rest.go +++ b/internal/render/rest/rest.go @@ -10,7 +10,6 @@ import ( slurmv1 "nebius.ai/slurm-operator/api/v1" "nebius.ai/slurm-operator/internal/check" "nebius.ai/slurm-operator/internal/consts" - "nebius.ai/slurm-operator/internal/naming" "nebius.ai/slurm-operator/internal/render/common" "nebius.ai/slurm-operator/internal/values" ) @@ -50,7 +49,7 @@ func RenderDeploymentREST( return &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: naming.BuildDeploymentName(consts.ComponentTypeREST), + Name: valuesREST.Deployment.Name, Namespace: namespace, Labels: labels, }, diff --git a/internal/render/sconfigcontroller/deployment.go b/internal/render/sconfigcontroller/deployment.go index 25b47b77f..da292861b 100644 --- a/internal/render/sconfigcontroller/deployment.go +++ b/internal/render/sconfigcontroller/deployment.go @@ -8,7 +8,6 @@ import ( slurmv1 "nebius.ai/slurm-operator/api/v1" "nebius.ai/slurm-operator/internal/check" "nebius.ai/slurm-operator/internal/consts" - "nebius.ai/slurm-operator/internal/naming" "nebius.ai/slurm-operator/internal/render/common" "nebius.ai/slurm-operator/internal/values" ) @@ -45,7 +44,7 @@ func RenderDeployment( return &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: 
naming.BuildDeploymentName(consts.ComponentTypeSConfigController),
+			Name:      sConfigController.Deployment.Name,
 			Namespace: clusterNamespace,
 			Labels:    labels,
 		},
diff --git a/internal/utils/resourcegetter/nodeset.go b/internal/utils/resourcegetter/nodeset.go
index 51554c3df..5405d482b 100644
--- a/internal/utils/resourcegetter/nodeset.go
+++ b/internal/utils/resourcegetter/nodeset.go
@@ -11,11 +11,10 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log"
 
 	slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1"
-	"nebius.ai/slurm-operator/internal/consts"
 	"nebius.ai/slurm-operator/internal/utils/sliceutils"
 )
 
-// ListNodeSetsByClusterRef returns a list of [slurmv1alpha1.NodeSet] that have consts.AnnotationParentalClusterRefName annotation with clusterRef.Name value.
+// ListNodeSetsByClusterRef returns a list of [slurmv1alpha1.NodeSet] with spec.ClusterName matching clusterRef.Name.
 func ListNodeSetsByClusterRef(ctx context.Context, r client.Reader, clusterRef types.NamespacedName) ([]slurmv1alpha1.NodeSet, error) {
 	logger := log.FromContext(ctx)
 
@@ -29,11 +28,7 @@ func ListNodeSetsByClusterRef(ctx context.Context, r client.Reader, clusterRef t
 
 	return slices.SortedFunc(
 		sliceutils.FilterSliceSeq(nodeSetList.Items, func(nodeSet slurmv1alpha1.NodeSet) bool {
-			clusterName, found := nodeSet.GetAnnotations()[consts.AnnotationParentalClusterRefName]
-			if found && clusterName == clusterRef.Name {
-				return true
-			}
-			return false
+			return nodeSet.Spec.ClusterName == clusterRef.Name
 		}),
 		func(a, b slurmv1alpha1.NodeSet) int {
 			return strings.Compare(a.Name, b.Name)
diff --git a/internal/utils/resourcegetter/workload_name_prefix.go b/internal/utils/resourcegetter/workload_name_prefix.go
new file mode 100644
index 000000000..3161dc862
--- /dev/null
+++ b/internal/utils/resourcegetter/workload_name_prefix.go
@@ -0,0 +1,52 @@
+package resourcegetter
+
+import (
+	"context"
+	"fmt"
+
+	kruisev1b1 "github.com/openkruise/kruise-api/apps/v1beta1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	"nebius.ai/slurm-operator/internal/consts"
+)
+
+// ResolveWorkloadNamePrefix returns the prefix to use for workload resource names.
+//
+// Detection is done by looking up the controller StatefulSet under its legacy unprefixed name.
+// If it exists and belongs to this cluster, the cluster predates prefixed naming — "" is returned
+// so all workload resources continue to be managed under their original names without any rename or deletion.
+// For new clusters clusterName is returned, giving every workload resource a "<clusterName>-<component>" name.
+func ResolveWorkloadNamePrefix(
+	ctx context.Context,
+	r client.Reader,
+	namespace,
+	clusterName string,
+) (string, error) {
+	sts := &kruisev1b1.StatefulSet{}
+	err := r.Get(ctx, types.NamespacedName{
+		Namespace: namespace,
+		Name:      consts.ComponentTypeController.String(),
+	}, sts)
+
+	if err == nil && hasClusterLabel(sts, clusterName) {
+		// Legacy cluster: unprefixed StatefulSet already exists and belongs to this cluster.
+		return "", nil
+	}
+
+	if err != nil && !apierrors.IsNotFound(err) {
+		return "", fmt.Errorf("checking legacy controller StatefulSet: %w", err)
+	}
+
+	return clusterName, nil
+}
+
+// hasClusterLabel checks whether the resource carries the standard instance label for the given cluster.
+func hasClusterLabel(obj client.Object, clusterName string) bool { + labels := obj.GetLabels() + if labels == nil { + return false + } + return labels["app.kubernetes.io/instance"] == clusterName +} diff --git a/internal/values/slurm_accounting.go b/internal/values/slurm_accounting.go index aaea19303..83af062d3 100644 --- a/internal/values/slurm_accounting.go +++ b/internal/values/slurm_accounting.go @@ -29,7 +29,7 @@ type SlurmAccounting struct { Maintenance *consts.MaintenanceMode } -func buildAccountingFrom(clusterName string, maintenance *consts.MaintenanceMode, accounting *slurmv1.SlurmNodeAccounting) SlurmAccounting { +func buildAccountingFrom(clusterName, namePrefix string, maintenance *consts.MaintenanceMode, accounting *slurmv1.SlurmNodeAccounting) SlurmAccounting { containerAcc := buildContainerFrom( accounting.Slurmdbd, consts.ContainerNameAccounting, @@ -48,7 +48,7 @@ func buildAccountingFrom(clusterName string, maintenance *consts.MaintenanceMode CustomInitContainers: accounting.CustomInitContainers, Service: buildServiceFrom(naming.BuildServiceName(consts.ComponentTypeAccounting, clusterName)), Deployment: buildDeploymentFrom( - naming.BuildDeploymentName(consts.ComponentTypeAccounting), + naming.BuildDeploymentName(consts.ComponentTypeAccounting, namePrefix), ), ExternalDB: accounting.ExternalDB, MariaDb: accounting.MariaDbOperator, diff --git a/internal/values/slurm_accounting_test.go b/internal/values/slurm_accounting_test.go index 23172354d..5ba9359b8 100644 --- a/internal/values/slurm_accounting_test.go +++ b/internal/values/slurm_accounting_test.go @@ -49,7 +49,7 @@ func TestBuildAccountingFrom(t *testing.T) { }, } - result := buildAccountingFrom(defaultNameCluster, ptr.To(consts.ModeNone), accounting) + result := buildAccountingFrom(defaultNameCluster, defaultNameCluster, ptr.To(consts.ModeNone), accounting) assert.Equal(t, *accounting.SlurmNode.DeepCopy(), result.SlurmNode) assert.Equal(t, buildContainerFrom(accounting.Munge, consts.ContainerNameMunge).Name, result.ContainerMunge.Name) @@ -60,7 +60,7 @@ func TestBuildAccountingFrom(t *testing.T) { assert.Equal(t, buildContainerFrom(accounting.Munge, consts.ContainerNameMunge).Port, result.ContainerMunge.Port) assert.Equal(t, buildContainerFrom(accounting.Munge, consts.ContainerNameMunge).Resources, result.ContainerMunge.Resources) assert.Equal(t, buildServiceFrom(naming.BuildServiceName(consts.ComponentTypeAccounting, defaultNameCluster)), result.Service) - assert.Equal(t, buildDeploymentFrom(naming.BuildDeploymentName(consts.ComponentTypeAccounting)), result.Deployment) + assert.Equal(t, buildDeploymentFrom(naming.BuildDeploymentName(consts.ComponentTypeAccounting, defaultNameCluster)), result.Deployment) assert.Equal(t, accounting.ExternalDB, result.ExternalDB) assert.Equal(t, accounting.Enabled, result.Enabled) assert.Equal(t, slurmv1.NodeVolume{VolumeSourceName: ptr.To(consts.VolumeNameJail)}, result.VolumeJail) diff --git a/internal/values/slurm_cluster.go b/internal/values/slurm_cluster.go index 23a7d00a5..4680a53d2 100644 --- a/internal/values/slurm_cluster.go +++ b/internal/values/slurm_cluster.go @@ -43,8 +43,10 @@ type SlurmCluster struct { UseDefaultAppArmorProfile bool } -// BuildSlurmClusterFrom creates a new instance of SlurmCluster given a SlurmCluster CRD -func BuildSlurmClusterFrom(ctx context.Context, cluster *slurmv1.SlurmCluster) (*SlurmCluster, error) { +// BuildSlurmClusterFrom creates a new instance of SlurmCluster given a SlurmCluster CRD. 
+// namePrefix is the prefix to use for workload resource names (StatefulSet, DaemonSet, Deployment). +// Pass cluster.Name for new clusters, or "" to preserve legacy unprefixed names on existing clusters. +func BuildSlurmClusterFrom(ctx context.Context, cluster *slurmv1.SlurmCluster, namePrefix string) (*SlurmCluster, error) { logger := log.FromContext(ctx) logger.V(1).Info(fmt.Sprintf("%+v", cluster.Spec.SConfigController)) @@ -67,20 +69,17 @@ func BuildSlurmClusterFrom(ctx context.Context, cluster *slurmv1.SlurmCluster) ( NodeFilters: buildNodeFiltersFrom(cluster.Spec.K8sNodeFilters), VolumeSources: buildVolumeSourcesFrom(cluster.Spec.VolumeSources), Secrets: BuildSecretsFrom(&cluster.Spec.Secrets), - NodeController: buildSlurmControllerFrom(cluster.Name, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Controller), - NodeAccounting: buildAccountingFrom(cluster.Name, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Accounting), - NodeRest: buildRestFrom(cluster.Name, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Rest), - NodeLogin: buildSlurmLoginFrom( - cluster.Name, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Login, - cluster.Spec.UseDefaultAppArmorProfile, - ), - SlurmExporter: buildSlurmExporterFrom(cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Exporter), - SlurmConfig: cluster.Spec.SlurmConfig, - CustomSlurmConfig: cluster.Spec.CustomSlurmConfig, - CustomCgroupConfig: cluster.Spec.CustomCgroupConfig, - CgroupVersion: cluster.Spec.CgroupVersion, - MPIConfig: cluster.Spec.MPIConfig, - PlugStackConfig: cluster.Spec.PlugStackConfig, + NodeController: buildSlurmControllerFrom(cluster.Name, namePrefix, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Controller), + NodeAccounting: buildAccountingFrom(cluster.Name, namePrefix, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Accounting), + NodeRest: buildRestFrom(cluster.Name, namePrefix, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Rest), + NodeLogin: buildSlurmLoginFrom(cluster.Name, namePrefix, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Login, cluster.Spec.UseDefaultAppArmorProfile), + SlurmExporter: buildSlurmExporterFrom(namePrefix, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Exporter), + SlurmConfig: cluster.Spec.SlurmConfig, + CustomSlurmConfig: cluster.Spec.CustomSlurmConfig, + CustomCgroupConfig: cluster.Spec.CustomCgroupConfig, + CgroupVersion: cluster.Spec.CgroupVersion, + MPIConfig: cluster.Spec.MPIConfig, + PlugStackConfig: cluster.Spec.PlugStackConfig, SConfigController: buildSConfigControllerFrom( cluster.Spec.SConfigController.Node, cluster.Spec.SConfigController.Container, @@ -90,6 +89,7 @@ func BuildSlurmClusterFrom(ctx context.Context, cluster *slurmv1.SlurmCluster) ( cluster.Spec.SConfigController.ReconfigurePollInterval, cluster.Spec.SConfigController.ReconfigureWaitTimeout, cluster.Spec.SConfigController.ServiceAccountName, + namePrefix, ), UseDefaultAppArmorProfile: cluster.Spec.UseDefaultAppArmorProfile, } diff --git a/internal/values/slurm_controller.go b/internal/values/slurm_controller.go index 068b0afc7..5b0b8de10 100644 --- a/internal/values/slurm_controller.go +++ b/internal/values/slurm_controller.go @@ -34,16 +34,16 @@ type SlurmController struct { ServiceAccountName string } -func buildSlurmControllerFrom(clusterName string, maintenance *consts.MaintenanceMode, controller *slurmv1.SlurmNodeController) SlurmController { +func buildSlurmControllerFrom(clusterName, namePrefix string, maintenance *consts.MaintenanceMode, controller *slurmv1.SlurmNodeController) 
SlurmController { // Controller always has 1 replica statefulSet := buildStatefulSetWithMaxUnavailableFrom( - naming.BuildStatefulSetName(consts.ComponentTypeController), + naming.BuildStatefulSetName(consts.ComponentTypeController, namePrefix), consts.SingleReplicas, nil, ) daemonSet := buildDaemonSetFrom( - naming.BuildDaemonSetName(consts.ComponentTypeController), + naming.BuildDaemonSetName(consts.ComponentTypeController, namePrefix), ) sssdConfSecretName := controller.SSSDConfSecretRefName diff --git a/internal/values/slurm_exporter.go b/internal/values/slurm_exporter.go index 5d511790b..f8f90be0d 100644 --- a/internal/values/slurm_exporter.go +++ b/internal/values/slurm_exporter.go @@ -41,14 +41,21 @@ type SlurmExporter struct { // ServiceAccountName is the ServiceAccount to be used by exporter pods. ServiceAccountName string + + Deployment Deployment } -func buildSlurmExporterFrom(maintenance *consts.MaintenanceMode, exporter *slurmv1.SlurmExporter) SlurmExporter { +func buildSlurmExporterFrom(namePrefix string, maintenance *consts.MaintenanceMode, exporter *slurmv1.SlurmExporter) SlurmExporter { enabled := false if exporter.Enabled != nil { enabled = *exporter.Enabled } + deploymentName := "slurm-exporter" + if namePrefix != "" { + deploymentName = namePrefix + "-" + deploymentName + } + return SlurmExporter{ SlurmNode: *exporter.SlurmNode.DeepCopy(), Enabled: enabled, @@ -66,5 +73,6 @@ func buildSlurmExporterFrom(maintenance *consts.MaintenanceMode, exporter *slurm Container: *exporter.ExporterContainer.DeepCopy(), CollectionInterval: exporter.CollectionInterval, ServiceAccountName: exporter.ServiceAccountName, + Deployment: buildDeploymentFrom(deploymentName), } } diff --git a/internal/values/slurm_exporter_test.go b/internal/values/slurm_exporter_test.go index 8225e6121..71d8ecae1 100644 --- a/internal/values/slurm_exporter_test.go +++ b/internal/values/slurm_exporter_test.go @@ -20,7 +20,7 @@ func Test_BuildSlurmExporterFrom(t *testing.T) { }, } - result := buildSlurmExporterFrom(ptr.To(consts.ModeNone), exporter) + result := buildSlurmExporterFrom("test-cluster", ptr.To(consts.ModeNone), exporter) assert.NotNil(t, result.ExporterContainer) assert.NotNil(t, result.SlurmNode) @@ -40,7 +40,7 @@ func Test_BuildSlurmExporterFromWithNilTelemetry(t *testing.T) { t.Errorf("The code panicked: %v", r) } }() - buildSlurmExporterFrom(ptr.To(consts.ModeNone), exporter) + buildSlurmExporterFrom("test-cluster", ptr.To(consts.ModeNone), exporter) } func Test_BuildSlurmExporterFromWithNilPrometheus(t *testing.T) { @@ -57,5 +57,5 @@ func Test_BuildSlurmExporterFromWithNilPrometheus(t *testing.T) { t.Errorf("The code panicked: %v", r) } }() - buildSlurmExporterFrom(ptr.To(consts.ModeNone), exporter) + buildSlurmExporterFrom("test-cluster", ptr.To(consts.ModeNone), exporter) } diff --git a/internal/values/slurm_login.go b/internal/values/slurm_login.go index 12ffacfa0..45179a78f 100644 --- a/internal/values/slurm_login.go +++ b/internal/values/slurm_login.go @@ -36,10 +36,7 @@ type SlurmLogin struct { Maintenance *consts.MaintenanceMode } -func buildSlurmLoginFrom( - clusterName string, maintenance *consts.MaintenanceMode, - login *slurmv1.SlurmNodeLogin, useDefaultAppArmorProfile bool, -) SlurmLogin { +func buildSlurmLoginFrom(clusterName, namePrefix string, maintenance *consts.MaintenanceMode, login *slurmv1.SlurmNodeLogin, useDefaultAppArmorProfile bool) SlurmLogin { svc := buildServiceFrom(naming.BuildServiceName(consts.ComponentTypeLogin, clusterName)) svc.Type = login.SshdServiceType 
svc.Annotations = login.SshdServiceAnnotations @@ -76,7 +73,7 @@ func buildSlurmLoginFrom( Service: svc, HeadlessService: headlessSvc, StatefulSet: buildStatefulSetFrom( - naming.BuildStatefulSetName(consts.ComponentTypeLogin), + naming.BuildStatefulSetName(consts.ComponentTypeLogin, namePrefix), login.SlurmNode.Size, ), SSHDConfigMapName: sshdConfigMapName, diff --git a/internal/values/slurm_rest.go b/internal/values/slurm_rest.go index a3794afb5..64ad6bbf9 100644 --- a/internal/values/slurm_rest.go +++ b/internal/values/slurm_rest.go @@ -19,11 +19,12 @@ type SlurmREST struct { ContainerREST Container CustomInitContainers []corev1.Container Service Service + Deployment Deployment VolumeJail slurmv1.NodeVolume Maintenance *consts.MaintenanceMode } -func buildRestFrom(clusterName string, maintenance *consts.MaintenanceMode, rest *slurmv1.SlurmRest) SlurmREST { +func buildRestFrom(clusterName, namePrefix string, maintenance *consts.MaintenanceMode, rest *slurmv1.SlurmRest) SlurmREST { containerREST := buildContainerFrom( rest.SlurmRestNode, consts.ContainerNameREST, @@ -40,6 +41,7 @@ func buildRestFrom(clusterName string, maintenance *consts.MaintenanceMode, rest ContainerREST: containerREST, CustomInitContainers: rest.CustomInitContainers, Service: buildServiceFrom(naming.BuildServiceName(consts.ComponentTypeREST, clusterName)), + Deployment: buildDeploymentFrom(naming.BuildDeploymentName(consts.ComponentTypeREST, namePrefix)), Maintenance: maintenance, VolumeJail: slurmv1.NodeVolume{ VolumeSourceName: ptr.To(consts.VolumeNameJail), diff --git a/internal/values/slurm_sconfigcontroller.go b/internal/values/slurm_sconfigcontroller.go index 00c90fe17..65b035019 100644 --- a/internal/values/slurm_sconfigcontroller.go +++ b/internal/values/slurm_sconfigcontroller.go @@ -5,6 +5,7 @@ import ( slurmv1 "nebius.ai/slurm-operator/api/v1" "nebius.ai/slurm-operator/internal/consts" + "nebius.ai/slurm-operator/internal/naming" ) // SlurmWorker contains the data needed to deploy and reconcile the Slurm Workers @@ -21,6 +22,7 @@ type SConfigController struct { ReconfigurePollInterval *string ReconfigureWaitTimeout *string ServiceAccountName string + Deployment Deployment } func buildSConfigControllerFrom( @@ -32,6 +34,7 @@ func buildSConfigControllerFrom( reconfigurePollInterval *string, reconfigureWaitTimeout *string, serviceAccountName string, + namePrefix string, ) SConfigController { containerSConfigController := buildContainerFrom( container, @@ -50,6 +53,7 @@ func buildSConfigControllerFrom( ReconfigurePollInterval: reconfigurePollInterval, ReconfigureWaitTimeout: reconfigureWaitTimeout, ServiceAccountName: serviceAccountName, + Deployment: buildDeploymentFrom(naming.BuildDeploymentName(consts.ComponentTypeSConfigController, namePrefix)), } return res diff --git a/internal/webhook/v1alpha1/nodeset_webhook.go b/internal/webhook/v1alpha1/nodeset_webhook.go index 355b2a38d..ecda15b4e 100644 --- a/internal/webhook/v1alpha1/nodeset_webhook.go +++ b/internal/webhook/v1alpha1/nodeset_webhook.go @@ -18,75 +18,22 @@ package v1alpha1 import ( "context" - "fmt" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" - "nebius.ai/slurm-operator/internal/consts" - "nebius.ai/slurm-operator/internal/utils/resourcegetter" ) -// nodesetLog is for logging in 
this package. -var nodesetLog = logf.Log.WithName("nodeset-resource") - // SetupNodeSetWebhookWithManager registers the webhook for NodeSet in the manager. func SetupNodeSetWebhookWithManager(mgr ctrl.Manager) error { return ctrl.NewWebhookManagedBy(mgr).For(&slurmv1alpha1.NodeSet{}). WithValidator(&NodeSetCustomValidator{}). - WithDefaulter(&NodeSetCustomDefaulter{Client: mgr.GetClient()}). Complete() } -// +kubebuilder:webhook:path=/mutate-slurm-nebius-ai-v1alpha1-nodeset,mutating=true,failurePolicy=fail,sideEffects=None,groups=slurm.nebius.ai,resources=nodesets,verbs=create;update,versions=v1alpha1,name=mnodeset-v1alpha1.kb.io,admissionReviewVersions=v1 - -// NodeSetCustomDefaulter struct is responsible for setting default values on the custom resource of the -// Kind NodeSet when those are created or updated. -type NodeSetCustomDefaulter struct { - Client client.Client -} - -var _ webhook.CustomDefaulter = &NodeSetCustomDefaulter{} - -// Default implements webhook.CustomDefaulter so a webhook will be registered for the Kind NodeSet. -func (d *NodeSetCustomDefaulter) Default(ctx context.Context, obj runtime.Object) error { - nodeSet, ok := obj.(*slurmv1alpha1.NodeSet) - - if !ok { - return fmt.Errorf("expected an NodeSet object but got %T", obj) - } - nodesetLog.Info("Defaulting for NodeSet", "name", nodeSet.GetName()) - - if err := defaultNodeSetParentalClusterRef(ctx, d.Client, nodeSet); err != nil { - return err - } - - return nil -} - -func defaultNodeSetParentalClusterRef(ctx context.Context, client client.Client, nodeSet *slurmv1alpha1.NodeSet) error { - if _, hasClusterRef := nodeSet.GetAnnotations()[consts.AnnotationParentalClusterRefName]; hasClusterRef { - return nil - } - - cluster, err := resourcegetter.GetClusterInNamespace(ctx, client, nodeSet.Namespace) - if err != nil { - return fmt.Errorf("seeking parental cluster: %w", err) - } - - if nodeSet.Annotations == nil { - nodeSet.Annotations = map[string]string{} - } - nodeSet.Annotations[consts.AnnotationParentalClusterRefName] = cluster.Name - - return nil -} - // +kubebuilder:webhook:path=/validate-slurm-nebius-ai-v1alpha1-nodeset,mutating=false,failurePolicy=fail,sideEffects=None,groups=slurm.nebius.ai,resources=nodesets,verbs=create;update,versions=v1alpha1,name=vnodeset-v1alpha1.kb.io,admissionReviewVersions=v1 // NodeSetCustomValidator struct is responsible for validating the NodeSet resource diff --git a/internal/webhook/v1alpha1/nodeset_webhook_test.go b/internal/webhook/v1alpha1/nodeset_webhook_test.go deleted file mode 100644 index eb30d34c7..000000000 --- a/internal/webhook/v1alpha1/nodeset_webhook_test.go +++ /dev/null @@ -1,104 +0,0 @@ -/* -Copyright 2025 Nebius B.V. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package v1alpha1_test - -import ( - "context" - "testing" - - "github.com/stretchr/testify/assert" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - slurmv1 "nebius.ai/slurm-operator/api/v1" - slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" - "nebius.ai/slurm-operator/internal/consts" - . "nebius.ai/slurm-operator/internal/webhook/v1alpha1" -) - -func TestDefaultSlurmClusterRef(t *testing.T) { - defaulter := &NodeSetCustomDefaulter{ - Client: newClientBuilder().Build(), - } - const ( - testNamespace = "default" - testNodeSetName = "test-nodeset" - testClusterName = "test-cluster" - ) - - t.Run("Parental cluster ref exists", func(t *testing.T) { - obj := &slurmv1alpha1.NodeSet{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testNodeSetName, - Annotations: map[string]string{ - consts.AnnotationParentalClusterRefName: testClusterName, - }, - }, - } - - err := defaulter.Default(context.Background(), obj) - assert.NoError(t, err) - }) - - t.Run("Parental cluster ref is set", func(t *testing.T) { - defer func() { - defaulter.Client = newClientBuilder().Build() - }() - - defaulter.Client = newClientBuilder(). - WithObjects(&slurmv1.SlurmCluster{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testClusterName, - }, - }). - Build() - - obj := &slurmv1alpha1.NodeSet{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testClusterName, - }, - } - - err := defaulter.Default(context.Background(), obj) - assert.NoError(t, err) - assert.Equal(t, testClusterName, obj.GetAnnotations()[consts.AnnotationParentalClusterRefName]) - }) - - t.Run("Parental cluster doesn't exist", func(t *testing.T) { - obj := &slurmv1alpha1.NodeSet{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: testClusterName, - }, - } - - err := defaulter.Default(context.Background(), obj) - assert.Error(t, err) - }) -} - -func newClientBuilder() *fake.ClientBuilder { - scheme := runtime.NewScheme() - _ = slurmv1.AddToScheme(scheme) - _ = slurmv1alpha1.AddToScheme(scheme) - return fake.NewClientBuilder(). 
- WithScheme(scheme) -} From 7d1c14d39931d8966a3aca80537d79b7bcf9e15d Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Wed, 8 Apr 2026 10:34:35 +0200 Subject: [PATCH 2/8] fixes --- cmd/sconfigcontroller/main.go | 1 + .../tests/exporter-rbac_test.yaml | 8 +-- .../jailedconfig_controller.go | 10 +++- .../jailedconfig_controller_test.go | 1 + .../activecheck_prolog_controller.go | 27 +++++++-- .../workertopology_controller.go | 57 +++++++++++-------- .../workertopology_controller_update_test.go | 4 +- internal/render/common/configmap.go | 3 +- .../resourcegetter/workload_name_prefix.go | 9 ++- 9 files changed, 80 insertions(+), 40 deletions(-) diff --git a/cmd/sconfigcontroller/main.go b/cmd/sconfigcontroller/main.go index 0a5e4b4e0..1da45c012 100644 --- a/cmd/sconfigcontroller/main.go +++ b/cmd/sconfigcontroller/main.go @@ -215,6 +215,7 @@ func main() { if err = (sconfigcontroller.NewJailedConfigReconciler( mgr.GetClient(), mgr.GetScheme(), + clusterName, slurmAPIClient, jailFs, reconfigurePollInterval, diff --git a/helm/slurm-cluster/tests/exporter-rbac_test.yaml b/helm/slurm-cluster/tests/exporter-rbac_test.yaml index 413d76477..9f88ae08b 100644 --- a/helm/slurm-cluster/tests/exporter-rbac_test.yaml +++ b/helm/slurm-cluster/tests/exporter-rbac_test.yaml @@ -15,7 +15,7 @@ tests: of: ServiceAccount - equal: path: metadata.name - value: slurm-exporter-sa + value: test-cluster-exporter-sa - equal: path: metadata.namespace value: NAMESPACE @@ -114,7 +114,7 @@ tests: path: subjects content: kind: ServiceAccount - name: slurm-exporter-sa + name: test-cluster-exporter-sa namespace: NAMESPACE # Test RoleBinding with custom ServiceAccount name @@ -162,7 +162,7 @@ tests: asserts: - equal: path: spec.slurmNodes.exporter.serviceAccountName - value: slurm-exporter-sa + value: test-cluster-exporter-sa # Test SlurmCluster CR uses custom ServiceAccount name - it: should set custom serviceAccountName in SlurmCluster CR @@ -207,4 +207,4 @@ tests: asserts: - equal: path: spec.slurmNodes.exporter.serviceAccountName - value: slurm-exporter-sa + value: test-cluster-exporter-sa diff --git a/internal/controller/sconfigcontroller/jailedconfig_controller.go b/internal/controller/sconfigcontroller/jailedconfig_controller.go index 68576d317..212870fc5 100644 --- a/internal/controller/sconfigcontroller/jailedconfig_controller.go +++ b/internal/controller/sconfigcontroller/jailedconfig_controller.go @@ -62,6 +62,7 @@ type JailedConfigReconciler struct { client.Client Scheme *runtime.Scheme + clusterName string slurmAPIClient slurmapi.Client clock Clock fs Fs @@ -327,11 +328,14 @@ func (r *JailedConfigReconciler) reconcileIndividual(ctx context.Context, jailed func (r *JailedConfigReconciler) reconcileWithAggregation(ctx context.Context, jailedConfig *slurmv1alpha1.JailedConfig, aggregationKey string) (ctrl.Result, error) { logger := logf.FromContext(ctx) - // Get all JailedConfigs with the same aggregation key in the same namespace + // Get all JailedConfigs with the same aggregation key and cluster name jailedConfigs := &slurmv1alpha1.JailedConfigList{} err := r.Client.List(ctx, jailedConfigs, client.InNamespace(jailedConfig.Namespace), - client.MatchingLabels{consts.LabelJailedAggregationKey: aggregationKey}, + client.MatchingLabels{ + consts.LabelJailedAggregationKey: aggregationKey, + consts.LabelInstanceKey: r.clusterName, + }, ) if err != nil { return ctrl.Result{}, fmt.Errorf("listing JailedConfigs with aggregation key %q: %w", aggregationKey, err) @@ -557,6 +561,7 @@ func (r *JailedConfigReconciler) 
shouldInitializeConditions(ctx context.Context, func NewJailedConfigReconciler( client client.Client, scheme *runtime.Scheme, + clusterName string, slurmAPIClient slurmapi.Client, fs Fs, reconfigurePollInterval time.Duration, @@ -571,6 +576,7 @@ func NewJailedConfigReconciler( return &JailedConfigReconciler{ Client: client, Scheme: scheme, + clusterName: clusterName, slurmAPIClient: slurmAPIClient, fs: fs, reconfigurePollInterval: reconfigurePollInterval, diff --git a/internal/controller/sconfigcontroller/jailedconfig_controller_test.go b/internal/controller/sconfigcontroller/jailedconfig_controller_test.go index 1b57031ca..d1b340906 100644 --- a/internal/controller/sconfigcontroller/jailedconfig_controller_test.go +++ b/internal/controller/sconfigcontroller/jailedconfig_controller_test.go @@ -87,6 +87,7 @@ func newTestJailedConfigController( sctrl := NewJailedConfigReconciler( mgr.GetClient(), mgr.GetScheme(), + "test-cluster", apiClient, fakeFs, 1*time.Second, // Poll interval for tests diff --git a/internal/controller/soperatorchecks/activecheck_prolog_controller.go b/internal/controller/soperatorchecks/activecheck_prolog_controller.go index 1bab0b1f2..33657936e 100644 --- a/internal/controller/soperatorchecks/activecheck_prolog_controller.go +++ b/internal/controller/soperatorchecks/activecheck_prolog_controller.go @@ -6,6 +6,7 @@ import ( "time" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" @@ -21,6 +22,7 @@ import ( "nebius.ai/slurm-operator/api/v1alpha1" "nebius.ai/slurm-operator/internal/controller/reconciler" "nebius.ai/slurm-operator/internal/controllerconfig" + "nebius.ai/slurm-operator/internal/utils/resourcegetter" ) var ( @@ -85,7 +87,22 @@ func (r *ActiveCheckPrologReconciler) Reconcile( "Starting reconciliation", "SlurmCluster", req.Name, "Namespace", req.Namespace, ) - if err := r.updatePrologConfigMap(ctx, req.Namespace, r.getPrologScript()); err != nil { + slurmCluster := &slurmv1.SlurmCluster{} + if err := r.Client.Get(ctx, req.NamespacedName, slurmCluster); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return DefaultRequeueResult, err + } + + namePrefix, err := resourcegetter.ResolveWorkloadNamePrefix(ctx, r.Client, req.Namespace, req.Name) + if err != nil { + logger.Error(err, "Failed to resolve workload name prefix") + return DefaultRequeueResult, err + } + resourceName := resourcegetter.BuildPrefixedName(namePrefix, consts.ConfigMapNameActiveCheckPrologScript) + + if err := r.updatePrologConfigMap(ctx, req.Namespace, resourceName, r.getPrologScript()); err != nil { logger.Error(err, "Failed to update ConfigMap with active check prolog script") return DefaultRequeueResult, nil } @@ -94,14 +111,14 @@ func (r *ActiveCheckPrologReconciler) Reconcile( return DefaultRequeueResult, nil } -func (r *ActiveCheckPrologReconciler) updatePrologConfigMap(ctx context.Context, namespace string, config string) error { +func (r *ActiveCheckPrologReconciler) updatePrologConfigMap(ctx context.Context, namespace, resourceName, config string) error { configMap := &corev1.ConfigMap{ TypeMeta: ctrl.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.Version, Kind: "ConfigMap", }, ObjectMeta: ctrl.ObjectMeta{ - Name: consts.ConfigMapNameActiveCheckPrologScript, + Name: resourceName, Namespace: namespace, }, Data: map[string]string{ @@ -122,12 +139,12 @@ func (r *ActiveCheckPrologReconciler) updatePrologConfigMap(ctx context.Context, Kind: 
"JailedConfig", }, ObjectMeta: ctrl.ObjectMeta{ - Name: consts.ConfigMapNameActiveCheckPrologScript, + Name: resourceName, Namespace: namespace, }, Spec: v1alpha1.JailedConfigSpec{ ConfigMap: v1alpha1.ConfigMapReference{ - Name: consts.ConfigMapNameActiveCheckPrologScript, + Name: resourceName, }, Items: []corev1.KeyToPath{ { diff --git a/internal/controller/topologyconfcontroller/workertopology_controller.go b/internal/controller/topologyconfcontroller/workertopology_controller.go index 22f938466..06f053de9 100644 --- a/internal/controller/topologyconfcontroller/workertopology_controller.go +++ b/internal/controller/topologyconfcontroller/workertopology_controller.go @@ -91,6 +91,12 @@ func (r *WorkerTopologyReconciler) Reconcile(ctx context.Context, req ctrl.Reque return DefaultRequeueResult, nil } + namePrefix, err := resourcegetter.ResolveWorkloadNamePrefix(ctx, r.Client, req.Namespace, slurmCluster.Name) + if err != nil { + return ctrl.Result{}, fmt.Errorf("resolve workload name prefix: %w", err) + } + topoConfigName := resourcegetter.BuildPrefixedName(namePrefix, consts.ConfigMapNameTopologyConfig) + logger.V(1).Info("Fetching nodeSetList for SlurmCluster") nodeSetList, err := resourcegetter.ListNodeSetsByClusterRef( ctx, r.Client, types.NamespacedName{Namespace: req.Namespace, Name: slurmCluster.Name}, @@ -101,7 +107,7 @@ func (r *WorkerTopologyReconciler) Reconcile(ctx context.Context, req ctrl.Reque logger.V(1).Info("Fetched NodeSets for SlurmCluster", "count", len(nodeSetList)) - existingTopologyConfig, err := r.EnsureWorkerTopologyConfigMap(ctx, req.Namespace, logger) + existingTopologyConfig, err := r.EnsureWorkerTopologyConfigMap(ctx, req.Namespace, topoConfigName, slurmCluster.Name, logger) if err != nil { return ctrl.Result{}, fmt.Errorf("ensure worker topology ConfigMap: %w", err) } @@ -122,13 +128,13 @@ func (r *WorkerTopologyReconciler) Reconcile(ctx context.Context, req ctrl.Reque if desiredHash == existingHash { logger.Info("Topology config unchanged, skipping update") - if err := r.ensureJailedConfig(ctx, req.Namespace); err != nil { + if err := r.ensureJailedConfig(ctx, req.Namespace, topoConfigName, slurmCluster.Name); err != nil { return ctrl.Result{}, fmt.Errorf("ensure JailedConfig: %w", err) } return DefaultRequeueResult, nil } - if err := r.updateTopologyConfigMap(ctx, req.Namespace, desiredTopology); err != nil { + if err := r.updateTopologyConfigMap(ctx, req.Namespace, topoConfigName, desiredTopology, slurmCluster.Name); err != nil { logger.Error(err, "Update ConfigMap with topology config") return ctrl.Result{}, fmt.Errorf("update ConfigMap with topology config: %w", err) } @@ -145,10 +151,10 @@ func isClusterReconciliationNeeded(slurmCluster *slurmv1.SlurmCluster) bool { // EnsureWorkerTopologyConfigMap checks if the topology ConfigMap and JailedConfig exist, and creates them if they don't. 
func (r *WorkerTopologyReconciler) EnsureWorkerTopologyConfigMap( - ctx context.Context, namespace string, logger logr.Logger, + ctx context.Context, namespace, resourceName, clusterName string, logger logr.Logger, ) (*corev1.ConfigMap, error) { - configMapKey := client.ObjectKey{Name: consts.ConfigMapNameTopologyConfig, Namespace: namespace} - jailedConfigKey := client.ObjectKey{Name: consts.ConfigMapNameTopologyConfig, Namespace: namespace} + configMapKey := client.ObjectKey{Name: resourceName, Namespace: namespace} + jailedConfigKey := client.ObjectKey{Name: resourceName, Namespace: namespace} configMap := &corev1.ConfigMap{} configMapExists := true @@ -158,7 +164,7 @@ func (r *WorkerTopologyReconciler) EnsureWorkerTopologyConfigMap( configMapExists = false logger.Info("Worker topology ConfigMap not found") } else { - return nil, fmt.Errorf("get ConfigMap %s: %w", consts.ConfigMapNameTopologyConfig, err) + return nil, fmt.Errorf("get ConfigMap %s: %w", resourceName, err) } } @@ -170,7 +176,7 @@ func (r *WorkerTopologyReconciler) EnsureWorkerTopologyConfigMap( jailedConfigExists = false logger.Info("Worker topology JailedConfig not found") } else { - return nil, fmt.Errorf("get JailedConfig %s: %w", consts.ConfigMapNameTopologyConfig, err) + return nil, fmt.Errorf("get JailedConfig %s: %w", resourceName, err) } } @@ -179,7 +185,7 @@ func (r *WorkerTopologyReconciler) EnsureWorkerTopologyConfigMap( "configMapExists", configMapExists, "jailedConfigExists", jailedConfigExists) - if err = r.createDefaultTopologyResources(ctx, namespace); err != nil { + if err = r.createDefaultTopologyResources(ctx, namespace, resourceName, clusterName); err != nil { return nil, fmt.Errorf("create default topology resources in namespace %q: %w", namespace, err) } @@ -197,18 +203,18 @@ func (r *WorkerTopologyReconciler) EnsureWorkerTopologyConfigMap( // createDefaultTopologyResources creates the default topology ConfigMap and JailedConfig with a basic topology configuration. 
func (r *WorkerTopologyReconciler) createDefaultTopologyResources( - ctx context.Context, namespace string, + ctx context.Context, namespace, resourceName, clusterName string, ) error { defaultTopology := "SwitchName=root" - configMap := r.renderTopologyConfigMap(namespace, defaultTopology) + configMap := r.renderTopologyConfigMap(namespace, resourceName, defaultTopology) err := r.Client.Create(ctx, configMap) if err != nil && !apierrors.IsAlreadyExists(err) { return fmt.Errorf("create ConfigMap %s: %w", configMap.Name, err) } - jailedConfig := r.renderTopologyJailedConfig(namespace) + jailedConfig := r.renderTopologyJailedConfig(namespace, resourceName, clusterName) err = r.Client.Create(ctx, jailedConfig) if err != nil && !apierrors.IsAlreadyExists(err) { return fmt.Errorf("create JailedConfig %s: %w", jailedConfig.Name, err) @@ -217,14 +223,14 @@ func (r *WorkerTopologyReconciler) createDefaultTopologyResources( return nil } -func (r *WorkerTopologyReconciler) renderTopologyConfigMap(namespace string, config string) *corev1.ConfigMap { +func (r *WorkerTopologyReconciler) renderTopologyConfigMap(namespace, resourceName, config string) *corev1.ConfigMap { return &corev1.ConfigMap{ TypeMeta: ctrl.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ConfigMap", }, ObjectMeta: ctrl.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: resourceName, Namespace: namespace, }, Data: map[string]string{ @@ -233,22 +239,23 @@ func (r *WorkerTopologyReconciler) renderTopologyConfigMap(namespace string, con } } -func (r *WorkerTopologyReconciler) renderTopologyJailedConfig(namespace string) *v1alpha1.JailedConfig { +func (r *WorkerTopologyReconciler) renderTopologyJailedConfig(namespace, resourceName, clusterName string) *v1alpha1.JailedConfig { return &v1alpha1.JailedConfig{ TypeMeta: ctrl.TypeMeta{ APIVersion: v1alpha1.GroupVersion.String(), Kind: "JailedConfig", }, ObjectMeta: ctrl.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: resourceName, Namespace: namespace, Labels: map[string]string{ consts.LabelJailedAggregationKey: consts.LabelJailedAggregationCommonValue, + consts.LabelInstanceKey: clusterName, }, }, Spec: v1alpha1.JailedConfigSpec{ ConfigMap: v1alpha1.ConfigMapReference{ - Name: consts.ConfigMapNameTopologyConfig, + Name: resourceName, }, Items: []corev1.KeyToPath{ { @@ -423,15 +430,15 @@ func (r *WorkerTopologyReconciler) calculateConfigHash(config string) string { return hex.EncodeToString(hash[:]) } -func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, namespace string, config string) error { - configMapKey := client.ObjectKey{Name: consts.ConfigMapNameTopologyConfig, Namespace: namespace} +func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, namespace, resourceName, config, clusterName string) error { + configMapKey := client.ObjectKey{Name: resourceName, Namespace: namespace} existingConfigMap := &corev1.ConfigMap{} err := r.Client.Get(ctx, configMapKey, existingConfigMap) if err != nil { if apierrors.IsNotFound(err) { cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ - Name: consts.ConfigMapNameTopologyConfig, + Name: resourceName, Namespace: namespace, }, Data: map[string]string{ @@ -439,10 +446,10 @@ func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, }, } if err := r.Client.Create(ctx, cm); err != nil { - return fmt.Errorf("create ConfigMap %s: %w", consts.ConfigMapNameTopologyConfig, err) + return fmt.Errorf("create ConfigMap %s: %w", resourceName, err) 
} } else { - return fmt.Errorf("get ConfigMap %s: %w", consts.ConfigMapNameTopologyConfig, err) + return fmt.Errorf("get ConfigMap %s: %w", resourceName, err) } } else { existingConfigMap.Data[consts.ConfigMapKeyTopologyConfig] = config @@ -451,7 +458,7 @@ func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, } } - if err := r.ensureJailedConfig(ctx, namespace); err != nil { + if err := r.ensureJailedConfig(ctx, namespace, resourceName, clusterName); err != nil { return fmt.Errorf("ensure JailedConfig: %w", err) } @@ -460,8 +467,8 @@ func (r *WorkerTopologyReconciler) updateTopologyConfigMap(ctx context.Context, // ensureJailedConfig ensures the JailedConfig for topology exists and matches the desired state. // If it doesn't exist, it creates one. If it exists, it updates the spec to match desired. -func (r *WorkerTopologyReconciler) ensureJailedConfig(ctx context.Context, namespace string) error { - desired := r.renderTopologyJailedConfig(namespace) +func (r *WorkerTopologyReconciler) ensureJailedConfig(ctx context.Context, namespace, resourceName, clusterName string) error { + desired := r.renderTopologyJailedConfig(namespace, resourceName, clusterName) existing := &v1alpha1.JailedConfig{} err := r.Client.Get(ctx, client.ObjectKeyFromObject(desired), existing) diff --git a/internal/controller/topologyconfcontroller/workertopology_controller_update_test.go b/internal/controller/topologyconfcontroller/workertopology_controller_update_test.go index 97a09641f..e390ffcac 100644 --- a/internal/controller/topologyconfcontroller/workertopology_controller_update_test.go +++ b/internal/controller/topologyconfcontroller/workertopology_controller_update_test.go @@ -162,7 +162,7 @@ func TestWorkerTopologyReconciler_updateTopologyConfigMap_Fixed(t *testing.T) { } ctx := context.Background() - err := reconciler.updateTopologyConfigMap(ctx, namespace, "new-topology-config") + err := reconciler.updateTopologyConfigMap(ctx, namespace, consts.ConfigMapNameTopologyConfig, "new-topology-config", "test-cluster") if tt.expectedError { assert.Error(t, err) @@ -209,7 +209,7 @@ func TestWorkerTopologyReconciler_renderTopologyConfigMap(t *testing.T) { namespace := "test-namespace" config := "SwitchName=root" - cm := r.renderTopologyConfigMap(namespace, config) + cm := r.renderTopologyConfigMap(namespace, consts.ConfigMapNameTopologyConfig, config) assert.Equal(t, consts.ConfigMapNameTopologyConfig, cm.Name) assert.Equal(t, namespace, cm.Namespace) diff --git a/internal/render/common/configmap.go b/internal/render/common/configmap.go index 90991cc92..6c90cc3b5 100644 --- a/internal/render/common/configmap.go +++ b/internal/render/common/configmap.go @@ -250,7 +250,8 @@ func generateSlurmConfig(cluster *values.SlurmCluster) renderutils.ConfigFile { { svcName := cluster.NodeController.Service.Name - res.AddProperty("SlurmctldHost", fmt.Sprintf("%s(%s)", "controller-0", svcName)) + controllerHostname := cluster.NodeController.StatefulSet.Name + "-0" + res.AddProperty("SlurmctldHost", fmt.Sprintf("%s(%s)", controllerHostname, svcName)) } res.AddComment("") diff --git a/internal/utils/resourcegetter/workload_name_prefix.go b/internal/utils/resourcegetter/workload_name_prefix.go index 3161dc862..19ea98751 100644 --- a/internal/utils/resourcegetter/workload_name_prefix.go +++ b/internal/utils/resourcegetter/workload_name_prefix.go @@ -48,5 +48,12 @@ func hasClusterLabel(obj client.Object, clusterName string) bool { if labels == nil { return false } - return labels["app.kubernetes.io/instance"] == 
clusterName + return labels[consts.LabelInstanceKey] == clusterName +} + +func BuildPrefixedName(prefix, base string) string { + if prefix == "" { + return base + } + return prefix + "-" + base } From e8aca52d30d7b457f6d7c20476f9fea0ac24cdc8 Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Wed, 8 Apr 2026 16:13:27 +0200 Subject: [PATCH 3/8] test-fixes --- internal/values/slurm_controller_test.go | 6 +++--- internal/values/slurm_login_test.go | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/values/slurm_controller_test.go b/internal/values/slurm_controller_test.go index 5480d1022..f779534eb 100644 --- a/internal/values/slurm_controller_test.go +++ b/internal/values/slurm_controller_test.go @@ -21,7 +21,7 @@ func TestBuildSlurmControllerFrom_SSSD(t *testing.T) { Volumes: slurmv1.SlurmNodeControllerVolumes{Spool: slurmv1.NodeVolume{}, Jail: slurmv1.NodeVolume{}}, } - result := buildSlurmControllerFrom("test-cluster", nil, controller) + result := buildSlurmControllerFrom("test-cluster", "", nil, controller) if assert.NotNil(t, result.ContainerSSSD) { assert.Equal(t, "sssd-image", result.ContainerSSSD.Image) @@ -42,7 +42,7 @@ func TestBuildSlurmControllerFrom_SSSD(t *testing.T) { Volumes: slurmv1.SlurmNodeControllerVolumes{Spool: slurmv1.NodeVolume{}, Jail: slurmv1.NodeVolume{}}, } - result := buildSlurmControllerFrom("test-cluster", nil, controller) + result := buildSlurmControllerFrom("test-cluster", "", nil, controller) assert.False(t, result.IsSSSDSecretDefault) assert.Equal(t, "custom-controller-sssd-secret", result.SSSDConfSecretName) @@ -55,7 +55,7 @@ func TestBuildSlurmControllerFrom_SSSD(t *testing.T) { Volumes: slurmv1.SlurmNodeControllerVolumes{Spool: slurmv1.NodeVolume{}, Jail: slurmv1.NodeVolume{}}, } - result := buildSlurmControllerFrom("test-cluster", nil, controller) + result := buildSlurmControllerFrom("test-cluster", "", nil, controller) assert.Nil(t, result.ContainerSSSD) assert.Equal(t, naming.BuildSecretSSSDConfName("test-cluster"), result.SSSDConfSecretName) diff --git a/internal/values/slurm_login_test.go b/internal/values/slurm_login_test.go index e51f6d3da..87497dabc 100644 --- a/internal/values/slurm_login_test.go +++ b/internal/values/slurm_login_test.go @@ -23,7 +23,7 @@ func TestBuildSlurmLoginFrom_SSSD(t *testing.T) { Volumes: slurmv1.SlurmNodeLoginVolumes{Jail: slurmv1.NodeVolume{}}, } - result := buildSlurmLoginFrom("test-cluster", nil, login, false) + result := buildSlurmLoginFrom("test-cluster", "", nil, login, false) if assert.NotNil(t, result.ContainerSSSD) { assert.Equal(t, "sssd-image", result.ContainerSSSD.Image) @@ -44,7 +44,7 @@ func TestBuildSlurmLoginFrom_SSSD(t *testing.T) { Volumes: slurmv1.SlurmNodeLoginVolumes{Jail: slurmv1.NodeVolume{}}, } - result := buildSlurmLoginFrom("test-cluster", nil, login, false) + result := buildSlurmLoginFrom("test-cluster", "", nil, login, false) assert.False(t, result.IsSSSDSecretDefault) assert.Equal(t, "custom-sssd-secret", result.SSSDConfSecretName) @@ -57,7 +57,7 @@ func TestBuildSlurmLoginFrom_SSSD(t *testing.T) { Volumes: slurmv1.SlurmNodeLoginVolumes{Jail: slurmv1.NodeVolume{}}, } - result := buildSlurmLoginFrom("test-cluster", nil, login, false) + result := buildSlurmLoginFrom("test-cluster", "", nil, login, false) assert.Nil(t, result.ContainerSSSD) assert.Equal(t, naming.BuildSecretSSSDConfName("test-cluster"), result.SSSDConfSecretName) @@ -72,7 +72,7 @@ func TestBuildSlurmLoginFrom_PreservesSSSHDConfigDefault(t *testing.T) { Volumes: 
slurmv1.SlurmNodeLoginVolumes{Jail: slurmv1.NodeVolume{}}, } - result := buildSlurmLoginFrom("test-cluster", nil, login, false) + result := buildSlurmLoginFrom("test-cluster", "", nil, login, false) assert.Equal(t, naming.BuildConfigMapSSHDConfigsNameLogin("test-cluster"), result.SSHDConfigMapName) assert.True(t, result.IsSSHDConfigMapDefault) From aafe2851a3d43da971fcecfcb7048d8493161e7c Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Tue, 14 Apr 2026 19:48:45 +0200 Subject: [PATCH 4/8] fixes --- internal/controller/nodesetcontroller/reconcile.go | 7 ++++++- .../topologyconfcontroller/workertopology_controller.go | 9 +++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/internal/controller/nodesetcontroller/reconcile.go b/internal/controller/nodesetcontroller/reconcile.go index ab6d04484..156418fac 100644 --- a/internal/controller/nodesetcontroller/reconcile.go +++ b/internal/controller/nodesetcontroller/reconcile.go @@ -283,7 +283,12 @@ func (r NodeSetReconciler) executeReconciliation( stepLogger := log.FromContext(stepCtx) stepLogger.V(1).Info("Reconciling") - clusterValues, err := values.BuildSlurmClusterFrom(stepCtx, cluster, cluster.Name) + namePrefix, err := resourcegetter.ResolveWorkloadNamePrefix(stepCtx, r.Client, cluster.Namespace, cluster.Name) + if err != nil { + stepLogger.Error(err, "Failed to resolve workload name prefix") + return fmt.Errorf("resolving workload name prefix: %w", err) + } + clusterValues, err := values.BuildSlurmClusterFrom(stepCtx, cluster, namePrefix) if err != nil { stepLogger.Error(err, "Failed to build cluster values") return fmt.Errorf("building cluster values: %w", err) diff --git a/internal/controller/topologyconfcontroller/workertopology_controller.go b/internal/controller/topologyconfcontroller/workertopology_controller.go index 06f053de9..4ad4b47a4 100644 --- a/internal/controller/topologyconfcontroller/workertopology_controller.go +++ b/internal/controller/topologyconfcontroller/workertopology_controller.go @@ -495,6 +495,11 @@ func (r *WorkerTopologyReconciler) ensureJailedConfig(ctx context.Context, names func (r *WorkerTopologyReconciler) SetupWithManager(mgr ctrl.Manager, maxConcurrency int, cacheSyncTimeout time.Duration) error { + isTopologyConfigName := func(name string) bool { + return name == consts.ConfigMapNameTopologyConfig || + strings.HasSuffix(name, "-"+consts.ConfigMapNameTopologyConfig) + } + return ctrl.NewControllerManagedBy(mgr).Named(WorkerTopologyReconcilerName). 
For(&slurmv1.SlurmCluster{}, builder.WithPredicates(predicate.Funcs{ CreateFunc: func(e event.CreateEvent) bool { @@ -533,7 +538,7 @@ func (r *WorkerTopologyReconciler) SetupWithManager(mgr ctrl.Manager, return false }, DeleteFunc: func(e event.DeleteEvent) bool { - return e.Object.GetName() == consts.ConfigMapNameTopologyConfig + return isTopologyConfigName(e.Object.GetName()) }, UpdateFunc: func(e event.UpdateEvent) bool { return false @@ -549,7 +554,7 @@ func (r *WorkerTopologyReconciler) SetupWithManager(mgr ctrl.Manager, return false }, DeleteFunc: func(e event.DeleteEvent) bool { - return e.Object.GetName() == consts.ConfigMapNameTopologyConfig + return isTopologyConfigName(e.Object.GetName()) }, UpdateFunc: func(e event.UpdateEvent) bool { return false From bf415e81417b17a6d877ec566c0b3e13d52f6ae7 Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Mon, 20 Apr 2026 21:46:42 +0200 Subject: [PATCH 5/8] Fix generation issue --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 51a398505..6f4736c5f 100644 --- a/Makefile +++ b/Makefile @@ -156,6 +156,8 @@ helm: generate manifests kustomize helmify ## Update soperator Helm chart in_metadata && /^ name:/ {print; if (!done) {print " {{- if .Values.certManager.enabled }}"; print " annotations:"; print " cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include \"soperator.fullname\" . }}-serving-cert"; print " {{- end }}"; done=1}; next} \ in_metadata && /^ annotations:/ {next} \ in_metadata && /^ cert-manager/ {next} \ + in_metadata && /^ \{\{- if .Values.certManager.enabled \}\}/ {next} \ + in_metadata && /^ \{\{- end \}\}/ {next} \ in_metadata && /^ labels:/ {in_metadata=0} \ {print}' \ $(CHART_OPERATOR_PATH)/templates/mutating-webhook-configuration.yaml > $(CHART_OPERATOR_PATH)/templates/mutating-webhook-configuration.yaml.tmp && \ @@ -167,6 +169,8 @@ helm: generate manifests kustomize helmify ## Update soperator Helm chart in_metadata && /^ name:/ {print; if (!done) {print " {{- if .Values.certManager.enabled }}"; print " annotations:"; print " cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include \"soperator.fullname\" . }}-serving-cert"; print " {{- end }}"; done=1}; next} \ in_metadata && /^ annotations:/ {next} \ in_metadata && /^ cert-manager/ {next} \ + in_metadata && /^ \{\{- if .Values.certManager.enabled \}\}/ {next} \ + in_metadata && /^ \{\{- end \}\}/ {next} \ in_metadata && /^ labels:/ {in_metadata=0} \ {print}' \ $(CHART_OPERATOR_PATH)/templates/validating-webhook-configuration.yaml > $(CHART_OPERATOR_PATH)/templates/validating-webhook-configuration.yaml.tmp && \ From 3d4fe5e6061c52829c03e32bcc30669170494e88 Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Tue, 21 Apr 2026 17:14:32 +0200 Subject: [PATCH 6/8] remove old test --- .../mutating-webhook-configuration.yaml | 31 ------------------- 1 file changed, 31 deletions(-) delete mode 100644 helm/soperator/templates/mutating-webhook-configuration.yaml diff --git a/helm/soperator/templates/mutating-webhook-configuration.yaml b/helm/soperator/templates/mutating-webhook-configuration.yaml deleted file mode 100644 index dda0da1e4..000000000 --- a/helm/soperator/templates/mutating-webhook-configuration.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: admissionregistration.k8s.io/v1 -kind: MutatingWebhookConfiguration -metadata: - name: {{ include "soperator.fullname" . 
}}-mutating-webhook-configuration - {{- if .Values.certManager.enabled }} - annotations: - cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "soperator.fullname" . }}-serving-cert - {{- end }} - labels: - {{- include "soperator.labels" . | nindent 4 }} -webhooks: -- admissionReviewVersions: - - v1 - clientConfig: - service: - name: '{{ include "soperator.fullname" . }}-webhook-service' - namespace: '{{ .Release.Namespace }}' - path: /mutate-slurm-nebius-ai-v1alpha1-nodeset - failurePolicy: Fail - name: mnodeset-v1alpha1.kb.io - rules: - - apiGroups: - - slurm.nebius.ai - apiVersions: - - v1alpha1 - operations: - - CREATE - - UPDATE - resources: - - nodesets - sideEffects: None From f5026586485068b1479e7342fa7cd47a1fb20cce Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Thu, 23 Apr 2026 15:34:19 +0200 Subject: [PATCH 7/8] Fix activechecks --- .../scripts/retrigger-checks.py | 5 +- .../templates/_helpers.tpl | 16 ++ .../templates/active-checks.yaml | 8 +- .../run-extensive-check-on-reservations.yaml | 28 ++-- .../templates/wait-for-checks-job.yaml | 7 +- .../templates/wait-for-checks-role.yaml | 2 +- .../wait-for-checks-rolebinging.yaml | 6 +- .../templates/wait-for-checks-sa.yaml | 2 +- .../jailedconfig_controller.go | 4 + .../jailedconfig_controller_test.go | 4 + .../activecheck_prolog_controller.go | 7 +- .../activecheck_prolog_controller_test.go | 149 ++++++++++++++++++ .../workertopology_controller.go | 4 + internal/utils/resourcegetter/nodeset.go | 7 +- internal/values/slurm_exporter.go | 6 +- 15 files changed, 222 insertions(+), 33 deletions(-) create mode 100644 internal/controller/soperatorchecks/activecheck_prolog_controller_test.go diff --git a/helm/soperator-activechecks/scripts/retrigger-checks.py b/helm/soperator-activechecks/scripts/retrigger-checks.py index 948cd8d68..417bdf6ed 100644 --- a/helm/soperator-activechecks/scripts/retrigger-checks.py +++ b/helm/soperator-activechecks/scripts/retrigger-checks.py @@ -6,6 +6,7 @@ import datetime NS = os.environ["NAMESPACE"] +SLURM_CLUSTER_REF_NAME = os.environ["SLURM_CLUSTER_REF_NAME"] logging.Formatter.converter = time.gmtime logging.basicConfig( @@ -27,8 +28,8 @@ def get_active_checks(): ])) active_checks = [] for it in data.get("items", []): - rac = it.get("spec", {}).get("runAfterCreation") - if rac: + spec = it.get("spec", {}) + if spec.get("runAfterCreation") and spec.get("slurmClusterRefName") == SLURM_CLUSTER_REF_NAME: active_checks.append(it["metadata"]["name"]) return active_checks diff --git a/helm/soperator-activechecks/templates/_helpers.tpl b/helm/soperator-activechecks/templates/_helpers.tpl index bf80515ba..5e847f12f 100644 --- a/helm/soperator-activechecks/templates/_helpers.tpl +++ b/helm/soperator-activechecks/templates/_helpers.tpl @@ -254,6 +254,22 @@ Otherwise: true for Kubernetes >= 1.33 (user namespaces stable, explicitly opt i {{- end -}} {{- end -}} +{{/* +Name for the run-extensive-check-on-reservations CronJob and its RBAC resources. +Prefixed with slurmClusterRefName to support multiple releases in the same namespace. +*/}} +{{- define "soperator-activechecks.extensiveCheckJobName" -}} +{{- printf "%s-run-extensive-check-on-reservations" .Values.slurmClusterRefName | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Name for the activecheck-waiter ServiceAccount and its RBAC resources. +Prefixed with slurmClusterRefName to support multiple releases in the same namespace. 
+*/}} +{{- define "soperator-activechecks.waitForChecksName" -}} +{{- printf "%s-activecheck-waiter" .Values.slurmClusterRefName | trunc 63 | trimSuffix "-" }} +{{- end }} + {{/* Validate that a check does not enable both commentSlurmNode and drainSlurmNode under `failureReactions` for a single check. Invoke with (dict "name" "" "vals" .Values) diff --git a/helm/soperator-activechecks/templates/active-checks.yaml b/helm/soperator-activechecks/templates/active-checks.yaml index 4f387fdea..803171d72 100644 --- a/helm/soperator-activechecks/templates/active-checks.yaml +++ b/helm/soperator-activechecks/templates/active-checks.yaml @@ -6,13 +6,15 @@ apiVersion: slurm.nebius.ai/v1alpha1 kind: ActiveCheck metadata: - name: {{ $name | quote }} + name: {{ printf "%s-%s" $root.Values.slurmClusterRefName $name | quote }} spec: checkType: {{ $check.checkType | quote }} - name: {{ $name | quote }} + name: {{ printf "%s-%s" $root.Values.slurmClusterRefName $name | quote }} {{- with $check.dependsOn }} dependsOn: -{{ toYaml . | indent 4 }} + {{- range . }} + - {{ printf "%s-%s" $root.Values.slurmClusterRefName . | quote }} + {{- end }} {{- end }} slurmClusterRefName: {{ $root.Values.slurmClusterRefName | quote }} {{- with $check.schedule }} diff --git a/helm/soperator-activechecks/templates/run-extensive-check-on-reservations.yaml b/helm/soperator-activechecks/templates/run-extensive-check-on-reservations.yaml index 2e3852b12..ad42b1467 100644 --- a/helm/soperator-activechecks/templates/run-extensive-check-on-reservations.yaml +++ b/helm/soperator-activechecks/templates/run-extensive-check-on-reservations.yaml @@ -1,7 +1,8 @@ +{{- if (index .Values.checks "extensive-check").enabled -}} apiVersion: batch/v1 kind: CronJob metadata: - name: "run-extensive-check-on-reservations" + name: {{ include "soperator-activechecks.extensiveCheckJobName" . | quote }} labels: app.kubernetes.io/component: soperatorchecks app.kubernetes.io/instance: soperator @@ -51,7 +52,7 @@ spec: - name: RESERVATION_PREFIX value: {{ (index .Values.checks "extensive-check").reservationPrefix | quote }} - name: TARGET_ACTIVE_CHECK_NAME - value: extensive-check + value: {{ printf "%s-extensive-check" .Values.slurmClusterRefName | quote }} volumeMounts: - mountPath: /mnt/slurm-configs name: slurm-configs @@ -101,8 +102,8 @@ spec: name: munge-socket dnsPolicy: ClusterFirst restartPolicy: Never - serviceAccount: run-extensive-check-on-reservations - serviceAccountName: run-extensive-check-on-reservations + serviceAccount: {{ include "soperator-activechecks.extensiveCheckJobName" . }} + serviceAccountName: {{ include "soperator-activechecks.extensiveCheckJobName" . }} schedulerName: default-scheduler terminationGracePeriodSeconds: 30 volumes: @@ -127,14 +128,14 @@ spec: apiVersion: v1 kind: ServiceAccount metadata: - name: run-extensive-check-on-reservations - namespace: soperator + name: {{ include "soperator-activechecks.extensiveCheckJobName" . }} + namespace: {{ .Release.Namespace }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: run-extensive-check-on-reservations - namespace: soperator + name: {{ include "soperator-activechecks.extensiveCheckJobName" . }} + namespace: {{ .Release.Namespace }} rules: - apiGroups: - batch @@ -155,13 +156,14 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: run-extensive-check-on-reservations - namespace: soperator + name: {{ include "soperator-activechecks.extensiveCheckJobName" . 
}} + namespace: {{ .Release.Namespace }} roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: run-extensive-check-on-reservations + name: {{ include "soperator-activechecks.extensiveCheckJobName" . }} subjects: - kind: ServiceAccount - name: run-extensive-check-on-reservations - namespace: soperator + name: {{ include "soperator-activechecks.extensiveCheckJobName" . }} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/helm/soperator-activechecks/templates/wait-for-checks-job.yaml b/helm/soperator-activechecks/templates/wait-for-checks-job.yaml index eba524458..5202a167e 100644 --- a/helm/soperator-activechecks/templates/wait-for-checks-job.yaml +++ b/helm/soperator-activechecks/templates/wait-for-checks-job.yaml @@ -1,7 +1,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: wait-for-active-checks + name: {{ printf "%s-wait-for-active-checks" .Values.slurmClusterRefName | trunc 63 | trimSuffix "-" }} annotations: "helm.sh/hook": post-install,post-upgrade "helm.sh/hook-delete-policy": hook-succeeded,hook-failed,before-hook-creation @@ -9,7 +9,7 @@ spec: template: spec: restartPolicy: Never - serviceAccountName: activecheck-waiter + serviceAccountName: {{ include "soperator-activechecks.waitForChecksName" . }} containers: - name: wait-for-active-checks image: alpine/k8s:1.31.11 @@ -23,11 +23,12 @@ spec: NAMESPACE="{{ .Release.Namespace }}" CRD_KIND="activechecks.slurm.nebius.ai" + SLURM_CLUSTER_REF_NAME="{{ .Values.slurmClusterRefName }}" echo "Fetching ActiveCheck list..." # Only wait for checks that run on creation and ignore flappy comment-only checks. active_check_names=$(kubectl get "$CRD_KIND" -n "$NAMESPACE" -o json \ - | jq -r '.items[] | select(.spec.runAfterCreation == true and .spec.failureReactions.commentSlurmNode == null) | .metadata.name') + | jq -r --arg cluster "$SLURM_CLUSTER_REF_NAME" '.items[] | select(.spec.runAfterCreation == true and .spec.failureReactions.commentSlurmNode == null and .spec.slurmClusterRefName == $cluster) | .metadata.name') if [ -z "$active_check_names" ]; then echo "No CRs with runAfterCreation=true found. Exiting." diff --git a/helm/soperator-activechecks/templates/wait-for-checks-role.yaml b/helm/soperator-activechecks/templates/wait-for-checks-role.yaml index ee447d3a8..688cc7749 100644 --- a/helm/soperator-activechecks/templates/wait-for-checks-role.yaml +++ b/helm/soperator-activechecks/templates/wait-for-checks-role.yaml @@ -1,7 +1,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: activecheck-waiter-role + name: {{ include "soperator-activechecks.waitForChecksName" . }}-role namespace: {{ .Release.Namespace }} rules: - apiGroups: ["slurm.nebius.ai"] diff --git a/helm/soperator-activechecks/templates/wait-for-checks-rolebinging.yaml b/helm/soperator-activechecks/templates/wait-for-checks-rolebinging.yaml index 6395ac2b7..a8ee6cf87 100644 --- a/helm/soperator-activechecks/templates/wait-for-checks-rolebinging.yaml +++ b/helm/soperator-activechecks/templates/wait-for-checks-rolebinging.yaml @@ -1,13 +1,13 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: activecheck-waiter-rolebinding + name: {{ include "soperator-activechecks.waitForChecksName" . }}-rolebinding namespace: {{ .Release.Namespace }} subjects: - kind: ServiceAccount - name: activecheck-waiter + name: {{ include "soperator-activechecks.waitForChecksName" . 
}} namespace: {{ .Release.Namespace }} roleRef: kind: Role - name: activecheck-waiter-role + name: {{ include "soperator-activechecks.waitForChecksName" . }}-role apiGroup: rbac.authorization.k8s.io diff --git a/helm/soperator-activechecks/templates/wait-for-checks-sa.yaml b/helm/soperator-activechecks/templates/wait-for-checks-sa.yaml index 6d4570e53..58983d6ab 100644 --- a/helm/soperator-activechecks/templates/wait-for-checks-sa.yaml +++ b/helm/soperator-activechecks/templates/wait-for-checks-sa.yaml @@ -1,5 +1,5 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: activecheck-waiter + name: {{ include "soperator-activechecks.waitForChecksName" . }} namespace: {{ .Release.Namespace }} diff --git a/internal/controller/sconfigcontroller/jailedconfig_controller.go b/internal/controller/sconfigcontroller/jailedconfig_controller.go index 212870fc5..33c37531f 100644 --- a/internal/controller/sconfigcontroller/jailedconfig_controller.go +++ b/internal/controller/sconfigcontroller/jailedconfig_controller.go @@ -190,6 +190,10 @@ func (r *JailedConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request func (r *JailedConfigReconciler) reconcileIndividual(ctx context.Context, jailedConfig *slurmv1alpha1.JailedConfig) (ctrl.Result, error) { logger := logf.FromContext(ctx) + if jailedConfig.Labels[consts.LabelInstanceKey] != r.clusterName { + return ctrl.Result{}, nil + } + err := r.shouldInitializeConditions(ctx, jailedConfig) if err != nil { return ctrl.Result{}, fmt.Errorf("initializing conditions: %w", err) diff --git a/internal/controller/sconfigcontroller/jailedconfig_controller_test.go b/internal/controller/sconfigcontroller/jailedconfig_controller_test.go index d1b340906..1f0aacca7 100644 --- a/internal/controller/sconfigcontroller/jailedconfig_controller_test.go +++ b/internal/controller/sconfigcontroller/jailedconfig_controller_test.go @@ -43,6 +43,7 @@ import ( v0041 "github.com/SlinkyProject/slurm-client/api/v0041" slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" + "nebius.ai/slurm-operator/internal/consts" fakes "nebius.ai/slurm-operator/internal/controller/sconfigcontroller/fake" slurmapifake "nebius.ai/slurm-operator/internal/slurmapi/fake" ) @@ -162,6 +163,9 @@ func prepareTest(t *testing.T, options ...testOption) (*JailedConfigReconciler, ObjectMeta: metav1.ObjectMeta{ Name: testJailedConfig, Namespace: testNamespace, + Labels: map[string]string{ + consts.LabelInstanceKey: "test-cluster", + }, }, Spec: slurmv1alpha1.JailedConfigSpec{ ConfigMap: slurmv1alpha1.ConfigMapReference{ diff --git a/internal/controller/soperatorchecks/activecheck_prolog_controller.go b/internal/controller/soperatorchecks/activecheck_prolog_controller.go index 33657936e..7b29fe24a 100644 --- a/internal/controller/soperatorchecks/activecheck_prolog_controller.go +++ b/internal/controller/soperatorchecks/activecheck_prolog_controller.go @@ -102,7 +102,7 @@ func (r *ActiveCheckPrologReconciler) Reconcile( } resourceName := resourcegetter.BuildPrefixedName(namePrefix, consts.ConfigMapNameActiveCheckPrologScript) - if err := r.updatePrologConfigMap(ctx, req.Namespace, resourceName, r.getPrologScript()); err != nil { + if err := r.updatePrologConfigMap(ctx, req.Namespace, req.Name, resourceName, r.getPrologScript()); err != nil { logger.Error(err, "Failed to update ConfigMap with active check prolog script") return DefaultRequeueResult, nil } @@ -111,7 +111,7 @@ func (r *ActiveCheckPrologReconciler) Reconcile( return DefaultRequeueResult, nil } -func (r *ActiveCheckPrologReconciler) 
updatePrologConfigMap(ctx context.Context, namespace, resourceName, config string) error { +func (r *ActiveCheckPrologReconciler) updatePrologConfigMap(ctx context.Context, namespace, clusterName, resourceName, config string) error { configMap := &corev1.ConfigMap{ TypeMeta: ctrl.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.Version, @@ -141,6 +141,9 @@ func (r *ActiveCheckPrologReconciler) updatePrologConfigMap(ctx context.Context, ObjectMeta: ctrl.ObjectMeta{ Name: resourceName, Namespace: namespace, + Labels: map[string]string{ + consts.LabelInstanceKey: clusterName, + }, }, Spec: v1alpha1.JailedConfigSpec{ ConfigMap: v1alpha1.ConfigMapReference{ diff --git a/internal/controller/soperatorchecks/activecheck_prolog_controller_test.go b/internal/controller/soperatorchecks/activecheck_prolog_controller_test.go new file mode 100644 index 000000000..c87615793 --- /dev/null +++ b/internal/controller/soperatorchecks/activecheck_prolog_controller_test.go @@ -0,0 +1,149 @@ +package soperatorchecks + +import ( + "context" + "testing" + + kruisev1b1 "github.com/openkruise/kruise-api/apps/v1beta1" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + slurmv1 "nebius.ai/slurm-operator/api/v1" + "nebius.ai/slurm-operator/api/v1alpha1" + "nebius.ai/slurm-operator/internal/consts" + "nebius.ai/slurm-operator/internal/controller/reconciler" +) + +const ( + prologTestNamespace = "test-ns" + prologTestClusterName = "test-cluster" +) + +func newPrologTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, slurmv1.AddToScheme(scheme)) + require.NoError(t, v1alpha1.AddToScheme(scheme)) + require.NoError(t, kruisev1b1.AddToScheme(scheme)) + return scheme +} + +func newPrologController(scheme *runtime.Scheme, objects ...client.Object) *ActiveCheckPrologReconciler { + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objects...).Build() + return &ActiveCheckPrologReconciler{ + Reconciler: reconciler.NewReconciler(fakeClient, scheme, record.NewFakeRecorder(10)), + } +} + +func prologRequest() ctrl.Request { + return ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: prologTestClusterName, + Namespace: prologTestNamespace, + }, + } +} + +func newSlurmClusterObj(namespace, name string) *slurmv1.SlurmCluster { + return &slurmv1.SlurmCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + } +} + +// legacyControllerSTS simulates a pre-existing cluster whose StatefulSet was created +// under the old unprefixed naming scheme. ResolveWorkloadNamePrefix returns "" for such clusters. +func legacyControllerSTS(namespace, clusterName string) *kruisev1b1.StatefulSet { + return &kruisev1b1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: consts.ComponentTypeController.String(), + Namespace: namespace, + Labels: map[string]string{ + consts.LabelInstanceKey: clusterName, + }, + }, + } +} + +// TestPrologReconcile_ClusterNotFound verifies that reconciliation against a missing +// SlurmCluster returns no error and creates no resources. 
+func TestPrologReconcile_ClusterNotFound(t *testing.T) { + scheme := newPrologTestScheme(t) + r := newPrologController(scheme) + + _, err := r.Reconcile(context.Background(), prologRequest()) + require.NoError(t, err) + + jcList := &v1alpha1.JailedConfigList{} + require.NoError(t, r.Client.List(context.Background(), jcList)) + require.Empty(t, jcList.Items) +} + +// TestPrologReconcile_NewCluster_PrefixedNameAndLabel verifies that for a new cluster +// (no legacy StatefulSet present) the JailedConfig and ConfigMap are created with a +// cluster-prefixed name and that LabelInstanceKey is set on the JailedConfig. +func TestPrologReconcile_NewCluster_PrefixedNameAndLabel(t *testing.T) { + scheme := newPrologTestScheme(t) + r := newPrologController(scheme, newSlurmClusterObj(prologTestNamespace, prologTestClusterName)) + + _, err := r.Reconcile(context.Background(), prologRequest()) + require.NoError(t, err) + + expectedName := prologTestClusterName + "-" + consts.ConfigMapNameActiveCheckPrologScript + + cm := &corev1.ConfigMap{} + require.NoError(t, r.Client.Get(context.Background(), types.NamespacedName{ + Name: expectedName, + Namespace: prologTestNamespace, + }, cm)) + require.Contains(t, cm.Data, consts.ConfigMapKeyActiveCheckPrologScript) + + jc := &v1alpha1.JailedConfig{} + require.NoError(t, r.Client.Get(context.Background(), types.NamespacedName{ + Name: expectedName, + Namespace: prologTestNamespace, + }, jc)) + require.Equal(t, prologTestClusterName, jc.Labels[consts.LabelInstanceKey]) + require.Equal(t, expectedName, jc.Spec.ConfigMap.Name) +} + +// TestPrologReconcile_LegacyCluster_UnprefixedNameAndLabel verifies that for a legacy +// cluster (controller StatefulSet exists with the old unprefixed name) the JailedConfig +// and ConfigMap are created without a name prefix, but LabelInstanceKey is still set. +func TestPrologReconcile_LegacyCluster_UnprefixedNameAndLabel(t *testing.T) { + scheme := newPrologTestScheme(t) + r := newPrologController(scheme, + newSlurmClusterObj(prologTestNamespace, prologTestClusterName), + legacyControllerSTS(prologTestNamespace, prologTestClusterName), + ) + + _, err := r.Reconcile(context.Background(), prologRequest()) + require.NoError(t, err) + + expectedName := consts.ConfigMapNameActiveCheckPrologScript + + cm := &corev1.ConfigMap{} + require.NoError(t, r.Client.Get(context.Background(), types.NamespacedName{ + Name: expectedName, + Namespace: prologTestNamespace, + }, cm)) + require.Contains(t, cm.Data, consts.ConfigMapKeyActiveCheckPrologScript) + + jc := &v1alpha1.JailedConfig{} + require.NoError(t, r.Client.Get(context.Background(), types.NamespacedName{ + Name: expectedName, + Namespace: prologTestNamespace, + }, jc)) + require.Equal(t, prologTestClusterName, jc.Labels[consts.LabelInstanceKey]) + require.Equal(t, expectedName, jc.Spec.ConfigMap.Name) +} diff --git a/internal/controller/topologyconfcontroller/workertopology_controller.go b/internal/controller/topologyconfcontroller/workertopology_controller.go index 4ad4b47a4..91d1c2aee 100644 --- a/internal/controller/topologyconfcontroller/workertopology_controller.go +++ b/internal/controller/topologyconfcontroller/workertopology_controller.go @@ -495,6 +495,10 @@ func (r *WorkerTopologyReconciler) ensureJailedConfig(ctx context.Context, names func (r *WorkerTopologyReconciler) SetupWithManager(mgr ctrl.Manager, maxConcurrency int, cacheSyncTimeout time.Duration) error { + // Matches both the legacy unprefixed name and any cluster-prefixed variant. 
+ // Intentionally broad: events from other clusters in the same namespace will + // trigger reconciliation, but the reconciler uses its own cluster-specific + // ConfigMap name so spurious triggers are harmless. isTopologyConfigName := func(name string) bool { return name == consts.ConfigMapNameTopologyConfig || strings.HasSuffix(name, "-"+consts.ConfigMapNameTopologyConfig) diff --git a/internal/utils/resourcegetter/nodeset.go b/internal/utils/resourcegetter/nodeset.go index 5405d482b..4a90735a7 100644 --- a/internal/utils/resourcegetter/nodeset.go +++ b/internal/utils/resourcegetter/nodeset.go @@ -11,6 +11,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" + "nebius.ai/slurm-operator/internal/consts" "nebius.ai/slurm-operator/internal/utils/sliceutils" ) @@ -28,7 +29,11 @@ func ListNodeSetsByClusterRef(ctx context.Context, r client.Reader, clusterRef t return slices.SortedFunc( sliceutils.FilterSliceSeq(nodeSetList.Items, func(nodeSet slurmv1alpha1.NodeSet) bool { - return nodeSet.Spec.ClusterName == clusterRef.Name + if nodeSet.Spec.ClusterName != "" { + return nodeSet.Spec.ClusterName == clusterRef.Name + } + // Fallback for NodeSets not yet migrated by the nodesetcontroller. + return nodeSet.GetAnnotations()[consts.AnnotationParentalClusterRefName] == clusterRef.Name }), func(a, b slurmv1alpha1.NodeSet) int { return strings.Compare(a.Name, b.Name) diff --git a/internal/values/slurm_exporter.go b/internal/values/slurm_exporter.go index f8f90be0d..37af49748 100644 --- a/internal/values/slurm_exporter.go +++ b/internal/values/slurm_exporter.go @@ -7,6 +7,7 @@ import ( slurmv1 "nebius.ai/slurm-operator/api/v1" "nebius.ai/slurm-operator/internal/consts" + "nebius.ai/slurm-operator/internal/utils/resourcegetter" ) type SlurmExporter struct { @@ -51,10 +52,7 @@ func buildSlurmExporterFrom(namePrefix string, maintenance *consts.MaintenanceMo enabled = *exporter.Enabled } - deploymentName := "slurm-exporter" - if namePrefix != "" { - deploymentName = namePrefix + "-" + deploymentName - } + deploymentName := resourcegetter.BuildPrefixedName(namePrefix, "slurm-exporter") return SlurmExporter{ SlurmNode: *exporter.SlurmNode.DeepCopy(), From e2e4f249b05e1c8ba0c192c310282fba588da918 Mon Sep 17 00:00:00 2001 From: Andrii Shestakov Date: Thu, 23 Apr 2026 18:11:06 +0200 Subject: [PATCH 8/8] fix helm test --- .../tests/hostusers_k8s_new_test.yaml | 2 +- .../tests/hostusers_k8s_old_test.yaml | 2 +- .../tests/k8sjob_spec_test.yaml | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/helm/soperator-activechecks/tests/hostusers_k8s_new_test.yaml b/helm/soperator-activechecks/tests/hostusers_k8s_new_test.yaml index 72fca6911..7e1f0c3a7 100644 --- a/helm/soperator-activechecks/tests/hostusers_k8s_new_test.yaml +++ b/helm/soperator-activechecks/tests/hostusers_k8s_new_test.yaml @@ -8,7 +8,7 @@ tests: - it: should set hostUsers true on Kubernetes >= 1.33 documentSelector: path: metadata.name - value: test-hostusers-new-k8s + value: test-cluster-test-hostusers-new-k8s set: slurmClusterRefName: test-cluster images: diff --git a/helm/soperator-activechecks/tests/hostusers_k8s_old_test.yaml b/helm/soperator-activechecks/tests/hostusers_k8s_old_test.yaml index 66d20620c..2d53c4a1b 100644 --- a/helm/soperator-activechecks/tests/hostusers_k8s_old_test.yaml +++ b/helm/soperator-activechecks/tests/hostusers_k8s_old_test.yaml @@ -8,7 +8,7 @@ tests: - it: should set hostUsers false on Kubernetes < 1.33 documentSelector: path: 
metadata.name - value: test-hostusers-old-k8s + value: test-cluster-test-hostusers-old-k8s set: slurmClusterRefName: test-cluster images: diff --git a/helm/soperator-activechecks/tests/k8sjob_spec_test.yaml b/helm/soperator-activechecks/tests/k8sjob_spec_test.yaml index 5f9107f52..b53dfc419 100644 --- a/helm/soperator-activechecks/tests/k8sjob_spec_test.yaml +++ b/helm/soperator-activechecks/tests/k8sjob_spec_test.yaml @@ -5,7 +5,7 @@ tests: - it: should render k8sJobSpec with correct structure documentSelector: path: metadata.name - value: test-k8sjob + value: test-cluster-test-k8sjob set: slurmClusterRefName: test-cluster images: @@ -47,7 +47,7 @@ tests: - it: should allow overriding k8sJob container image documentSelector: path: metadata.name - value: test-k8sjob-image-override + value: test-cluster-test-k8sjob-image-override set: slurmClusterRefName: test-cluster images: @@ -75,7 +75,7 @@ tests: - it: should not render appArmorProfile at k8sJobSpec level (regression test) documentSelector: path: metadata.name - value: test-apparmorprofile-placement + value: test-cluster-test-apparmorprofile-placement set: slurmClusterRefName: test-cluster images: @@ -103,7 +103,7 @@ tests: - it: should resolve NCCL tests version from map for CUDA 12.9.0 documentSelector: path: metadata.name - value: test-nccl-map-lookup + value: test-cluster-test-nccl-map-lookup set: slurmClusterRefName: test-cluster cudaVersion: "12.9.0" @@ -143,7 +143,7 @@ tests: - it: should resolve NCCL tests version from map for CUDA 13.0.2 documentSelector: path: metadata.name - value: test-nccl-map-lookup-alt + value: test-cluster-test-nccl-map-lookup-alt set: slurmClusterRefName: test-cluster cudaVersion: "13.0.2" @@ -186,7 +186,7 @@ tests: - it: should use flat override when ncclTestsVersion is set documentSelector: path: metadata.name - value: test-nccl-flat-override + value: test-cluster-test-nccl-flat-override set: slurmClusterRefName: test-cluster cudaVersion: "12.9.0" @@ -225,7 +225,7 @@ tests: - it: should allow explicit hostUsers false override documentSelector: path: metadata.name - value: test-hostusers-override-false + value: test-cluster-test-hostusers-override-false set: slurmClusterRefName: test-cluster hostUsers: false @@ -249,7 +249,7 @@ tests: - it: should allow explicit hostUsers true override documentSelector: path: metadata.name - value: test-hostusers-override-true + value: test-cluster-test-hostusers-override-true set: slurmClusterRefName: test-cluster hostUsers: true