Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ helm: generate manifests kustomize helmify ## Update soperator Helm chart
in_metadata && /^ name:/ {print; if (!done) {print " {{- if .Values.certManager.enabled }}"; print " annotations:"; print " cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include \"soperator.fullname\" . }}-serving-cert"; print " {{- end }}"; done=1}; next} \
in_metadata && /^ annotations:/ {next} \
in_metadata && /^ cert-manager/ {next} \
in_metadata && /^ \{\{- if .Values.certManager.enabled \}\}/ {next} \
in_metadata && /^ \{\{- end \}\}/ {next} \
in_metadata && /^ labels:/ {in_metadata=0} \
{print}' \
$(CHART_OPERATOR_PATH)/templates/mutating-webhook-configuration.yaml > $(CHART_OPERATOR_PATH)/templates/mutating-webhook-configuration.yaml.tmp && \
Expand All @@ -167,6 +169,8 @@ helm: generate manifests kustomize helmify ## Update soperator Helm chart
in_metadata && /^ name:/ {print; if (!done) {print " {{- if .Values.certManager.enabled }}"; print " annotations:"; print " cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include \"soperator.fullname\" . }}-serving-cert"; print " {{- end }}"; done=1}; next} \
in_metadata && /^ annotations:/ {next} \
in_metadata && /^ cert-manager/ {next} \
in_metadata && /^ \{\{- if .Values.certManager.enabled \}\}/ {next} \
in_metadata && /^ \{\{- end \}\}/ {next} \
in_metadata && /^ labels:/ {in_metadata=0} \
{print}' \
$(CHART_OPERATOR_PATH)/templates/validating-webhook-configuration.yaml > $(CHART_OPERATOR_PATH)/templates/validating-webhook-configuration.yaml.tmp && \
Expand Down
6 changes: 6 additions & 0 deletions api/v1alpha1/nodeset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@ const (

// NodeSetSpec defines the desired state of NodeSet
type NodeSetSpec struct {
// ClusterName is the name of the SlurmCluster this NodeSet belongs to.
// Must be in the same namespace as the NodeSet.
//
// +kubebuilder:validation:Optional
ClusterName string `json:"clusterName,omitempty"`

// Replicas specifies the number of worker nodes in the NodeSet.
//
// Defaults to 1 if not specified.
Expand Down
3 changes: 2 additions & 1 deletion cmd/sconfigcontroller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ func main() {
WebhookServer: webhookServer,
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "vqeyz6ae.nebius.ai",
LeaderElectionID: clusterName + ".vqeyz6ae.nebius.ai",
LeaderElectionReleaseOnCancel: true,
Cache: cache.Options{
DefaultNamespaces: map[string]cache.Config{
Expand Down Expand Up @@ -215,6 +215,7 @@ func main() {
if err = (sconfigcontroller.NewJailedConfigReconciler(
mgr.GetClient(),
mgr.GetScheme(),
clusterName,
slurmAPIClient,
jailFs,
reconfigurePollInterval,
Expand Down
5 changes: 5 additions & 0 deletions config/crd/bases/slurm.nebius.ai_nodesets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -970,6 +970,11 @@ spec:
x-kubernetes-list-type: atomic
type: object
type: object
clusterName:
description: |-
ClusterName is the name of the SlurmCluster this NodeSet belongs to.
Must be in the same namespace as the NodeSet.
type: string
configMapRefSshd:
description: |-
ConfigMapRefSSHD defines the config name of Slurm SSHD.
Expand Down
26 changes: 0 additions & 26 deletions config/webhook/manifests.yaml
Original file line number Diff line number Diff line change
@@ -1,31 +1,5 @@
---
apiVersion: admissionregistration.k8s.io/v1
kind: MutatingWebhookConfiguration
metadata:
name: mutating-webhook-configuration
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /mutate-slurm-nebius-ai-v1alpha1-nodeset
failurePolicy: Fail
name: mnodeset-v1alpha1.kb.io
rules:
- apiGroups:
- slurm.nebius.ai
apiVersions:
- v1alpha1
operations:
- CREATE
- UPDATE
resources:
- nodesets
sideEffects: None
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: validating-webhook-configuration
Expand Down
5 changes: 5 additions & 0 deletions helm/nodesets/templates/nodeset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ metadata:
{{- end }}

spec:
{{- with $.Values.clusterName }}
clusterName: {{ . | quote }}
{{- end }}
Comment thread
andriishestakov marked this conversation as resolved.

{{- with (.replicas | default 1) }}
replicas: {{ . }}
{{- end }}
Expand Down Expand Up @@ -134,6 +138,7 @@ spec:
security:
limitsConfig: {{ get (.security | default dict) "limitsConfig" | quote }}
appArmorProfile: {{ get (.security | default dict) "appArmorProfile" | default "unconfined" | quote }}
procMount: {{ get (.security | default dict) "procMount" | default "Default" | quote }}
{{- end }}

{{- with (required ".Values.nodesets[*].munge is required." .munge) }}
Expand Down
1 change: 1 addition & 0 deletions helm/nodesets/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Global settings
nameOverride: ""
fullnameOverride: ""
clusterName: ""
# Priority Classes configuration
# Define priority classes that can be used by NodeSets
priorityClasses:
Expand Down
7 changes: 6 additions & 1 deletion helm/slurm-cluster/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
{{- include "slurm-cluster.selectorLabels" . | trim | nindent 0 }}
{{- end }}

{{/* Name of the slurm-scripts ConfigMap */}}
{{- define "slurm-cluster.slurmScriptsCMName" -}}
{{- printf "%s-slurm-scripts" (include "slurm-cluster.name" .) -}}
{{- end -}}

{{- define "validateAccountingConfig" -}}
{{- if .Values.slurmNodes.accounting.enabled -}}
{{- if not (or .Values.slurmNodes.accounting.externalDB.enabled .Values.slurmNodes.accounting.mariadbOperator.enabled) -}}
Expand Down Expand Up @@ -62,7 +67,7 @@ Create the name of the service account to use for exporter
*/}}
{{- define "slurm-cluster.exporter.serviceAccountName" -}}
{{- if .Values.slurmNodes.exporter.serviceAccount.create -}}
{{- default "slurm-exporter-sa" .Values.slurmNodes.exporter.serviceAccount.name }}
{{- default (printf "%s-exporter-sa" (include "slurm-cluster.name" .)) .Values.slurmNodes.exporter.serviceAccount.name }}
{{- else -}}
{{- default "default" .Values.slurmNodes.exporter.serviceAccount.name }}
{{- end -}}
Expand Down
2 changes: 1 addition & 1 deletion helm/slurm-cluster/templates/slurm-cluster-cr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ spec:
volumeSources:
- name: slurm-scripts
configMap:
name: slurm-scripts
name: {{ include "slurm-cluster.slurmScriptsCMName" . }}
defaultMode: 0755
{{- range .Values.volumeSources }}
- name: {{ .name | quote }}
Expand Down
2 changes: 1 addition & 1 deletion helm/slurm-cluster/templates/slurm-scripts-cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
namespace: {{ .Release.Namespace }}
name: slurm-scripts
name: {{ include "slurm-cluster.slurmScriptsCMName" . }}
labels:
app: {{ .Chart.Name }}
release: {{ .Release.Name }}
Expand Down
8 changes: 4 additions & 4 deletions helm/slurm-cluster/tests/exporter-rbac_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ tests:
of: ServiceAccount
- equal:
path: metadata.name
value: slurm-exporter-sa
value: test-cluster-exporter-sa
- equal:
path: metadata.namespace
value: NAMESPACE
Expand Down Expand Up @@ -114,7 +114,7 @@ tests:
path: subjects
content:
kind: ServiceAccount
name: slurm-exporter-sa
name: test-cluster-exporter-sa
namespace: NAMESPACE

# Test RoleBinding with custom ServiceAccount name
Expand Down Expand Up @@ -162,7 +162,7 @@ tests:
asserts:
- equal:
path: spec.slurmNodes.exporter.serviceAccountName
value: slurm-exporter-sa
value: test-cluster-exporter-sa

# Test SlurmCluster CR uses custom ServiceAccount name
- it: should set custom serviceAccountName in SlurmCluster CR
Expand Down Expand Up @@ -207,4 +207,4 @@ tests:
asserts:
- equal:
path: spec.slurmNodes.exporter.serviceAccountName
value: slurm-exporter-sa
value: test-cluster-exporter-sa
5 changes: 3 additions & 2 deletions helm/soperator-activechecks/scripts/retrigger-checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import datetime

NS = os.environ["NAMESPACE"]
SLURM_CLUSTER_REF_NAME = os.environ["SLURM_CLUSTER_REF_NAME"]

logging.Formatter.converter = time.gmtime
logging.basicConfig(
Expand All @@ -27,8 +28,8 @@ def get_active_checks():
]))
active_checks = []
for it in data.get("items", []):
rac = it.get("spec", {}).get("runAfterCreation")
if rac:
spec = it.get("spec", {})
if spec.get("runAfterCreation") and spec.get("slurmClusterRefName") == SLURM_CLUSTER_REF_NAME:
active_checks.append(it["metadata"]["name"])
return active_checks

Expand Down
16 changes: 16 additions & 0 deletions helm/soperator-activechecks/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,22 @@ Otherwise: true for Kubernetes >= 1.33 (user namespaces stable, explicitly opt i
{{- end -}}
{{- end -}}

{{/*
Name for the run-extensive-check-on-reservations CronJob and its RBAC resources.
Prefixed with slurmClusterRefName to support multiple releases in the same namespace.
*/}}
{{- define "soperator-activechecks.extensiveCheckJobName" -}}
{{- printf "%s-run-extensive-check-on-reservations" .Values.slurmClusterRefName | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Name for the activecheck-waiter ServiceAccount and its RBAC resources.
Prefixed with slurmClusterRefName to support multiple releases in the same namespace.
*/}}
{{- define "soperator-activechecks.waitForChecksName" -}}
{{- printf "%s-activecheck-waiter" .Values.slurmClusterRefName | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Validate that a check does not enable both commentSlurmNode and drainSlurmNode
under `failureReactions` for a single check. Invoke with (dict "name" "<checkKey>" "vals" .Values)
Expand Down
8 changes: 5 additions & 3 deletions helm/soperator-activechecks/templates/active-checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
apiVersion: slurm.nebius.ai/v1alpha1
kind: ActiveCheck
metadata:
name: {{ $name | quote }}
name: {{ printf "%s-%s" $root.Values.slurmClusterRefName $name | quote }}
spec:
checkType: {{ $check.checkType | quote }}
name: {{ $name | quote }}
name: {{ printf "%s-%s" $root.Values.slurmClusterRefName $name | quote }}
{{- with $check.dependsOn }}
dependsOn:
{{ toYaml . | indent 4 }}
{{- range . }}
- {{ printf "%s-%s" $root.Values.slurmClusterRefName . | quote }}
{{- end }}
{{- end }}
slurmClusterRefName: {{ $root.Values.slurmClusterRefName | quote }}
{{- with $check.schedule }}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{{- if (index .Values.checks "extensive-check").enabled -}}
apiVersion: batch/v1
kind: CronJob
metadata:
name: "run-extensive-check-on-reservations"
name: {{ include "soperator-activechecks.extensiveCheckJobName" . | quote }}
labels:
app.kubernetes.io/component: soperatorchecks
app.kubernetes.io/instance: soperator
Expand Down Expand Up @@ -51,7 +52,7 @@ spec:
- name: RESERVATION_PREFIX
value: {{ (index .Values.checks "extensive-check").reservationPrefix | quote }}
- name: TARGET_ACTIVE_CHECK_NAME
value: extensive-check
value: {{ printf "%s-extensive-check" .Values.slurmClusterRefName | quote }}
volumeMounts:
- mountPath: /mnt/slurm-configs
name: slurm-configs
Expand Down Expand Up @@ -101,8 +102,8 @@ spec:
name: munge-socket
dnsPolicy: ClusterFirst
restartPolicy: Never
serviceAccount: run-extensive-check-on-reservations
serviceAccountName: run-extensive-check-on-reservations
serviceAccount: {{ include "soperator-activechecks.extensiveCheckJobName" . }}
serviceAccountName: {{ include "soperator-activechecks.extensiveCheckJobName" . }}
schedulerName: default-scheduler
terminationGracePeriodSeconds: 30
volumes:
Expand All @@ -127,14 +128,14 @@ spec:
apiVersion: v1
kind: ServiceAccount
metadata:
name: run-extensive-check-on-reservations
namespace: soperator
name: {{ include "soperator-activechecks.extensiveCheckJobName" . }}
namespace: {{ .Release.Namespace }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: run-extensive-check-on-reservations
namespace: soperator
name: {{ include "soperator-activechecks.extensiveCheckJobName" . }}
namespace: {{ .Release.Namespace }}
rules:
- apiGroups:
- batch
Expand All @@ -155,13 +156,14 @@ rules:
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: run-extensive-check-on-reservations
namespace: soperator
name: {{ include "soperator-activechecks.extensiveCheckJobName" . }}
namespace: {{ .Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: run-extensive-check-on-reservations
name: {{ include "soperator-activechecks.extensiveCheckJobName" . }}
subjects:
- kind: ServiceAccount
name: run-extensive-check-on-reservations
namespace: soperator
name: {{ include "soperator-activechecks.extensiveCheckJobName" . }}
namespace: {{ .Release.Namespace }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
apiVersion: batch/v1
kind: Job
metadata:
name: wait-for-active-checks
name: {{ printf "%s-wait-for-active-checks" .Values.slurmClusterRefName | trunc 63 | trimSuffix "-" }}
annotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-delete-policy": hook-succeeded,hook-failed,before-hook-creation
spec:
template:
spec:
restartPolicy: Never
serviceAccountName: activecheck-waiter
serviceAccountName: {{ include "soperator-activechecks.waitForChecksName" . }}
containers:
- name: wait-for-active-checks
image: alpine/k8s:1.31.11
Expand All @@ -23,11 +23,12 @@ spec:

NAMESPACE="{{ .Release.Namespace }}"
CRD_KIND="activechecks.slurm.nebius.ai"
SLURM_CLUSTER_REF_NAME="{{ .Values.slurmClusterRefName }}"

echo "Fetching ActiveCheck list..."
# Only wait for checks that run on creation and ignore flappy comment-only checks.
active_check_names=$(kubectl get "$CRD_KIND" -n "$NAMESPACE" -o json \
| jq -r '.items[] | select(.spec.runAfterCreation == true and .spec.failureReactions.commentSlurmNode == null) | .metadata.name')
| jq -r --arg cluster "$SLURM_CLUSTER_REF_NAME" '.items[] | select(.spec.runAfterCreation == true and .spec.failureReactions.commentSlurmNode == null and .spec.slurmClusterRefName == $cluster) | .metadata.name')

if [ -z "$active_check_names" ]; then
echo "No CRs with runAfterCreation=true found. Exiting."
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: activecheck-waiter-role
name: {{ include "soperator-activechecks.waitForChecksName" . }}-role
namespace: {{ .Release.Namespace }}
rules:
- apiGroups: ["slurm.nebius.ai"]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: activecheck-waiter-rolebinding
name: {{ include "soperator-activechecks.waitForChecksName" . }}-rolebinding
namespace: {{ .Release.Namespace }}
subjects:
- kind: ServiceAccount
name: activecheck-waiter
name: {{ include "soperator-activechecks.waitForChecksName" . }}
namespace: {{ .Release.Namespace }}
roleRef:
kind: Role
name: activecheck-waiter-role
name: {{ include "soperator-activechecks.waitForChecksName" . }}-role
apiGroup: rbac.authorization.k8s.io
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: activecheck-waiter
name: {{ include "soperator-activechecks.waitForChecksName" . }}
namespace: {{ .Release.Namespace }}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ tests:
- it: should set hostUsers true on Kubernetes >= 1.33
documentSelector:
path: metadata.name
value: test-hostusers-new-k8s
value: test-cluster-test-hostusers-new-k8s
set:
slurmClusterRefName: test-cluster
images:
Expand Down
Loading
Loading