diff --git a/api/v1/slurmcluster_types.go b/api/v1/slurmcluster_types.go index f92a06c88..2a5a1d1a7 100644 --- a/api/v1/slurmcluster_types.go +++ b/api/v1/slurmcluster_types.go @@ -111,7 +111,7 @@ type SlurmClusterSpec struct { // PlugStackConfig represents the Plugin stack configurations in `plugstack.conf`. // // +kubebuilder:validation:Optional - // +kubebuilder:default={ pyxis: { required: true, containerImageSave: "/var/cache/enroot-container-images/" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" } } + // +kubebuilder:default={ pyxis: { required: true, importerPath: "/opt/slurm_scripts/pyxis_caching_importer.sh" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" } } PlugStackConfig PlugStackConfig `json:"plugStackConfig,omitempty"` // SConfigController defines the desired state of controller that watches after configs @@ -214,7 +214,7 @@ type PlugStackConfig struct { // Pyxis represents the 'Pyxis' SPANK plugin configuration. // // +kubebuilder:validation:Optional - // +kubebuilder:default={ required: true, containerImageSave: "/var/cache/enroot-container-images/" } + // +kubebuilder:default={ required: true, importerPath: "/opt/slurm_scripts/pyxis_caching_importer.sh" } Pyxis PluginConfigPyxis `json:"pyxis,omitempty"` // NcclDebug represents the 'NCCL Debug' SPANK plugin configuration. @@ -239,6 +239,14 @@ type PluginConfigPyxis struct { // +kubebuilder:default=true Required *bool `json:"required,omitempty"` + // Path to the executable for pyxis importer extension. + // File should be available to execute for every user in Slurm. + // More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers. 
+ // + // +kubebuilder:validation:Optional + // +kubebuilder:default="/opt/slurm_scripts/pyxis_caching_importer.sh" + ImporterPath string `json:"importerPath,omitempty"` + // ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored. // If the specified file or directory already exists, it will be reused. // If the path does not exist, it will be created. @@ -248,7 +256,7 @@ type PluginConfigPyxis struct { // If the option argument is empty (""), SquashFS files will not be stored. // // +kubebuilder:validation:Optional - // +kubebuilder:default="/var/cache/enroot-container-images/" + // +kubebuilder:deprecation:warning="The ContainerImageSave field is deprecated and will be removed in a future release" ContainerImageSave string `json:"containerImageSave,omitempty"` } diff --git a/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml b/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml index 287b2411d..3eb1bcb06 100644 --- a/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml +++ b/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml @@ -1231,7 +1231,7 @@ spec: outputToStdOut: false required: false pyxis: - containerImageSave: /var/cache/enroot-container-images/ + importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh required: true description: PlugStackConfig represents the Plugin stack configurations in `plugstack.conf`. @@ -1322,12 +1322,11 @@ spec: type: object pyxis: default: - containerImageSave: /var/cache/enroot-container-images/ + importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh required: true description: Pyxis represents the 'Pyxis' SPANK plugin configuration. properties: containerImageSave: - default: /var/cache/enroot-container-images/ description: |- ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored. If the specified file or directory already exists, it will be reused. 
@@ -1337,6 +1336,13 @@ spec: If the image name contains '/', a nested directory will be created under the specified path (if it is a directory). If the option argument is empty (""), SquashFS files will not be stored. type: string + importerPath: + default: /opt/slurm_scripts/pyxis_caching_importer.sh + description: |- + Path to the executable for pyxis importer extension. + File should be available to execute for every user in Slurm. + More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers. + type: string required: default: true description: |- diff --git a/helm/slurm-cluster/slurm_scripts/pyxis_caching_importer.sh b/helm/slurm-cluster/slurm_scripts/pyxis_caching_importer.sh new file mode 100644 index 000000000..51d9a2c68 --- /dev/null +++ b/helm/slurm-cluster/slurm_scripts/pyxis_caching_importer.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Patched version of https://github.com/NVIDIA/pyxis/blob/v0.23.0/importers/caching_importer.sh ("get URI" prints the cached squashfs path for the image, importing it on a cache miss; "release" removes this step's temporary file). + +set -euo pipefail + +readonly cmd="$1" + +readonly cache_dir="${ENROOT_CONTAINER_IMAGES_CACHE_DIR:-/var/cache/enroot-container-images}" +readonly squashfs_temp_path="${cache_dir}/${SLURM_JOB_ID}.${SLURM_STEP_ID}.sqsh" # set -u aborts here when SLURM_JOB_ID or SLURM_STEP_ID is unset + +# The squashfs file is kept in the cache rather than being ephemeral, so we can use zstd compression. +export ENROOT_SQUASH_OPTIONS="-comp zstd -Xcompression-level 3 -b 1M" + +case "${cmd}" in + get) + if [ $# -ne 2 ]; then + echo "usage: $0 get URI" >&2 + exit 1 + fi + + readonly image_uri="$2" + + mkdir -p -m 700 "${cache_dir}" # NOTE(review): -m 700 only applies when this call creates the directory, and would keep other users out of it despite the chmod 777 below; presumably the directory is pre-provisioned, confirm + + readonly digest=$(enroot digest "${image_uri}") # NOTE(review): readonly hides the exit status of enroot digest (ShellCheck SC2155), so set -e does not stop here; only the empty-output check below catches a failure + if [ -z "${digest}" ]; then + echo "error: could not retrieve digest for image: ${image_uri}" >&2 + exit 1 + fi + readonly squashfs_path="${cache_dir}/${digest}.sqsh" + + if [ ! -e "${squashfs_path}" ]; then + # TODO: use `digest` approach (commented out below) once 406 Not Acceptable is tolerated in enroot; until then the image is imported by the URI as given + # https://github.com/NVIDIA/enroot/pull/263 + # if [[ "${image_uri}" == *"@${digest}" ]]; then + # # URI already has the digest in it. 
+ # enroot import --output "${squashfs_temp_path}" "${image_uri}" >&2 + # else + # # Add the digest to the URI. + # enroot import --output "${squashfs_temp_path}" "${image_uri}@${digest}" >&2 + # fi + enroot import --output "${squashfs_temp_path}" "${image_uri}" >&2 + + # Save the URI as an extended attribute (only when setfattr is available). + if command -v "setfattr" >/dev/null; then + setfattr -n user.image_uri -v "${image_uri}" "${squashfs_temp_path}" + fi + + chmod 777 "${squashfs_temp_path}" # NOTE(review): makes the cached image writable by every user; confirm this is intended + mv -n "${squashfs_temp_path}" "${squashfs_path}" + fi + + # Output the squashfs path on stdout for pyxis to read + echo "${squashfs_path}" + ;; + release) + if [ $# -ne 1 ]; then + echo "usage: $0 release" >&2 + exit 1 + fi + + # Remove temporary file if still present (e.g. "get" was interrupted, or "mv -n" left it in place because the cache entry already existed) + rm -f "${squashfs_temp_path}" + ;; + *) + echo "error: unknown command: ${cmd}" >&2 + exit 1 + ;; +esac diff --git a/helm/slurm-cluster/templates/slurm-cluster-cr.yaml b/helm/slurm-cluster/templates/slurm-cluster-cr.yaml index c65fd423d..27d80c5ec 100644 --- a/helm/slurm-cluster/templates/slurm-cluster-cr.yaml +++ b/helm/slurm-cluster/templates/slurm-cluster-cr.yaml @@ -94,7 +94,7 @@ spec: - name: slurm-scripts configMap: name: slurm-scripts - defaultMode: 500 + defaultMode: 0755 {{- range .Values.volumeSources }} - name: {{ .name | quote }} {{- omit . 
"name" "createPVC" "storageClassName" "size" | toYaml | nindent 6 }} diff --git a/helm/slurm-cluster/templates/slurm-scripts-cm.yaml b/helm/slurm-cluster/templates/slurm-scripts-cm.yaml index 2ed13dcb6..bfa21a123 100644 --- a/helm/slurm-cluster/templates/slurm-scripts-cm.yaml +++ b/helm/slurm-cluster/templates/slurm-scripts-cm.yaml @@ -8,7 +8,7 @@ metadata: release: {{ .Release.Name }} data: # base scripts -{{- range $name := list "check_runner.py" "prolog.sh" "epilog.sh" "hc_program.sh" }} +{{- range $name := list "check_runner.py" "prolog.sh" "epilog.sh" "hc_program.sh" "pyxis_caching_importer.sh" }} {{- $content := index $.Values.slurmScripts $name }} {{ $name }}: |- {{- if $content }} diff --git a/helm/slurm-cluster/tests/README.md b/helm/slurm-cluster/tests/README.md index 432efc334..93bf819f3 100644 --- a/helm/slurm-cluster/tests/README.md +++ b/helm/slurm-cluster/tests/README.md @@ -52,7 +52,7 @@ These tests verify the following kubebuilder default values: ### PlugStackConfig - `pyxis.required: true` -- `pyxis.containerImageSave: "/var/cache/enroot-container-images/"` +- `pyxis.importerPath: "/opt/slurm_scripts/pyxis_caching_importer.sh"` - `ncclDebug.required: false` - `ncclDebug.enabled: false` - `ncclDebug.logLevel: "INFO"` diff --git a/helm/slurm-cluster/tests/default-values_test.yaml b/helm/slurm-cluster/tests/default-values_test.yaml index 8b0ed9cc4..a2ba1a883 100644 --- a/helm/slurm-cluster/tests/default-values_test.yaml +++ b/helm/slurm-cluster/tests/default-values_test.yaml @@ -79,8 +79,8 @@ tests: path: spec.plugStackConfig.pyxis.required value: true - equal: - path: spec.plugStackConfig.pyxis.containerImageSave - value: "/var/cache/enroot-container-images/" + path: spec.plugStackConfig.pyxis.importerPath + value: "/opt/slurm_scripts/pyxis_caching_importer.sh" - equal: path: spec.plugStackConfig.ncclDebug.required value: false diff --git a/helm/slurm-cluster/values.yaml b/helm/slurm-cluster/values.yaml index 0a5d262a7..df3781ef2 100644 --- 
a/helm/slurm-cluster/values.yaml +++ b/helm/slurm-cluster/values.yaml @@ -158,7 +158,7 @@ mpiConfig: plugStackConfig: pyxis: required: true - containerImageSave: "/var/cache/enroot-container-images/" + importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh ncclDebug: required: false enabled: true @@ -520,6 +520,7 @@ slurmScripts: prolog.sh: null epilog.sh: null hc_program.sh: null + pyxis_caching_importer.sh: null # Built-in scripts can be overridden or disabled here # Keys must match file names in slurm_scripts/ builtIn: diff --git a/helm/soperator-crds/templates/slurmcluster-crd.yaml b/helm/soperator-crds/templates/slurmcluster-crd.yaml index 04bb1e066..0bdccd0c3 100644 --- a/helm/soperator-crds/templates/slurmcluster-crd.yaml +++ b/helm/soperator-crds/templates/slurmcluster-crd.yaml @@ -27447,7 +27447,7 @@ spec: outputToStdOut: false required: false pyxis: - containerImageSave: /var/cache/enroot-container-images/ + importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh required: true description: PlugStackConfig represents the Plugin stack configurations in `plugstack.conf`. @@ -27538,12 +27538,11 @@ spec: type: object pyxis: default: - containerImageSave: /var/cache/enroot-container-images/ + importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh required: true description: Pyxis represents the 'Pyxis' SPANK plugin configuration. properties: containerImageSave: - default: /var/cache/enroot-container-images/ description: |- ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored. If the specified file or directory already exists, it will be reused. @@ -27553,6 +27552,13 @@ spec: If the image name contains '/', a nested directory will be created under the specified path (if it is a directory). If the option argument is empty (""), SquashFS files will not be stored. 
type: string + importerPath: + default: /opt/slurm_scripts/pyxis_caching_importer.sh + description: |- + Path to the executable for pyxis importer extension. + File should be available to execute for every user in Slurm. + More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers. + type: string required: default: true description: |- diff --git a/helm/soperator/crds/slurmcluster-crd.yaml b/helm/soperator/crds/slurmcluster-crd.yaml index 04bb1e066..0bdccd0c3 100644 --- a/helm/soperator/crds/slurmcluster-crd.yaml +++ b/helm/soperator/crds/slurmcluster-crd.yaml @@ -27447,7 +27447,7 @@ spec: outputToStdOut: false required: false pyxis: - containerImageSave: /var/cache/enroot-container-images/ + importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh required: true description: PlugStackConfig represents the Plugin stack configurations in `plugstack.conf`. @@ -27538,12 +27538,11 @@ spec: type: object pyxis: default: - containerImageSave: /var/cache/enroot-container-images/ + importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh required: true description: Pyxis represents the 'Pyxis' SPANK plugin configuration. properties: containerImageSave: - default: /var/cache/enroot-container-images/ description: |- ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored. If the specified file or directory already exists, it will be reused. @@ -27553,6 +27552,13 @@ spec: If the image name contains '/', a nested directory will be created under the specified path (if it is a directory). If the option argument is empty (""), SquashFS files will not be stored. type: string + importerPath: + default: /opt/slurm_scripts/pyxis_caching_importer.sh + description: |- + Path to the executable for pyxis importer extension. + File should be available to execute for every user in Slurm. + More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers. 
+ type: string required: default: true description: |- diff --git a/images/login/sshd.dockerfile b/images/login/sshd.dockerfile index 9c2aaaa91..7b4c271f9 100644 --- a/images/login/sshd.dockerfile +++ b/images/login/sshd.dockerfile @@ -53,7 +53,7 @@ RUN chown 0:0 /etc/enroot/enroot.conf && \ chmod 644 /etc/enroot/enroot.conf.d/custom-dirs.conf ARG SLURM_VERSION -ARG PYXIS_VERSION=0.21.0 +ARG PYXIS_VERSION=0.23.0 # Install slurm pyxis plugin RUN apt-get update && \ apt -y install nvslurm-plugin-pyxis=${SLURM_VERSION}-${PYXIS_VERSION}-1 && \ diff --git a/images/slurm_check_job/slurm_check_job.dockerfile b/images/slurm_check_job/slurm_check_job.dockerfile index 9f033f4c1..9b61e2285 100644 --- a/images/slurm_check_job/slurm_check_job.dockerfile +++ b/images/slurm_check_job/slurm_check_job.dockerfile @@ -27,7 +27,7 @@ RUN chown 0:0 /etc/enroot/enroot.conf && \ chmod 644 /etc/enroot/enroot.conf.d/custom-dirs.conf ARG SLURM_VERSION -ARG PYXIS_VERSION=0.21.0 +ARG PYXIS_VERSION=0.23.0 # Install slurm pyxis plugin \ RUN apt-get update && \ apt -y install nvslurm-plugin-pyxis=${SLURM_VERSION}-${PYXIS_VERSION}-1 && \ diff --git a/images/worker/slurmd.dockerfile b/images/worker/slurmd.dockerfile index 1eedfa4d2..f8b39d925 100644 --- a/images/worker/slurmd.dockerfile +++ b/images/worker/slurmd.dockerfile @@ -62,7 +62,7 @@ RUN chown 0:0 /etc/enroot/enroot.conf && \ # Install slurm pyxis plugin ARG SLURM_VERSION -ARG PYXIS_VERSION=0.21.0 +ARG PYXIS_VERSION=0.23.0 RUN apt-get update && \ apt -y install nvslurm-plugin-pyxis=${SLURM_VERSION}-${PYXIS_VERSION}-1 && \ apt-get clean && \ diff --git a/internal/render/common/configmap.go b/internal/render/common/configmap.go index 9092b7433..350478fcb 100644 --- a/internal/render/common/configmap.go +++ b/internal/render/common/configmap.go @@ -570,7 +570,6 @@ func generateSpankConfig(cluster *values.SlurmCluster) renderutils.ConfigFile { res.AddLine(fmt.Sprintf("required chroot.so %s", consts.VolumeMountPathJail)) - // TODO(@itechdima): 
make `expose_enroot_logs` configurable and enable it once #413 is resolved. res.AddLine(strings.Join( []string{ utils.Ternary(cluster.PlugStackConfig.Pyxis.Required != nil && *cluster.PlugStackConfig.Pyxis.Required, "required", "optional"), @@ -579,7 +578,7 @@ func generateSpankConfig(cluster *values.SlurmCluster) renderutils.ConfigFile { "execute_entrypoint=0", "container_scope=global", "sbatch_support=1", - fmt.Sprintf("container_image_save=%s", cluster.PlugStackConfig.Pyxis.ContainerImageSave), + fmt.Sprintf("importer=%s", cluster.PlugStackConfig.Pyxis.ImporterPath), }, " ", )) diff --git a/internal/render/common/configmap_test.go b/internal/render/common/configmap_test.go index 62dbc3d7e..27cfccee5 100644 --- a/internal/render/common/configmap_test.go +++ b/internal/render/common/configmap_test.go @@ -352,20 +352,20 @@ func TestRenderPlugstack(t *testing.T) { }, }).Render() assert.NotEmpty(t, result) - assert.Contains(t, result, "optional spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 container_image_save=") + assert.Contains(t, result, "optional spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 importer=") }) t.Run("Pyxis options", func(t *testing.T) { result := generateSpankConfig(&values.SlurmCluster{ PlugStackConfig: slurmv1.PlugStackConfig{ Pyxis: slurmv1.PluginConfigPyxis{ - Required: ptr.To(true), - ContainerImageSave: "/tmp/", + Required: ptr.To(true), + ImporterPath: "/opt/importer.sh", }, }, }).Render() assert.NotEmpty(t, result) - assert.Contains(t, result, "required spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 container_image_save=/tmp/") + assert.Contains(t, result, "required spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 importer=/opt/importer.sh") }) t.Run("NCCL no options", func(t *testing.T) {