Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ type SlurmClusterSpec struct {
// PlugStackConfig represents the Plugin stack configurations in `plugstack.conf`.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default={ pyxis: { required: true, containerImageSave: "/var/cache/enroot-container-images/" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" } }
// +kubebuilder:default={ pyxis: { required: true, importerPath: "/opt/slurm_scripts/pyxis_caching_importer.sh" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" } }
PlugStackConfig PlugStackConfig `json:"plugStackConfig,omitempty"`

// SConfigController defines the desired state of controller that watches after configs
Expand Down Expand Up @@ -214,7 +214,7 @@ type PlugStackConfig struct {
// Pyxis represents the 'Pyxis' SPANK plugin configuration.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default={ required: true, containerImageSave: "/var/cache/enroot-container-images/" }
// +kubebuilder:default={ required: true, importerPath: "/opt/slurm_scripts/pyxis_caching_importer.sh" }
Pyxis PluginConfigPyxis `json:"pyxis,omitempty"`

// NcclDebug represents the 'NCCL Debug' SPANK plugin configuration.
Expand All @@ -239,6 +239,14 @@ type PluginConfigPyxis struct {
// +kubebuilder:default=true
Required *bool `json:"required,omitempty"`

// Path to the executable for pyxis importer extension.
// File should be available to execute for every user in Slurm.
// More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="/opt/slurm_scripts/pyxis_caching_importer.sh"
ImporterPath string `json:"importerPath,omitempty"`

// ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored.
// If the specified file or directory already exists, it will be reused.
// If the path does not exist, it will be created.
Expand All @@ -248,7 +256,7 @@ type PluginConfigPyxis struct {
// If the option argument is empty (""), SquashFS files will not be stored.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="/var/cache/enroot-container-images/"
// +kubebuilder:deprecation:warning="The ContainerImageSave field is deprecated and will be removed in a future release"
ContainerImageSave string `json:"containerImageSave,omitempty"`
}

Expand Down
12 changes: 9 additions & 3 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1231,7 +1231,7 @@ spec:
outputToStdOut: false
required: false
pyxis:
containerImageSave: /var/cache/enroot-container-images/
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
required: true
description: PlugStackConfig represents the Plugin stack configurations
in `plugstack.conf`.
Expand Down Expand Up @@ -1322,12 +1322,11 @@ spec:
type: object
pyxis:
default:
containerImageSave: /var/cache/enroot-container-images/
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
required: true
description: Pyxis represents the 'Pyxis' SPANK plugin configuration.
properties:
containerImageSave:
default: /var/cache/enroot-container-images/
description: |-
ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored.
If the specified file or directory already exists, it will be reused.
Expand All @@ -1337,6 +1336,13 @@ spec:
If the image name contains '/', a nested directory will be created under the specified path (if it is a directory).
If the option argument is empty (""), SquashFS files will not be stored.
type: string
importerPath:
default: /opt/slurm_scripts/pyxis_caching_importer.sh
description: |-
Path to the executable for pyxis importer extension.
File should be available to execute for every user in Slurm.
More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers.
type: string
required:
default: true
description: |-
Expand Down
69 changes: 69 additions & 0 deletions helm/slurm-cluster/slurm_scripts/pyxis_caching_importer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
# Patched version of https://github.com/NVIDIA/pyxis/blob/v0.23.0/importers/caching_importer.sh
#
# Pyxis importer that stores imported container images as SquashFS files in a
# cache directory, one file per image digest.
#
# Usage:
#   pyxis_caching_importer.sh get URI   Import URI unless a file for its digest is
#                                       already cached, then print the path of the
#                                       cached SquashFS file on stdout.
#   pyxis_caching_importer.sh release   Remove this job step's temporary file, if any.
#
# Environment:
#   ENROOT_CONTAINER_IMAGES_CACHE_DIR   Cache directory
#                                       (default: /var/cache/enroot-container-images).
#   SLURM_JOB_ID, SLURM_STEP_ID         Must be set; they name the temporary file.
#
# All diagnostics go to stderr; stdout carries only the SquashFS path.

set -euo pipefail

# Default to an empty command so that a missing argument is reported by the
# "unknown command" branch below instead of aborting on an unbound "$1".
readonly cmd="${1:-}"

readonly cache_dir="${ENROOT_CONTAINER_IMAGES_CACHE_DIR:-/var/cache/enroot-container-images}"
readonly squashfs_temp_path="${cache_dir}/${SLURM_JOB_ID}.${SLURM_STEP_ID}.sqsh"

# Since it's not an ephemeral squashfs file, we can use compression.
export ENROOT_SQUASH_OPTIONS="-comp zstd -Xcompression-level 3 -b 1M"

case "${cmd}" in
    get)
        if [ $# -ne 2 ]; then
            echo "usage: $0 get URI" >&2
            exit 1
        fi

        readonly image_uri="$2"

        # NOTE(review): -m 700 only applies when this call creates the directory;
        # an already existing cache directory keeps its current permissions.
        mkdir -p -m 700 "${cache_dir}"

        # Assign separately from `readonly`: `readonly var=$(cmd)` yields the exit
        # status of `readonly` and would hide a failure of `enroot digest`.
        if ! digest=$(enroot digest "${image_uri}") || [ -z "${digest}" ]; then
            echo "error: could not retrieve digest for image: ${image_uri}" >&2
            exit 1
        fi
        readonly digest
        readonly squashfs_path="${cache_dir}/${digest}.sqsh"

        if [ ! -e "${squashfs_path}" ]; then
            # TODO: use `digest` approach once 406 Not Acceptable is tolerated in enroot
            # https://github.com/NVIDIA/enroot/pull/263
            # if [[ "${image_uri}" == *"@${digest}" ]]; then
            #     # URI already has the digest in it.
            #     enroot import --output "${squashfs_temp_path}" "${image_uri}" >&2
            # else
            #     # Add the digest to the URI.
            #     enroot import --output "${squashfs_temp_path}" "${image_uri}@${digest}" >&2
            # fi
            enroot import --output "${squashfs_temp_path}" "${image_uri}" >&2

            # Save the URI as an extended attribute.
            if command -v "setfattr" >/dev/null; then
                setfattr -n user.image_uri -v "${image_uri}" "${squashfs_temp_path}"
            fi

            # NOTE(review): 777 makes the cached image writable by every user;
            # confirm that read-only access for other users would not be enough.
            chmod 777 "${squashfs_temp_path}"
            # -n: never overwrite a file that appeared at the destination meanwhile;
            # in that case the temporary file stays until "release" removes it.
            mv -n "${squashfs_temp_path}" "${squashfs_path}"
        fi

        # Output the squashfs path on stdout for pyxis to read
        echo "${squashfs_path}"
        ;;
    release)
        if [ $# -ne 1 ]; then
            echo "usage: $0 release" >&2
            exit 1
        fi

        # Remove temporary file if still present (e.g. "get" was interrupted)
        rm -f "${squashfs_temp_path}"
        ;;
    *)
        echo "error: unknown command: ${cmd}" >&2
        echo "usage: $0 {get URI|release}" >&2
        exit 1
        ;;
esac
2 changes: 1 addition & 1 deletion helm/slurm-cluster/templates/slurm-cluster-cr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ spec:
- name: slurm-scripts
configMap:
name: slurm-scripts
defaultMode: 500
defaultMode: 0755
{{- range .Values.volumeSources }}
- name: {{ .name | quote }}
{{- omit . "name" "createPVC" "storageClassName" "size" | toYaml | nindent 6 }}
Expand Down
2 changes: 1 addition & 1 deletion helm/slurm-cluster/templates/slurm-scripts-cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ metadata:
release: {{ .Release.Name }}
data:
# base scripts
{{- range $name := list "check_runner.py" "prolog.sh" "epilog.sh" "hc_program.sh" }}
{{- range $name := list "check_runner.py" "prolog.sh" "epilog.sh" "hc_program.sh" "pyxis_caching_importer.sh" }}
{{- $content := index $.Values.slurmScripts $name }}
{{ $name }}: |-
{{- if $content }}
Expand Down
2 changes: 1 addition & 1 deletion helm/slurm-cluster/tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ These tests verify the following kubebuilder default values:

### PlugStackConfig
- `pyxis.required: true`
- `pyxis.containerImageSave: "/var/cache/enroot-container-images/"`
- `pyxis.importerPath: "/opt/slurm_scripts/pyxis_caching_importer.sh"`
- `ncclDebug.required: false`
- `ncclDebug.enabled: false`
- `ncclDebug.logLevel: "INFO"`
Expand Down
4 changes: 2 additions & 2 deletions helm/slurm-cluster/tests/default-values_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ tests:
path: spec.plugStackConfig.pyxis.required
value: true
- equal:
path: spec.plugStackConfig.pyxis.containerImageSave
value: "/var/cache/enroot-container-images/"
path: spec.plugStackConfig.pyxis.importerPath
value: "/opt/slurm_scripts/pyxis_caching_importer.sh"
- equal:
path: spec.plugStackConfig.ncclDebug.required
value: false
Expand Down
3 changes: 2 additions & 1 deletion helm/slurm-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ mpiConfig:
plugStackConfig:
pyxis:
required: true
containerImageSave: "/var/cache/enroot-container-images/"
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
ncclDebug:
required: false
enabled: true
Expand Down Expand Up @@ -520,6 +520,7 @@ slurmScripts:
prolog.sh: null
epilog.sh: null
hc_program.sh: null
pyxis_caching_importer.sh: null
# Built-in scripts can be overridden or disabled here
# Keys must match file names in slurm_scripts/
builtIn:
Expand Down
12 changes: 9 additions & 3 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27447,7 +27447,7 @@ spec:
outputToStdOut: false
required: false
pyxis:
containerImageSave: /var/cache/enroot-container-images/
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
required: true
description: PlugStackConfig represents the Plugin stack configurations
in `plugstack.conf`.
Expand Down Expand Up @@ -27538,12 +27538,11 @@ spec:
type: object
pyxis:
default:
containerImageSave: /var/cache/enroot-container-images/
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
required: true
description: Pyxis represents the 'Pyxis' SPANK plugin configuration.
properties:
containerImageSave:
default: /var/cache/enroot-container-images/
description: |-
ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored.
If the specified file or directory already exists, it will be reused.
Expand All @@ -27553,6 +27552,13 @@ spec:
If the image name contains '/', a nested directory will be created under the specified path (if it is a directory).
If the option argument is empty (""), SquashFS files will not be stored.
type: string
importerPath:
default: /opt/slurm_scripts/pyxis_caching_importer.sh
description: |-
Path to the executable for pyxis importer extension.
File should be available to execute for every user in Slurm.
More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers.
type: string
required:
default: true
description: |-
Expand Down
12 changes: 9 additions & 3 deletions helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27447,7 +27447,7 @@ spec:
outputToStdOut: false
required: false
pyxis:
containerImageSave: /var/cache/enroot-container-images/
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
required: true
description: PlugStackConfig represents the Plugin stack configurations
in `plugstack.conf`.
Expand Down Expand Up @@ -27538,12 +27538,11 @@ spec:
type: object
pyxis:
default:
containerImageSave: /var/cache/enroot-container-images/
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
required: true
description: Pyxis represents the 'Pyxis' SPANK plugin configuration.
properties:
containerImageSave:
default: /var/cache/enroot-container-images/
description: |-
ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored.
If the specified file or directory already exists, it will be reused.
Expand All @@ -27553,6 +27552,13 @@ spec:
If the image name contains '/', a nested directory will be created under the specified path (if it is a directory).
If the option argument is empty (""), SquashFS files will not be stored.
type: string
importerPath:
default: /opt/slurm_scripts/pyxis_caching_importer.sh
description: |-
Path to the executable for pyxis importer extension.
File should be available to execute for every user in Slurm.
More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers.
type: string
required:
default: true
description: |-
Expand Down
2 changes: 1 addition & 1 deletion images/login/sshd.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ RUN chown 0:0 /etc/enroot/enroot.conf && \
chmod 644 /etc/enroot/enroot.conf.d/custom-dirs.conf

ARG SLURM_VERSION
ARG PYXIS_VERSION=0.21.0
ARG PYXIS_VERSION=0.23.0
# Install slurm pyxis plugin
RUN apt-get update && \
apt -y install nvslurm-plugin-pyxis=${SLURM_VERSION}-${PYXIS_VERSION}-1 && \
Expand Down
2 changes: 1 addition & 1 deletion images/slurm_check_job/slurm_check_job.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN chown 0:0 /etc/enroot/enroot.conf && \
chmod 644 /etc/enroot/enroot.conf.d/custom-dirs.conf

ARG SLURM_VERSION
ARG PYXIS_VERSION=0.21.0
ARG PYXIS_VERSION=0.23.0
# Install slurm pyxis plugin
RUN apt-get update && \
apt -y install nvslurm-plugin-pyxis=${SLURM_VERSION}-${PYXIS_VERSION}-1 && \
Expand Down
2 changes: 1 addition & 1 deletion images/worker/slurmd.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ RUN chown 0:0 /etc/enroot/enroot.conf && \

# Install slurm pyxis plugin
ARG SLURM_VERSION
ARG PYXIS_VERSION=0.21.0
ARG PYXIS_VERSION=0.23.0
RUN apt-get update && \
apt -y install nvslurm-plugin-pyxis=${SLURM_VERSION}-${PYXIS_VERSION}-1 && \
apt-get clean && \
Expand Down
3 changes: 1 addition & 2 deletions internal/render/common/configmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,6 @@ func generateSpankConfig(cluster *values.SlurmCluster) renderutils.ConfigFile {

res.AddLine(fmt.Sprintf("required chroot.so %s", consts.VolumeMountPathJail))

// TODO(@itechdima): make `expose_enroot_logs` configurable and enable it once #413 is resolved.
res.AddLine(strings.Join(
[]string{
utils.Ternary(cluster.PlugStackConfig.Pyxis.Required != nil && *cluster.PlugStackConfig.Pyxis.Required, "required", "optional"),
Expand All @@ -579,7 +578,7 @@ func generateSpankConfig(cluster *values.SlurmCluster) renderutils.ConfigFile {
"execute_entrypoint=0",
"container_scope=global",
"sbatch_support=1",
fmt.Sprintf("container_image_save=%s", cluster.PlugStackConfig.Pyxis.ContainerImageSave),
fmt.Sprintf("importer=%s", cluster.PlugStackConfig.Pyxis.ImporterPath),
},
" ",
))
Expand Down
8 changes: 4 additions & 4 deletions internal/render/common/configmap_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,20 +352,20 @@ func TestRenderPlugstack(t *testing.T) {
},
}).Render()
assert.NotEmpty(t, result)
assert.Contains(t, result, "optional spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 container_image_save=")
assert.Contains(t, result, "optional spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 importer=")
})

t.Run("Pyxis options", func(t *testing.T) {
result := generateSpankConfig(&values.SlurmCluster{
PlugStackConfig: slurmv1.PlugStackConfig{
Pyxis: slurmv1.PluginConfigPyxis{
Required: ptr.To(true),
ContainerImageSave: "/tmp/",
Required: ptr.To(true),
ImporterPath: "/opt/importer.sh",
},
},
}).Render()
assert.NotEmpty(t, result)
assert.Contains(t, result, "required spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 container_image_save=/tmp/")
assert.Contains(t, result, "required spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 importer=/opt/importer.sh")
})

t.Run("NCCL no options", func(t *testing.T) {
Expand Down
Loading