Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,12 @@ type PlugStackConfig struct {
// +kubebuilder:default={ required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" }
NcclDebug PluginConfigNcclDebug `json:"ncclDebug,omitempty"`

// NcclInspectorPreConf represents the NCCL Inspector Pre-Configuration SPANK plugin configuration.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default={ required: false, pluginPath: "/usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so", enabled: false, profilerPlugin: "/usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so", dumpDir: "/opt/soperator-outputs/nccl_profiles/%j/%s", dumpThreadIntervalMicroseconds: 1000000 }
NcclInspectorPreConf PluginConfigNcclInspectorPreConf `json:"ncclInspectorPreConf,omitempty"`

// PluginConfigCustom represents a configuration of custom SPANK plugins.
//
// +kubebuilder:validation:Optional
Expand Down Expand Up @@ -301,6 +307,53 @@ type PluginConfigNcclDebug struct {
OutputDirectory string `json:"outputDirectory,omitempty"`
}

// PluginConfigNcclInspectorPreConf represents the NCCL Inspector Pre-Configuration SPANK plugin configuration.
//
// The operator renders these options into one line of the generated SPANK plugin configuration:
// the 'required'/'optional' keyword, the plugin SO path, and then the
// enabled, profiler-plugin, dump-dir and dump-thread-interval-microseconds arguments.
//
// When the object did not pass through API-server defaulting, the renderer substitutes the same
// defaults as the kubebuilder markers below for empty strings and for a zero dump interval,
// and a nil Enabled is treated as disabled.
//
// See: https://github.com/nebius/slurm-plugins/tree/main/spank/nccl-inspector-preconf
type PluginConfigNcclInspectorPreConf struct {
// Required defines if NCCL Inspector Pre-Configuration is 'required' for SLURM.
// Otherwise, 'optional'.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=false
Required bool `json:"required,omitempty"`

// PluginPath defines a file path to the NCCL Inspector Pre-Configuration's SO file.
//
// Deprecated: It makes the ldconfig execution being not required.
// This is going to be removed when the plugin installation is established.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="/usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so"
PluginPath string `json:"pluginPath,omitempty"`

// Enabled defines whether to enable the plugin.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=false
Enabled *bool `json:"enabled,omitempty"`

// ProfilerPlugin defines a file path to NCCL Inspector's SO file.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="/usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so"
ProfilerPlugin string `json:"profilerPlugin,omitempty"`

// DumpDir defines a directory path where NCCL profiles will be dumped into.
//
// If the path does not exist, it will be created by the plugin.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="/opt/soperator-outputs/nccl_profiles/%j/%s"
DumpDir string `json:"dumpDir,omitempty"`

// DumpThreadIntervalMicroseconds defines an interval between NCCL Inspector dump thread runs in microseconds.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=1000000
DumpThreadIntervalMicroseconds uint64 `json:"dumpThreadIntervalMicroseconds,omitempty"`
}

// PluginConfigCustom represents a custom SPANK plugin configuration.
type PluginConfigCustom struct {
// Required defines if the plugin is 'required' for SLURM.
Expand Down Expand Up @@ -1313,6 +1366,13 @@ func (p *PluginConfigNcclDebug) SetDefaults() {
}
}

// SetDefaults fills in the fields of PluginConfigNcclInspectorPreConf that were left unset.
//
// Only Enabled is defaulted here: a nil value is replaced with a pointer to false,
// so later code can dereference it without a nil check.
func (p *PluginConfigNcclInspectorPreConf) SetDefaults() {
	if p.Enabled != nil {
		return
	}
	p.Enabled = ptr.To(false)
}

// SetDefaults sets default values for SlurmExporter
func (s *SlurmExporter) SetDefaults() {
if s.Enabled == nil {
Expand Down
21 changes: 21 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 48 additions & 0 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1320,6 +1320,54 @@ spec:
Otherwise, 'optional'.
type: boolean
type: object
ncclInspectorPreConf:
default:
dumpDir: /opt/soperator-outputs/nccl_profiles/%j/%s
dumpThreadIntervalMicroseconds: 1000000
enabled: false
pluginPath: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so
profilerPlugin: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so
required: false
description: NcclInspectorPreConf represents the NCCL Inspector
Pre-Configuration SPANK plugin configuration.
properties:
dumpDir:
default: /opt/soperator-outputs/nccl_profiles/%j/%s
description: |-
DumpDir defines a directory path where NCCL profiles will be dumped into.

If the path does not exist, it will be created by the plugin.
type: string
dumpThreadIntervalMicroseconds:
default: 1000000
description: DumpThreadIntervalMicroseconds defines an interval
between NCCL Inspector dump thread runs in microseconds.
format: int64
type: integer
enabled:
default: false
description: Enabled defines whether to enable the plugin.
type: boolean
pluginPath:
default: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so
description: |-
PluginPath defines a file path to the NCCL Inspector Pre-Configuration's SO file.

Deprecated: It makes the ldconfig execution being not required.
This is going to be removed when the plugin installation is established.
type: string
profilerPlugin:
default: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so
description: ProfilerPlugin defines a file path to NCCL Inspector's
SO file.
type: string
required:
default: false
description: |-
Required defines if NCCL Inspector Pre-Configuration is 'required' for SLURM.
Otherwise, 'optional'.
type: boolean
type: object
pyxis:
default:
containerImageSave: /var/cache/enroot-container-images/
Expand Down
48 changes: 48 additions & 0 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28144,6 +28144,54 @@ spec:
Otherwise, 'optional'.
type: boolean
type: object
ncclInspectorPreConf:
default:
dumpDir: /opt/soperator-outputs/nccl_profiles/%j/%s
dumpThreadIntervalMicroseconds: 1000000
enabled: false
pluginPath: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so
profilerPlugin: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so
required: false
description: NcclInspectorPreConf represents the NCCL Inspector
Pre-Configuration SPANK plugin configuration.
properties:
dumpDir:
default: /opt/soperator-outputs/nccl_profiles/%j/%s
description: |-
DumpDir defines a directory path where NCCL profiles will be dumped into.

If the path does not exist, it will be created by the plugin.
type: string
dumpThreadIntervalMicroseconds:
default: 1000000
description: DumpThreadIntervalMicroseconds defines an interval
between NCCL Inspector dump thread runs in microseconds.
format: int64
type: integer
enabled:
default: false
description: Enabled defines whether to enable the plugin.
type: boolean
pluginPath:
default: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so
description: |-
PluginPath defines a file path to the NCCL Inspector Pre-Configuration's SO file.

Deprecated: It makes the ldconfig execution being not required.
This is going to be removed when the plugin installation is established.
type: string
profilerPlugin:
default: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so
description: ProfilerPlugin defines a file path to NCCL Inspector's
SO file.
type: string
required:
default: false
description: |-
Required defines if NCCL Inspector Pre-Configuration is 'required' for SLURM.
Otherwise, 'optional'.
type: boolean
type: object
pyxis:
default:
containerImageSave: /var/cache/enroot-container-images/
Expand Down
48 changes: 48 additions & 0 deletions helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28144,6 +28144,54 @@ spec:
Otherwise, 'optional'.
type: boolean
type: object
ncclInspectorPreConf:
default:
dumpDir: /opt/soperator-outputs/nccl_profiles/%j/%s
dumpThreadIntervalMicroseconds: 1000000
enabled: false
pluginPath: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so
profilerPlugin: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so
required: false
description: NcclInspectorPreConf represents the NCCL Inspector
Pre-Configuration SPANK plugin configuration.
properties:
dumpDir:
default: /opt/soperator-outputs/nccl_profiles/%j/%s
description: |-
DumpDir defines a directory path where NCCL profiles will be dumped into.

If the path does not exist, it will be created by the plugin.
type: string
dumpThreadIntervalMicroseconds:
default: 1000000
description: DumpThreadIntervalMicroseconds defines an interval
between NCCL Inspector dump thread runs in microseconds.
format: int64
type: integer
enabled:
default: false
description: Enabled defines whether to enable the plugin.
type: boolean
pluginPath:
default: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so
description: |-
PluginPath defines a file path to the NCCL Inspector Pre-Configuration's SO file.

Deprecated: It makes the ldconfig execution being not required.
This is going to be removed when the plugin installation is established.
type: string
profilerPlugin:
default: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so
description: ProfilerPlugin defines a file path to NCCL Inspector's
SO file.
type: string
required:
default: false
description: |-
Required defines if NCCL Inspector Pre-Configuration is 'required' for SLURM.
Otherwise, 'optional'.
type: boolean
type: object
pyxis:
default:
containerImageSave: /var/cache/enroot-container-images/
Expand Down
1 change: 1 addition & 0 deletions internal/controller/clustercontroller/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ func (r *SlurmClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request

slurmCluster.Spec.PlugStackConfig.Pyxis.SetDefaults()
slurmCluster.Spec.PlugStackConfig.NcclDebug.SetDefaults()
slurmCluster.Spec.PlugStackConfig.NcclInspectorPreConf.SetDefaults()
slurmCluster.Spec.SlurmNodes.Exporter.SetDefaults()

// If cluster marked for deletion, we have nothing to do
Expand Down
15 changes: 15 additions & 0 deletions internal/render/common/configmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,21 @@ func generateSpankConfig(cluster *values.SlurmCluster) renderutils.ConfigFile {
))
}

{
	// NCCL Inspector Pre-Configuration plugin entry.
	// Every option falls back to its built-in default when the corresponding field is empty
	// (or zero for the dump interval), so a zero-value configuration still renders a complete line.
	opts := cluster.PlugStackConfig.NcclInspectorPreConf.DeepCopy()
	res.AddLine(strings.Join(
		[]string{
			utils.Ternary(opts.Required, "required", "optional"),
			// The plugin path is already a string; wrapping it in fmt.Sprintf("%s", ...) was a no-op.
			utils.Ternary(opts.PluginPath != "", opts.PluginPath, "/usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so"),
			// A nil Enabled pointer is rendered as disabled.
			fmt.Sprintf("enabled=%d", utils.Ternary(opts.Enabled != nil && *opts.Enabled, 1, 0)),
			fmt.Sprintf("profiler-plugin=%s", utils.Ternary(opts.ProfilerPlugin != "", opts.ProfilerPlugin, "/usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so")),
			// The default directory contains literal "%j" and "%s" sequences; it is passed as an
			// argument (never as the format string), so they are emitted verbatim.
			fmt.Sprintf("dump-dir=%s", utils.Ternary(opts.DumpDir != "", opts.DumpDir, "/opt/soperator-outputs/nccl_profiles/%j/%s")),
			fmt.Sprintf("dump-thread-interval-microseconds=%d", utils.Ternary(opts.DumpThreadIntervalMicroseconds > 0, opts.DumpThreadIntervalMicroseconds, 1000000)),
		},
		" ",
	))
}

for _, plugin := range cluster.PlugStackConfig.CustomPlugins {
conf := []string{
utils.Ternary(plugin.Required, "required", "optional"),
Expand Down
37 changes: 32 additions & 5 deletions internal/render/common/configmap_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ func TestRenderPlugstack(t *testing.T) {
assert.Contains(t, result, "required spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 container_image_save=/tmp/")
})

t.Run("NCCL no options", func(t *testing.T) {
t.Run("NCCL Debug no options", func(t *testing.T) {
result := generateSpankConfig(&values.SlurmCluster{
PlugStackConfig: slurmv1.PlugStackConfig{
NcclDebug: slurmv1.PluginConfigNcclDebug{},
Expand All @@ -378,7 +378,7 @@ func TestRenderPlugstack(t *testing.T) {
assert.Contains(t, result, "optional spanknccldebug.so enabled=0 log-level=INFO out-file=0 out-dir=/opt/soperator-outputs/nccl_logs out-stdout=0")
})

t.Run("NCCL options", func(t *testing.T) {
t.Run("NCCL Debug options", func(t *testing.T) {
result := generateSpankConfig(&values.SlurmCluster{
PlugStackConfig: slurmv1.PlugStackConfig{
NcclDebug: slurmv1.PluginConfigNcclDebug{
Expand All @@ -395,14 +395,41 @@ func TestRenderPlugstack(t *testing.T) {
assert.Contains(t, result, "required spanknccldebug.so enabled=1 log-level=TRACE out-file=1 out-dir=/tmp out-stdout=1")
})

// A zero-value plugin configuration must render a line that carries every built-in default.
t.Run("NCCL Inspector Pre-Configuration no options", func(t *testing.T) {
	cluster := &values.SlurmCluster{
		PlugStackConfig: slurmv1.PlugStackConfig{
			NcclInspectorPreConf: slurmv1.PluginConfigNcclInspectorPreConf{},
		},
	}
	want := "optional /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so enabled=0 profiler-plugin=/usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so dump-dir=/opt/soperator-outputs/nccl_profiles/%j/%s dump-thread-interval-microseconds=1000000"

	rendered := generateSpankConfig(cluster).Render()

	assert.NotEmpty(t, rendered)
	assert.Contains(t, rendered, want)
})

// Explicitly configured options must be rendered as-is, overriding every default.
t.Run("NCCL Inspector Pre-Configuration options", func(t *testing.T) {
	preConf := slurmv1.PluginConfigNcclInspectorPreConf{
		Required:                       true,
		PluginPath:                     "/usr/lib/s.so",
		Enabled:                        ptr.To(true),
		ProfilerPlugin:                 "/usr/lib/a.so",
		DumpDir:                        "/tmp",
		DumpThreadIntervalMicroseconds: 500,
	}
	cluster := &values.SlurmCluster{
		PlugStackConfig: slurmv1.PlugStackConfig{
			NcclInspectorPreConf: preConf,
		},
	}
	want := "required /usr/lib/s.so enabled=1 profiler-plugin=/usr/lib/a.so dump-dir=/tmp dump-thread-interval-microseconds=500"

	rendered := generateSpankConfig(cluster).Render()

	assert.NotEmpty(t, rendered)
	assert.Contains(t, rendered, want)
})

t.Run("Custom not provided", func(t *testing.T) {
result := generateSpankConfig(&values.SlurmCluster{
PlugStackConfig: slurmv1.PlugStackConfig{
CustomPlugins: []slurmv1.PluginConfigCustom{},
},
}).Render()
assert.NotEmpty(t, result)
assert.Equal(t, 3, len(strings.Split(result, "\n")))
assert.Equal(t, 4, len(strings.Split(result, "\n")))
})

t.Run("Custom no options", func(t *testing.T) {
Expand All @@ -414,7 +441,7 @@ func TestRenderPlugstack(t *testing.T) {
},
}).Render()
assert.NotEmpty(t, result)
assert.Equal(t, 4, len(strings.Split(result, "\n")))
assert.Equal(t, 5, len(strings.Split(result, "\n")))
assert.Contains(t, result, "optional /lol/kek.so")
})

Expand All @@ -437,7 +464,7 @@ func TestRenderPlugstack(t *testing.T) {
},
}).Render()
assert.NotEmpty(t, result)
assert.Equal(t, 5, len(strings.Split(result, "\n")))
assert.Equal(t, 6, len(strings.Split(result, "\n")))
assert.Contains(t, result, "required /lol/kek.so lol=kek")
assert.Contains(t, result, "required /kek/lol.so kek=lol")
})
Expand Down
Loading