From 9c41971aacaff85cadace3bd87f973bba02621aa Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Mon, 20 Apr 2026 13:26:17 +0200 Subject: [PATCH 1/2] [ADD] API for NCCL Inspector Pre-Configuration SPANK plugin --- api/v1/slurmcluster_types.go | 60 +++++++++++++++++++ api/v1/zz_generated.deepcopy.go | 21 +++++++ .../bases/slurm.nebius.ai_slurmclusters.yaml | 48 +++++++++++++++ .../templates/slurmcluster-crd.yaml | 48 +++++++++++++++ helm/soperator/crds/slurmcluster-crd.yaml | 48 +++++++++++++++ .../controller/clustercontroller/reconcile.go | 1 + 6 files changed, 226 insertions(+) diff --git a/api/v1/slurmcluster_types.go b/api/v1/slurmcluster_types.go index f43efe462..e71d196a3 100644 --- a/api/v1/slurmcluster_types.go +++ b/api/v1/slurmcluster_types.go @@ -223,6 +223,12 @@ type PlugStackConfig struct { // +kubebuilder:default={ required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" } NcclDebug PluginConfigNcclDebug `json:"ncclDebug,omitempty"` + // NcclInspectorPreConf represents the NCCL Inspector Pre-Configuration SPANK plugin configuration. + // + // +kubebuilder:validation:Optional + // +kubebuilder:default={ required: false, pluginPath: "/usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so", enabled: false, profilerPlugin: "/usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so", dumpDir: "/opt/soperator-outputs/nccl_profiles/%j/%s", dumpThreadIntervalMicroseconds: 1000000 } + NcclInspectorPreConf PluginConfigNcclInspectorPreConf `json:"ncclInspectorPreConf,omitempty"` + // PluginConfigCustom represents a configuration of custom SPANK plugins. // // +kubebuilder:validation:Optional @@ -301,6 +307,53 @@ type PluginConfigNcclDebug struct { OutputDirectory string `json:"outputDirectory,omitempty"` } +// PluginConfigNcclInspectorPreConf represents the NCCL Inspector Pre-Configuration SPANK plugin configuration. 
+// +// See: https://github.com/nebius/slurm-plugins/tree/main/spank/nccl-inspector-preconf +type PluginConfigNcclInspectorPreConf struct { + // Required defines if NCCL Inspector Pre-Configuration is 'required' for SLURM. + // Otherwise, 'optional'. + // + // +kubebuilder:validation:Optional + // +kubebuilder:default=false + Required bool `json:"required,omitempty"` + + // PluginPath defines a file path to the NCCL Inspector Pre-Configuration's SO file. + // + // Deprecated: It makes the ldconfig execution being not required. + // This is going to be removed when the plugin installation is established. + // + // +kubebuilder:validation:Optional + // +kubebuilder:default="/usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so" + PluginPath string `json:"pluginPath,omitempty"` + + // Enabled defines whether to enable the plugin. + // + // +kubebuilder:validation:Optional + // +kubebuilder:default=false + Enabled *bool `json:"enabled,omitempty"` + + // ProfilerPlugin defines a file path to NCCL Inspector's SO file. + // + // +kubebuilder:validation:Optional + // +kubebuilder:default="/usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so" + ProfilerPlugin string `json:"profilerPlugin,omitempty"` + + // DumpDir defines a directory path where NCCL profiles will be dumped into. + // + // If the path does not exist, it will be created by the plugin. + // + // +kubebuilder:validation:Optional + // +kubebuilder:default="/opt/soperator-outputs/nccl_profiles/%j/%s" + DumpDir string `json:"dumpDir,omitempty"` + + // DumpThreadIntervalMicroseconds defines an interval between NCCL Inspector dump thread runs in microseconds. + // + // +kubebuilder:validation:Optional + // +kubebuilder:default=1000000 + DumpThreadIntervalMicroseconds int64 `json:"dumpThreadIntervalMicroseconds,omitempty"` +} + // PluginConfigCustom represents a custom SPANK plugin configuration. type PluginConfigCustom struct { // Required defines if the plugin is 'required' for SLURM. 
@@ -1313,6 +1366,13 @@ func (p *PluginConfigNcclDebug) SetDefaults() { } } +// SetDefaults sets default values for PluginConfigNcclInspectorPreConf +func (p *PluginConfigNcclInspectorPreConf) SetDefaults() { + if p.Enabled == nil { + p.Enabled = ptr.To(false) + } +} + // SetDefaults sets default values for SlurmExporter func (s *SlurmExporter) SetDefaults() { if s.Enabled == nil { diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 802390130..24f12844a 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -446,6 +446,7 @@ func (in *PlugStackConfig) DeepCopyInto(out *PlugStackConfig) { *out = *in in.Pyxis.DeepCopyInto(&out.Pyxis) in.NcclDebug.DeepCopyInto(&out.NcclDebug) + in.NcclInspectorPreConf.DeepCopyInto(&out.NcclInspectorPreConf) if in.CustomPlugins != nil { in, out := &in.CustomPlugins, &out.CustomPlugins *out = make([]PluginConfigCustom, len(*in)) @@ -507,6 +508,26 @@ func (in *PluginConfigNcclDebug) DeepCopy() *PluginConfigNcclDebug { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PluginConfigNcclInspectorPreConf) DeepCopyInto(out *PluginConfigNcclInspectorPreConf) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PluginConfigNcclInspectorPreConf. +func (in *PluginConfigNcclInspectorPreConf) DeepCopy() *PluginConfigNcclInspectorPreConf { + if in == nil { + return nil + } + out := new(PluginConfigNcclInspectorPreConf) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *PluginConfigPyxis) DeepCopyInto(out *PluginConfigPyxis) { *out = *in diff --git a/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml b/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml index e4abd32f2..bc6334887 100644 --- a/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml +++ b/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml @@ -1320,6 +1320,54 @@ spec: Otherwise, 'optional'. type: boolean type: object + ncclInspectorPreConf: + default: + dumpDir: /opt/soperator-outputs/nccl_profiles/%j/%s + dumpThreadIntervalMicroseconds: 1000000 + enabled: false + pluginPath: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so + profilerPlugin: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so + required: false + description: NcclInspectorPreConf represents the NCCL Inspector + Pre-Configuration SPANK plugin configuration. + properties: + dumpDir: + default: /opt/soperator-outputs/nccl_profiles/%j/%s + description: |- + DumpDir defines a directory path where NCCL profiles will be dumped into. + + If the path does not exist, it will be created by the plugin. + type: string + dumpThreadIntervalMicroseconds: + default: 1000000 + description: DumpThreadIntervalMicroseconds defines an interval + between NCCL Inspector dump thread runs in microseconds. + format: int64 + type: integer + enabled: + default: false + description: Enabled defines whether to enable the plugin. + type: boolean + pluginPath: + default: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so + description: |- + PluginPath defines a file path to the NCCL Inspector Pre-Configuration's SO file. + + Deprecated: It makes the ldconfig execution being not required. + This is going to be removed when the plugin installation is established. + type: string + profilerPlugin: + default: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so + description: ProfilerPlugin defines a file path to NCCL Inspector's + SO file. 
+ type: string + required: + default: false + description: |- + Required defines if NCCL Inspector Pre-Configuration is 'required' for SLURM. + Otherwise, 'optional'. + type: boolean + type: object pyxis: default: containerImageSave: /var/cache/enroot-container-images/ diff --git a/helm/soperator-crds/templates/slurmcluster-crd.yaml b/helm/soperator-crds/templates/slurmcluster-crd.yaml index 32097a0b6..00a06ca18 100644 --- a/helm/soperator-crds/templates/slurmcluster-crd.yaml +++ b/helm/soperator-crds/templates/slurmcluster-crd.yaml @@ -28144,6 +28144,54 @@ spec: Otherwise, 'optional'. type: boolean type: object + ncclInspectorPreConf: + default: + dumpDir: /opt/soperator-outputs/nccl_profiles/%j/%s + dumpThreadIntervalMicroseconds: 1000000 + enabled: false + pluginPath: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so + profilerPlugin: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so + required: false + description: NcclInspectorPreConf represents the NCCL Inspector + Pre-Configuration SPANK plugin configuration. + properties: + dumpDir: + default: /opt/soperator-outputs/nccl_profiles/%j/%s + description: |- + DumpDir defines a directory path where NCCL profiles will be dumped into. + + If the path does not exist, it will be created by the plugin. + type: string + dumpThreadIntervalMicroseconds: + default: 1000000 + description: DumpThreadIntervalMicroseconds defines an interval + between NCCL Inspector dump thread runs in microseconds. + format: int64 + type: integer + enabled: + default: false + description: Enabled defines whether to enable the plugin. + type: boolean + pluginPath: + default: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so + description: |- + PluginPath defines a file path to the NCCL Inspector Pre-Configuration's SO file. + + Deprecated: It makes the ldconfig execution being not required. + This is going to be removed when the plugin installation is established. 
+ type: string + profilerPlugin: + default: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so + description: ProfilerPlugin defines a file path to NCCL Inspector's + SO file. + type: string + required: + default: false + description: |- + Required defines if NCCL Inspector Pre-Configuration is 'required' for SLURM. + Otherwise, 'optional'. + type: boolean + type: object pyxis: default: containerImageSave: /var/cache/enroot-container-images/ diff --git a/helm/soperator/crds/slurmcluster-crd.yaml b/helm/soperator/crds/slurmcluster-crd.yaml index 32097a0b6..00a06ca18 100644 --- a/helm/soperator/crds/slurmcluster-crd.yaml +++ b/helm/soperator/crds/slurmcluster-crd.yaml @@ -28144,6 +28144,54 @@ spec: Otherwise, 'optional'. type: boolean type: object + ncclInspectorPreConf: + default: + dumpDir: /opt/soperator-outputs/nccl_profiles/%j/%s + dumpThreadIntervalMicroseconds: 1000000 + enabled: false + pluginPath: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so + profilerPlugin: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so + required: false + description: NcclInspectorPreConf represents the NCCL Inspector + Pre-Configuration SPANK plugin configuration. + properties: + dumpDir: + default: /opt/soperator-outputs/nccl_profiles/%j/%s + description: |- + DumpDir defines a directory path where NCCL profiles will be dumped into. + + If the path does not exist, it will be created by the plugin. + type: string + dumpThreadIntervalMicroseconds: + default: 1000000 + description: DumpThreadIntervalMicroseconds defines an interval + between NCCL Inspector dump thread runs in microseconds. + format: int64 + type: integer + enabled: + default: false + description: Enabled defines whether to enable the plugin. + type: boolean + pluginPath: + default: /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so + description: |- + PluginPath defines a file path to the NCCL Inspector Pre-Configuration's SO file. 
+ + Deprecated: It makes the ldconfig execution being not required. + This is going to be removed when the plugin installation is established. + type: string + profilerPlugin: + default: /usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so + description: ProfilerPlugin defines a file path to NCCL Inspector's + SO file. + type: string + required: + default: false + description: |- + Required defines if NCCL Inspector Pre-Configuration is 'required' for SLURM. + Otherwise, 'optional'. + type: boolean + type: object pyxis: default: containerImageSave: /var/cache/enroot-container-images/ diff --git a/internal/controller/clustercontroller/reconcile.go b/internal/controller/clustercontroller/reconcile.go index 8cc3c1604..cdd62ae09 100644 --- a/internal/controller/clustercontroller/reconcile.go +++ b/internal/controller/clustercontroller/reconcile.go @@ -149,6 +149,7 @@ func (r *SlurmClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request slurmCluster.Spec.PlugStackConfig.Pyxis.SetDefaults() slurmCluster.Spec.PlugStackConfig.NcclDebug.SetDefaults() + slurmCluster.Spec.PlugStackConfig.NcclInspectorPreConf.SetDefaults() slurmCluster.Spec.SlurmNodes.Exporter.SetDefaults() // If cluster marked for deletion, we have nothing to do From 0dd72f63553a14c7303980a8ecf14e6704942f76 Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Mon, 20 Apr 2026 13:27:25 +0200 Subject: [PATCH 2/2] [ADD] Render NCCL Inspector Pre-Configuration SPANK plugin in plugstack --- internal/render/common/configmap.go | 15 ++++++++++ internal/render/common/configmap_test.go | 37 ++++++++++++++++++++---- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/internal/render/common/configmap.go b/internal/render/common/configmap.go index 9092b7433..648ebbf2c 100644 --- a/internal/render/common/configmap.go +++ b/internal/render/common/configmap.go @@ -600,6 +600,21 @@ func generateSpankConfig(cluster *values.SlurmCluster) renderutils.ConfigFile { )) } + { + opts := 
cluster.PlugStackConfig.NcclInspectorPreConf.DeepCopy() + res.AddLine(strings.Join( + []string{ + utils.Ternary(opts.Required, "required", "optional"), + utils.Ternary(opts.PluginPath != "", opts.PluginPath, "/usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so"), + fmt.Sprintf("enabled=%d", utils.Ternary(opts.Enabled != nil && *opts.Enabled, 1, 0)), + fmt.Sprintf("profiler-plugin=%s", utils.Ternary(opts.ProfilerPlugin != "", opts.ProfilerPlugin, "/usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so")), + fmt.Sprintf("dump-dir=%s", utils.Ternary(opts.DumpDir != "", opts.DumpDir, "/opt/soperator-outputs/nccl_profiles/%j/%s")), + fmt.Sprintf("dump-thread-interval-microseconds=%d", utils.Ternary(opts.DumpThreadIntervalMicroseconds > 0, opts.DumpThreadIntervalMicroseconds, 1000000)), + }, + " ", + )) + } + for _, plugin := range cluster.PlugStackConfig.CustomPlugins { conf := []string{ utils.Ternary(plugin.Required, "required", "optional"), diff --git a/internal/render/common/configmap_test.go b/internal/render/common/configmap_test.go index 62dbc3d7e..1dc579ff3 100644 --- a/internal/render/common/configmap_test.go +++ b/internal/render/common/configmap_test.go @@ -368,7 +368,7 @@ func TestRenderPlugstack(t *testing.T) { assert.Contains(t, result, "required spank_pyxis.so runtime_path=/run/pyxis execute_entrypoint=0 container_scope=global sbatch_support=1 container_image_save=/tmp/") }) - t.Run("NCCL no options", func(t *testing.T) { + t.Run("NCCL Debug no options", func(t *testing.T) { result := generateSpankConfig(&values.SlurmCluster{ PlugStackConfig: slurmv1.PlugStackConfig{ NcclDebug: slurmv1.PluginConfigNcclDebug{}, @@ -378,7 +378,7 @@ func TestRenderPlugstack(t *testing.T) { assert.Contains(t, result, "optional spanknccldebug.so enabled=0 log-level=INFO out-file=0 out-dir=/opt/soperator-outputs/nccl_logs out-stdout=0") }) - t.Run("NCCL options", func(t *testing.T) { + t.Run("NCCL Debug options", func(t *testing.T) { result := 
generateSpankConfig(&values.SlurmCluster{ PlugStackConfig: slurmv1.PlugStackConfig{ NcclDebug: slurmv1.PluginConfigNcclDebug{ @@ -395,6 +395,33 @@ func TestRenderPlugstack(t *testing.T) { assert.Contains(t, result, "required spanknccldebug.so enabled=1 log-level=TRACE out-file=1 out-dir=/tmp out-stdout=1") }) + t.Run("NCCL Inspector Pre-Configuration no options", func(t *testing.T) { + result := generateSpankConfig(&values.SlurmCluster{ + PlugStackConfig: slurmv1.PlugStackConfig{ + NcclInspectorPreConf: slurmv1.PluginConfigNcclInspectorPreConf{}, + }, + }).Render() + assert.NotEmpty(t, result) + assert.Contains(t, result, "optional /usr/lib/x86_64-linux-gnu/slurm/spank_nccl_inspector_preconf.so enabled=0 profiler-plugin=/usr/lib/x86_64-linux-gnu/libnccl-profiler-inspector.so dump-dir=/opt/soperator-outputs/nccl_profiles/%j/%s dump-thread-interval-microseconds=1000000") + }) + + t.Run("NCCL Inspector Pre-Configuration options", func(t *testing.T) { + result := generateSpankConfig(&values.SlurmCluster{ + PlugStackConfig: slurmv1.PlugStackConfig{ + NcclInspectorPreConf: slurmv1.PluginConfigNcclInspectorPreConf{ + Required: true, + PluginPath: "/usr/lib/s.so", + Enabled: ptr.To(true), + ProfilerPlugin: "/usr/lib/a.so", + DumpDir: "/tmp", + DumpThreadIntervalMicroseconds: 500, + }, + }, + }).Render() + assert.NotEmpty(t, result) + assert.Contains(t, result, "required /usr/lib/s.so enabled=1 profiler-plugin=/usr/lib/a.so dump-dir=/tmp dump-thread-interval-microseconds=500") + }) + t.Run("Custom not provided", func(t *testing.T) { result := generateSpankConfig(&values.SlurmCluster{ PlugStackConfig: slurmv1.PlugStackConfig{ @@ -402,7 +429,7 @@ func TestRenderPlugstack(t *testing.T) { }, }).Render() assert.NotEmpty(t, result) - assert.Equal(t, 3, len(strings.Split(result, "\n"))) + assert.Equal(t, 4, len(strings.Split(result, "\n"))) }) t.Run("Custom no options", func(t *testing.T) { @@ -414,7 +441,7 @@ func TestRenderPlugstack(t *testing.T) { }, }).Render() 
assert.NotEmpty(t, result) - assert.Equal(t, 4, len(strings.Split(result, "\n"))) + assert.Equal(t, 5, len(strings.Split(result, "\n"))) assert.Contains(t, result, "optional /lol/kek.so") }) @@ -437,7 +464,7 @@ func TestRenderPlugstack(t *testing.T) { }, }).Render() assert.NotEmpty(t, result) - assert.Equal(t, 5, len(strings.Split(result, "\n"))) + assert.Equal(t, 6, len(strings.Split(result, "\n"))) assert.Contains(t, result, "required /lol/kek.so lol=kek") assert.Contains(t, result, "required /kek/lol.so kek=lol") })