From 06fbb0684666f0dc9b7435b6f86f7dd40fdc4834 Mon Sep 17 00:00:00 2001 From: Foreman Bot Date: Sun, 21 Jun 2026 01:05:48 -0700 Subject: [PATCH] fix: preserve agent-written schedulingStatus on InferenceService status update The metal-agent writes status.schedulingStatus and status.schedulingMessage (e.g. "MemoryCheckFailed", "InsufficientMemory") when admission rejects an InferenceService. The controller's reconcile loop subsequently updated the status without preserving those fields, so they read back empty almost immediately. The fix removes the else branch in updateStatusWithSchedulingInfo that unconditionally cleared SchedulingStatus, SchedulingMessage, and WaitingFor when schedulingInfo is nil. When schedulingInfo is nil (the common case for non-GPU-scheduling scenarios), the existing agent-written values are now preserved. A regression test verifies that agent-written scheduling fields survive a controller reconcile. Fixes #643 Signed-off-by: Foreman Bot --- .../inferenceservice_reconcile_test.go | 62 +++++++++++++++++++ internal/controller/status_builder.go | 7 +-- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/internal/controller/inferenceservice_reconcile_test.go b/internal/controller/inferenceservice_reconcile_test.go index d411159f..d5ad6705 100644 --- a/internal/controller/inferenceservice_reconcile_test.go +++ b/internal/controller/inferenceservice_reconcile_test.go @@ -1036,5 +1036,67 @@ var _ = Describe("Reconcile lifecycle", func() { Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ModelCachePVCName, Namespace: "default"}, pvc)).To(Succeed()) Expect(pvc.OwnerReferences).To(BeEmpty()) }) + + It("should preserve agent-written schedulingStatus on status update", func() { + modelName := "model-sched-preserve" + isvcName := "isvc-sched-preserve" + + model := &inferencev1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: "default"}, + Spec: inferencev1alpha1.ModelSpec{ + Source: "https://example.com/model.gguf", + 
Hardware: &inferencev1alpha1.HardwareSpec{Accelerator: "cpu"}, + }, + } + Expect(k8sClient.Create(ctx, model)).To(Succeed()) + defer func() { _ = k8sClient.Delete(ctx, model) }() + + model.Status.Phase = PhaseReady + Expect(k8sClient.Status().Update(ctx, model)).To(Succeed()) + + replicas := int32(1) + isvc := &inferencev1alpha1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{Name: isvcName, Namespace: "default"}, + Spec: inferencev1alpha1.InferenceServiceSpec{ + ModelRef: modelName, + Replicas: &replicas, + Image: "ghcr.io/ggml-org/llama.cpp:server", + }, + } + Expect(k8sClient.Create(ctx, isvc)).To(Succeed()) + defer func() { + _ = k8sClient.Delete(ctx, isvc) + dep := &appsv1.Deployment{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, dep); err == nil { + _ = k8sClient.Delete(ctx, dep) + } + svc := &corev1.Service{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, svc); err == nil { + _ = k8sClient.Delete(ctx, svc) + } + }() + + // Simulate the metal-agent writing a scheduling rejection. + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, isvc)).To(Succeed()) + isvc.Status.SchedulingStatus = "MemoryCheckFailed" + isvc.Status.SchedulingMessage = "host memory insufficient for model" + Expect(k8sClient.Status().Update(ctx, isvc)).To(Succeed()) + + reconciler := &InferenceServiceReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + InitContainerImage: "docker.io/curlimages/curl:8.18.0", + } + _, err := reconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: types.NamespacedName{Name: isvcName, Namespace: "default"}, + }) + Expect(err).NotTo(HaveOccurred()) + + updated := &inferencev1alpha1.InferenceService{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, updated)).To(Succeed()) + // The controller must not clobber the agent-written scheduling fields. 
+ Expect(updated.Status.SchedulingStatus).To(Equal("MemoryCheckFailed")) + Expect(updated.Status.SchedulingMessage).To(Equal("host memory insufficient for model")) + }) }) }) diff --git a/internal/controller/status_builder.go b/internal/controller/status_builder.go index 0ce67a83..08d7f77e 100644 --- a/internal/controller/status_builder.go +++ b/internal/controller/status_builder.go @@ -135,11 +135,10 @@ func (r *InferenceServiceReconciler) updateStatusWithSchedulingInfo( isvc.Status.SchedulingStatus = schedulingInfo.Status isvc.Status.SchedulingMessage = schedulingInfo.Message isvc.Status.WaitingFor = schedulingInfo.WaitingFor - } else { - isvc.Status.SchedulingStatus = "" - isvc.Status.SchedulingMessage = "" - isvc.Status.WaitingFor = "" } + // When schedulingInfo is nil, leave SchedulingStatus, SchedulingMessage and WaitingFor + // untouched so agent-written values (e.g. InsufficientMemory, MemoryCheckFailed) survive + // the controller's next status update (#643). NOTE(review): values this function wrote from an earlier non-nil schedulingInfo also persist unless cleared elsewhere; confirm. if phase == PhaseWaitingForGPU { queuePos, err := r.calculateQueuePosition(ctx, isvc)