diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index 3e8e18f6..9272683c 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -1453,6 +1453,16 @@ func (a *MetalAgent) checkMemoryAdmission( "headroom", formatMemory(budget.HeadroomBytes), "source", resolved.Source, ) + + // Clear any stale scheduling status from a previous failed check. + if isvc.Status.SchedulingStatus != "" || isvc.Status.SchedulingMessage != "" { + isvc.Status.SchedulingStatus = "" + isvc.Status.SchedulingMessage = "" + if updateErr := a.config.K8sClient.Status().Update(ctx, isvc); updateErr != nil { + a.logger.Warnw("failed to clear scheduling status", "error", updateErr) + } + } + return nil } diff --git a/pkg/agent/memory_admission_test.go b/pkg/agent/memory_admission_test.go index 61b82abb..ae6024d5 100644 --- a/pkg/agent/memory_admission_test.go +++ b/pkg/agent/memory_admission_test.go @@ -250,3 +250,34 @@ func TestCheckMemoryAdmission_PassesWithinBudget(t *testing.T) { t.Fatalf("model within budget should pass admission, got: %v", err) } } + +// Regression test for #777: when a previous memory check failed and set +// SchedulingStatus/Message, a subsequent passing check must clear those +// fields so the status reflects the current healthy state. 
+func TestCheckMemoryAdmission_ClearsStaleSchedulingStatus(t *testing.T) {
+	// Seed the status that an earlier failed admission check would have left behind.
+	isvc := newAdmissionTestISVC()
+	isvc.Status.SchedulingStatus = "InsufficientMemory"
+	isvc.Status.SchedulingMessage = "estimated 100 GiB required, budget 6 GiB"
+	agent := newAdmissionTestAgent(t, isvc, MetalAgentConfig{
+		MemoryProvider: &mockMemoryProvider{totalBytes: 128 * 1024 * 1024 * 1024},
+		MemoryFraction: 0.75,
+	})
+	model := newAdmissionTestModel("https://model-host.invalid/model.gguf", "20.0 GiB")
+
+	if err := agent.checkMemoryAdmission(context.Background(), isvc, model, 2048, "", ""); err != nil {
+		t.Fatalf("model within budget should pass admission, got: %v", err)
+	}
+	// Assert on the persisted object, keyed off isvc itself rather than repeated literals.
+	key := types.NamespacedName{Namespace: isvc.GetNamespace(), Name: isvc.GetName()}
+	updated := &inferencev1alpha1.InferenceService{}
+	if err := agent.config.K8sClient.Get(context.Background(), key, updated); err != nil {
+		t.Fatalf("failed to re-fetch InferenceService: %v", err)
+	}
+	if got := updated.Status.SchedulingStatus; got != "" {
+		t.Errorf("SchedulingStatus = %q, want empty (stale status should be cleared)", got)
+	}
+	if got := updated.Status.SchedulingMessage; got != "" {
+		t.Errorf("SchedulingMessage = %q, want empty (stale message should be cleared)", got)
+	}
+}