From ae157fab86dc1518e23e9eedfff3443bb2506db4 Mon Sep 17 00:00:00 2001 From: Foreman Bot Date: Sun, 21 Jun 2026 03:13:41 -0700 Subject: [PATCH] fix: clear stale SchedulingStatus on successful memory check The metal-agent sets status.schedulingStatus (and the accompanying status.schedulingMessage) to "InsufficientMemory" or "MemoryCheckFailed" on a failed memory admission, but the path taken when the memory check passes returned nil without clearing those fields. Since PR #774 the controller no longer clears them (it correctly preserves agent-owned scheduling status). Net result: once set, SchedulingStatus persisted after the condition resolved, so a service that recovered kept showing a stale InsufficientMemory. On a successful memory check, clear SchedulingStatus and SchedulingMessage if either is set, and update the InferenceService status. The update is best-effort: a failure is logged as a warning and admission still succeeds. Add a regression test that verifies stale status is cleared. Fixes #777 Signed-off-by: Foreman Bot --- pkg/agent/agent.go | 10 ++++++++++ pkg/agent/memory_admission_test.go | 31 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index 3e8e18f6..9272683c 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -1453,6 +1453,16 @@ func (a *MetalAgent) checkMemoryAdmission( "headroom", formatMemory(budget.HeadroomBytes), "source", resolved.Source, ) + + // Clear any stale scheduling status from a previous failed check. 
+ if isvc.Status.SchedulingStatus != "" || isvc.Status.SchedulingMessage != "" { + isvc.Status.SchedulingStatus = "" + isvc.Status.SchedulingMessage = "" + if updateErr := a.config.K8sClient.Status().Update(ctx, isvc); updateErr != nil { + a.logger.Warnw("failed to clear scheduling status", "error", updateErr) + } + } + return nil } diff --git a/pkg/agent/memory_admission_test.go b/pkg/agent/memory_admission_test.go index 61b82abb..ae6024d5 100644 --- a/pkg/agent/memory_admission_test.go +++ b/pkg/agent/memory_admission_test.go @@ -250,3 +250,34 @@ func TestCheckMemoryAdmission_PassesWithinBudget(t *testing.T) { t.Fatalf("model within budget should pass admission, got: %v", err) } } + +// Regression test for #777: when a previous memory check failed and set +// SchedulingStatus/Message, a subsequent passing check must clear those +// fields so the status reflects the current healthy state. +func TestCheckMemoryAdmission_ClearsStaleSchedulingStatus(t *testing.T) { + isvc := newAdmissionTestISVC() + isvc.Status.SchedulingStatus = "InsufficientMemory" + isvc.Status.SchedulingMessage = "estimated 100 GiB required, budget 6 GiB" + + agent := newAdmissionTestAgent(t, isvc, MetalAgentConfig{ + MemoryProvider: &mockMemoryProvider{totalBytes: 128 * 1024 * 1024 * 1024}, + MemoryFraction: 0.75, + }) + model := newAdmissionTestModel("https://model-host.invalid/model.gguf", "20.0 GiB") + + if err := agent.checkMemoryAdmission(context.Background(), isvc, model, 2048, "", ""); err != nil { + t.Fatalf("model within budget should pass admission, got: %v", err) + } + + updated := &inferencev1alpha1.InferenceService{} + if getErr := agent.config.K8sClient.Get(context.Background(), + types.NamespacedName{Namespace: "default", Name: "test-isvc"}, updated); getErr != nil { + t.Fatalf("failed to re-fetch InferenceService: %v", getErr) + } + if updated.Status.SchedulingStatus != "" { + t.Errorf("SchedulingStatus = %q, want empty (stale status should be cleared)", 
updated.Status.SchedulingStatus) + } + if updated.Status.SchedulingMessage != "" { + t.Errorf("SchedulingMessage = %q, want empty (stale message should be cleared)", updated.Status.SchedulingMessage) + } +}