From 723915f6b6c924799339448c0389e606bda7f855 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Sat, 21 Mar 2026 00:48:21 +0000 Subject: [PATCH 01/15] npd --- pkg/status/collector.go | 79 +++++++++++++++++++++++++++++++++++++++++ pkg/status/types.go | 2 ++ 2 files changed, 81 insertions(+) diff --git a/pkg/status/collector.go b/pkg/status/collector.go index dc4d72b4..46be075d 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -3,9 +3,11 @@ package status import ( "context" "encoding/json" + "fmt" "os" "os/exec" "path/filepath" + "strconv" "strings" "time" @@ -60,6 +62,9 @@ func (c *Collector) CollectStatus(ctx context.Context) (*NodeStatus, error) { } status.ArcStatus = arcStatus + // Check if reboot is needed node condition + status.NeedReboot = c.checkRebootNeeded(ctx) + return status, nil } @@ -297,3 +302,77 @@ func GetStatusFilePath() string { // Fallback to temp directory for testing/development return filepath.Join("/tmp/aks-flex-node", "status.json") } + +func getBootTime() (time.Time, error) { + data, err := os.ReadFile("/proc/uptime") + if err != nil { + return time.Time{}, fmt.Errorf("failed to read /proc/uptime: %w", err) + } + + // /proc/uptime contains two numbers: uptime in seconds and idle time + // We only need the first number + fields := strings.Fields(string(data)) + if len(fields) < 1 { + return time.Time{}, fmt.Errorf("invalid /proc/uptime format") + } + + uptimeSeconds, err := strconv.ParseFloat(fields[0], 64) + if err != nil { + return time.Time{}, fmt.Errorf("failed to parse uptime: %w", err) + } + + // Calculate boot time: current time - uptime + bootTime := time.Now().Add(-time.Duration(uptimeSeconds * float64(time.Second))) + return bootTime, nil +} + +func getNodeName() (string, error) { + host, err := os.Hostname() + if err != nil { + return "", fmt.Errorf("failed to get hostname: %w", err) + } + + nodeName := strings.TrimSpace(host) + if nodeName == "" { + return "", fmt.Errorf("node name is empty") + } + + return nodeName, nil +} + +func (c *Collector) checkRebootNeeded(ctx context.Context) bool { + hostBootTime, err := getBootTime() + if err != nil { + c.logger.Warnf("Failed to get boot time: %v", err) + return false + } + nodeName, err := getNodeName() + if err != nil { + c.logger.Errorf("failed to get node name: %s", err.Error()) + return false + } + + clientset, err := kube.KubeletClientset() + if err != nil { + c.logger.Errorf("failed to get kubelet clientset: %s", err.Error()) + return false + } + // Get the node + node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) + if err != nil { + c.logger.Errorf("failed to get node: %s", err.Error()) + return false + } + + for _, condition := range node.Status.Conditions { + switch condition.Type { + case "KernelDeadlock": + if condition.Status == "True" && condition.LastTransitionTime.Time.After(hostBootTime) { + c.logger.Infof("Node has a kernel deadlock since %s, rebooting...", + condition.LastTransitionTime.Time.Format("2006-01-02 15:04:05")) + return true + } + } + } + return false +} diff --git a/pkg/status/types.go b/pkg/status/types.go index ea32d816..c2c1aabf 100644 --- a/pkg/status/types.go +++ b/pkg/status/types.go @@ -41,6 +41,8 @@ type NodeStatus struct { LastUpdatedBy LastUpdatedBy `json:"lastUpdatedBy,omitempty"` LastUpdatedReason LastUpdatedReason `json:"lastUpdatedReason,omitempty"` AgentVersion string `json:"agentVersion"` + + NeedReboot bool `json:"needReboot,omitempty"` } // ArcStatus contains Azure Arc machine registration and connection status From ac31627ebd0c6c96c298e102bc7acbef89fae1ce Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Sat, 21 Mar 2026 01:04:21 +0000 Subject: [PATCH 02/15] npd --- pkg/drift/defaults.go | 1 + pkg/drift/detector.go | 1 + pkg/drift/remediation.go | 45 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/pkg/drift/defaults.go b/pkg/drift/defaults.go index b4f82be1..9a61d7d9 100644 --- a/pkg/drift/defaults.go +++ b/pkg/drift/defaults.go @@ -7,5 +7,6 @@ package drift func DefaultDetectors() []Detector { return []Detector{ NewKubernetesVersionDetector(), + NewRebootDetector(), } } diff --git a/pkg/drift/detector.go b/pkg/drift/detector.go index 2b7608ea..400e50d1 100644 --- a/pkg/drift/detector.go +++ b/pkg/drift/detector.go @@ -25,6 +25,7 @@ type RemediationAction string const ( RemediationActionUnspecified RemediationAction = "" RemediationActionKubernetesUpgrade RemediationAction = "kubernetes-upgrade" + RemediationActionReboot RemediationAction = "reboot" ) // Remediation describes what the agent should do to address a drift. diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index bcf1bd90..72137b01 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "os" + "os/exec" "sync/atomic" "time" @@ -154,6 +155,13 @@ func detectAndRemediate( logger.Info("Kubernetes upgrade remediation completed successfully") return detectErr + case RemediationActionReboot: + if err := runRebootRemediation(ctx, logger); err != nil { + return fmt.Errorf("reboot remediation failed: %w", err) + } + logger.Info("Node reboot initiated successfully") + return detectErr + default: return fmt.Errorf("unsupported drift remediation action: %q", plan.Action) } @@ -267,6 +275,43 @@ func runKubernetesUpgradeRemediation( return result, err } +func runRebootRemediation( + ctx context.Context, + logger *logrus.Logger, +) error { + // Key design points: + // - Only reboot if aks-flex-node-agent is running as a systemd service + // - If not running under systemd, skip reboot (agent may be running in development/test mode) + // - Use systemctl reboot for a clean shutdown + if logger == nil { + logger = logrus.New() + } + + // Check if aks-flex-node-agent is managed by systemd. + // We use 'systemctl is-active' to check if the service is running under systemd. + checkCmd := exec.CommandContext(ctx, "systemctl", "is-active", "aks-flex-node-agent.service") + if err := checkCmd.Run(); err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + logger.WithError(exitErr).Warn("aks-flex-node-agent is not running as a systemd service; skipping reboot") + return fmt.Errorf("agent not managed by systemd, reboot skipped") + } + logger.WithError(err).Warn("Failed to check systemd service status; skipping reboot") + return fmt.Errorf("failed to check systemd service status: %w", err) + } + + logger.Info("Initiating system reboot via systemctl") + + // Use systemctl reboot for a clean shutdown. + // This will gracefully stop services and sync filesystems before rebooting. + cmd := exec.CommandContext(ctx, "systemctl", "reboot") + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to initiate reboot: %w", err) + } + + // Don't wait for the command to complete; the system will be shutting down. + return nil +} + // handleExecutionResult mirrors main's handleExecutionResult but lives in drift so remediation // can share the same logging and error semantics. func handleExecutionResult(result *bootstrapper.ExecutionResult, operation string, logger *logrus.Logger) error { From 6dee5445638ed7856be42e8fbc77848f05630e4b Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Sat, 21 Mar 2026 01:05:15 +0000 Subject: [PATCH 03/15] npd --- pkg/drift/node_reboot.go | 53 ++++++++++++++++++++++ pkg/drift/node_reboot_test.go | 82 +++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 pkg/drift/node_reboot.go create mode 100644 pkg/drift/node_reboot_test.go diff --git a/pkg/drift/node_reboot.go b/pkg/drift/node_reboot.go new file mode 100644 index 00000000..7080e008 --- /dev/null +++ b/pkg/drift/node_reboot.go @@ -0,0 +1,53 @@ +package drift + +import ( + "context" + + "github.com/Azure/AKSFlexNode/pkg/config" + "github.com/Azure/AKSFlexNode/pkg/spec" + "github.com/Azure/AKSFlexNode/pkg/status" +) + +const NodeRebootFindingID = "node-reboot" + +type RebootDetector struct{} + +func NewRebootDetector() *RebootDetector { + return &RebootDetector{} +} + +func (d *RebootDetector) Name() string { + return "RebootDetector" +} + +func (d *RebootDetector) Detect( + ctx context.Context, + _ *config.Config, + _ *spec.ManagedClusterSpec, + statusSnap *status.NodeStatus, +) ([]Finding, error) { + if ctx != nil { + if err := ctx.Err(); err != nil { + return nil, err + } + } + + if statusSnap == nil { + return nil, nil + } + + if !statusSnap.NeedReboot { + return nil, nil + } + + return []Finding{ + { + ID: NodeRebootFindingID, + Title: "Node reboot required", + Details: "Node status indicates a reboot is needed", + Remediation: Remediation{ + Action: RemediationActionReboot, + }, + }, + }, nil +} diff --git a/pkg/drift/node_reboot_test.go b/pkg/drift/node_reboot_test.go new file mode 100644 index 00000000..2082c135 --- /dev/null +++ b/pkg/drift/node_reboot_test.go @@ -0,0 +1,82 @@ +package drift + +import ( + "context" + "testing" + + "github.com/Azure/AKSFlexNode/pkg/status" +) + +func TestRebootDetector_Name(t *testing.T) { + t.Parallel() + d := NewRebootDetector() + if name := d.Name(); name != "RebootDetector" { + t.Errorf("expected name %q, got %q", "RebootDetector", name) + } +} + +func TestRebootDetector_NilStatus_NoFindings(t *testing.T) { + t.Parallel() + d := NewRebootDetector() + findings, err := d.Detect(context.Background(), nil, nil, nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(findings) != 0 { + t.Errorf("expected no findings, got %d", len(findings)) + } +} + +func TestRebootDetector_NeedRebootFalse_NoFindings(t *testing.T) { + t.Parallel() + d := NewRebootDetector() + statusSnap := &status.NodeStatus{ + NeedReboot: false, + } + findings, err := d.Detect(context.Background(), nil, nil, statusSnap) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(findings) != 0 { + t.Errorf("expected no findings, got %d", len(findings)) + } +} + +func TestRebootDetector_NeedRebootTrue_ReturnsFinding(t *testing.T) { + t.Parallel() + d := NewRebootDetector() + statusSnap := &status.NodeStatus{ + NeedReboot: true, + } + findings, err := d.Detect(context.Background(), nil, nil, statusSnap) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + + f := findings[0] + if f.ID != NodeRebootFindingID { + t.Errorf("expected ID %q, got %q", NodeRebootFindingID, f.ID) + } + if f.Title != "Node reboot required" { + t.Errorf("unexpected title: %q", f.Title) + } + if f.Remediation.Action != RemediationActionReboot { + t.Errorf("expected action %q, got %q", RemediationActionReboot, f.Remediation.Action) + } +} + +func TestRebootDetector_CanceledContext_ReturnsError(t *testing.T) { + t.Parallel() + d := NewRebootDetector() + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + statusSnap := &status.NodeStatus{NeedReboot: true} + _, err := d.Detect(ctx, nil, nil, statusSnap) + if err == nil { + t.Fatal("expected error from canceled context") + } +} From 915ec604a9a722538f73288a73bc8efdc2716a9b Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Sat, 21 Mar 2026 01:09:36 +0000 Subject: [PATCH 04/15] npd --- pkg/drift/node_reboot.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pkg/drift/node_reboot.go b/pkg/drift/node_reboot.go index 7080e008..17bae9c2 100644 --- a/pkg/drift/node_reboot.go +++ b/pkg/drift/node_reboot.go @@ -26,11 +26,6 @@ func (d *RebootDetector) Detect( _ *spec.ManagedClusterSpec, statusSnap *status.NodeStatus, ) ([]Finding, error) { - if ctx != nil { - if err := ctx.Err(); err != nil { - return nil, err - } - } if statusSnap == nil { return nil, nil From 0875cc724aae98263fe200acfe417721b2bc1ce9 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Sat, 21 Mar 2026 01:13:57 +0000 Subject: [PATCH 05/15] npd --- pkg/status/collector.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/status/collector.go b/pkg/status/collector.go index 46be075d..d1b5c573 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -367,9 +367,9 @@ func (c *Collector) checkRebootNeeded(ctx context.Context) bool { for _, condition := range node.Status.Conditions { switch condition.Type { case "KernelDeadlock": - if condition.Status == "True" && condition.LastTransitionTime.Time.After(hostBootTime) { + if condition.Status == "True" && condition.LastTransitionTime.After(hostBootTime) { c.logger.Infof("Node has a kernel deadlock since %s, rebooting...", - condition.LastTransitionTime.Time.Format("2006-01-02 15:04:05")) + condition.LastTransitionTime.Format("2006-01-02 15:04:05")) return true } } From 42b396ced86a9e33948546b0ffe4e49e12b63589 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Fri, 20 Mar 2026 18:17:13 -0700 Subject: [PATCH 06/15] Update pkg/status/collector.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pkg/status/collector.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/status/collector.go b/pkg/status/collector.go index d1b5c573..8d6147bc 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -357,8 +357,12 @@ func (c *Collector) checkRebootNeeded(ctx context.Context) bool { c.logger.Errorf("failed to get kubelet clientset: %s", err.Error()) return false } - // Get the node - node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) + + // Get the node with a timeout and respecting the passed-in context + ctxWithTimeout, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + node, err := clientset.CoreV1().Nodes().Get(ctxWithTimeout, nodeName, metav1.GetOptions{}) if err != nil { c.logger.Errorf("failed to get node: %s", err.Error()) return false From 55ed1bb219e6daafba44c01a12d3106847169bb4 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Fri, 20 Mar 2026 18:17:27 -0700 Subject: [PATCH 07/15] Update pkg/status/collector.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pkg/status/collector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/status/collector.go b/pkg/status/collector.go index 8d6147bc..ae6462a1 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -371,7 +371,7 @@ func (c *Collector) checkRebootNeeded(ctx context.Context) bool { for _, condition := range node.Status.Conditions { switch condition.Type { case "KernelDeadlock": - if condition.Status == "True" && condition.LastTransitionTime.After(hostBootTime) { + if condition.Status == corev1.ConditionTrue && condition.LastTransitionTime.Time.After(hostBootTime) { c.logger.Infof("Node has a kernel deadlock since %s, rebooting...", condition.LastTransitionTime.Format("2006-01-02 15:04:05")) return true From 1c3b73361b616d70cf7b0db3c15e4e6d86f8ec48 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Fri, 20 Mar 2026 18:17:47 -0700 Subject: [PATCH 08/15] Update pkg/drift/remediation.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pkg/drift/remediation.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index 72137b01..7609f4f1 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -293,7 +293,8 @@ func runRebootRemediation( if err := checkCmd.Run(); err != nil { if exitErr, ok := err.(*exec.ExitError); ok { logger.WithError(exitErr).Warn("aks-flex-node-agent is not running as a systemd service; skipping reboot") - return fmt.Errorf("agent not managed by systemd, reboot skipped") + // Not running under systemd is an expected scenario (e.g., dev/test); treat as a no-op, not an error. + return nil } logger.WithError(err).Warn("Failed to check systemd service status; skipping reboot") return fmt.Errorf("failed to check systemd service status: %w", err) From 7cbe9d61dd6153590c780240a6edd7f338e33b44 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Sat, 21 Mar 2026 01:19:45 +0000 Subject: [PATCH 09/15] npd --- pkg/status/collector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/status/collector.go b/pkg/status/collector.go index ae6462a1..cf8852b4 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -371,7 +371,7 @@ func (c *Collector) checkRebootNeeded(ctx context.Context) bool { for _, condition := range node.Status.Conditions { switch condition.Type { case "KernelDeadlock": - if condition.Status == corev1.ConditionTrue && condition.LastTransitionTime.Time.After(hostBootTime) { + if condition.Status == corev1.ConditionTrue && condition.LastTransitionTime.After(hostBootTime) { c.logger.Infof("Node has a kernel deadlock since %s, rebooting...", condition.LastTransitionTime.Format("2006-01-02 15:04:05")) return true From 94ee58c3977f6375b360a268bb4142d88960fcda Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Mon, 23 Mar 2026 22:28:34 +0000 Subject: [PATCH 10/15] npd --- pkg/drift/remediation.go | 20 +++++++++++++++----- pkg/status/collector.go | 26 ++++++++------------------ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index 7609f4f1..e2956b9b 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -17,6 +17,7 @@ import ( "github.com/Azure/AKSFlexNode/pkg/kube" "github.com/Azure/AKSFlexNode/pkg/spec" "github.com/Azure/AKSFlexNode/pkg/status" + "github.com/Azure/AKSFlexNode/pkg/systemd" ) const driftKubernetesUpgradeOperation = "drift-kubernetes-upgrade" @@ -29,6 +30,8 @@ const ( upgradeStepUncordon = "uncordon" ) +const agentServiceName = "aks-flex-node-agent.service" + // maxManagedClusterSpecAge is a safety guard to avoid acting on very stale spec snapshots. // In normal operation we run drift immediately after a successful spec collection, so this // should rarely block remediation. @@ -288,11 +291,12 @@ func runRebootRemediation( } // Check if aks-flex-node-agent is managed by systemd. - // We use 'systemctl is-active' to check if the service is running under systemd. - checkCmd := exec.CommandContext(ctx, "systemctl", "is-active", "aks-flex-node-agent.service") - if err := checkCmd.Run(); err != nil { - if exitErr, ok := err.(*exec.ExitError); ok { - logger.WithError(exitErr).Warn("aks-flex-node-agent is not running as a systemd service; skipping reboot") + // We use GetUnitStatus to check if the service is active and running under systemd. + mgr := systemd.New() + status, err := mgr.GetUnitStatus(ctx, agentServiceName) + if err != nil { + if errors.Is(err, systemd.ErrUnitNotFound) { + logger.Warn("aks-flex-node-agent is not running as a systemd service; skipping reboot") // Not running under systemd is an expected scenario (e.g., dev/test); treat as a no-op, not an error. return nil } @@ -300,6 +304,12 @@ func runRebootRemediation( return fmt.Errorf("failed to check systemd service status: %w", err) } + // Only reboot if the service is active + if status.ActiveState != systemd.UnitActiveStateActive { + logger.Warnf("aks-flex-node-agent service is not active (state: %s); skipping reboot", status.ActiveState) + return nil + } + logger.Info("Initiating system reboot via systemctl") // Use systemctl reboot for a clean shutdown. diff --git a/pkg/status/collector.go b/pkg/status/collector.go index cf8852b4..79740a7b 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -7,7 +7,6 @@ import ( "os" "os/exec" "path/filepath" - "strconv" "strings" "time" @@ -15,6 +14,7 @@ import ( "github.com/Azure/AKSFlexNode/pkg/kube" "github.com/Azure/AKSFlexNode/pkg/utils" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -304,25 +304,15 @@ func GetStatusFilePath() string { } func getBootTime() (time.Time, error) { - data, err := os.ReadFile("/proc/uptime") - if err != nil { - return time.Time{}, fmt.Errorf("failed to read /proc/uptime: %w", err) - } - - // /proc/uptime contains two numbers: uptime in seconds and idle time - // We only need the first number - fields := strings.Fields(string(data)) - if len(fields) < 1 { - return time.Time{}, fmt.Errorf("invalid /proc/uptime format") - } - - uptimeSeconds, err := strconv.ParseFloat(fields[0], 64) - if err != nil { - return time.Time{}, fmt.Errorf("failed to parse uptime: %w", err) + var sysinfo unix.Sysinfo_t + if err := unix.Sysinfo(&sysinfo); err != nil { + return time.Time{}, fmt.Errorf("failed to get system info: %w", err) } // Calculate boot time: current time - uptime - bootTime := time.Now().Add(-time.Duration(uptimeSeconds * float64(time.Second))) + // Sysinfo.Uptime is in seconds since boot + uptimeSeconds := time.Duration(sysinfo.Uptime) * time.Second + bootTime := time.Now().Add(-uptimeSeconds) return bootTime, nil } @@ -332,7 +322,7 @@ func getNodeName() (string, error) { return "", fmt.Errorf("failed to get hostname: %w", err) } - nodeName := strings.TrimSpace(host) + nodeName := strings.ToLower(strings.TrimSpace(host)) if nodeName == "" { return "", fmt.Errorf("node name is empty") } From 82b8a0bc087944330dda87e565cf38366170ac53 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Mon, 23 Mar 2026 22:35:06 +0000 Subject: [PATCH 11/15] npd --- pkg/status/collector.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/status/collector.go b/pkg/status/collector.go index 79740a7b..4cd7b192 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -311,8 +311,8 @@ func getBootTime() (time.Time, error) { // Calculate boot time: current time - uptime // Sysinfo.Uptime is in seconds since boot - uptimeSeconds := time.Duration(sysinfo.Uptime) * time.Second - bootTime := time.Now().Add(-uptimeSeconds) + uptime := time.Duration(sysinfo.Uptime) * time.Second + bootTime := time.Now().Add(-uptime) return bootTime, nil } From 7cc494bc42a5626ebfc1d7ca84393781ae915220 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Mon, 23 Mar 2026 15:47:27 -0700 Subject: [PATCH 12/15] Update pkg/drift/remediation.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pkg/drift/remediation.go | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index e2956b9b..6b17f2c8 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -314,12 +314,28 @@ func runRebootRemediation( // Use systemctl reboot for a clean shutdown. // This will gracefully stop services and sync filesystems before rebooting. - cmd := exec.CommandContext(ctx, "systemctl", "reboot") - if err := cmd.Start(); err != nil { - return fmt.Errorf("failed to initiate reboot: %w", err) + // To avoid silently ignoring immediate failures (e.g., DBus unavailable), run the + // command and check its exit status, using a short timeout if no deadline is set. + rebootCtx := ctx + if _, hasDeadline := ctx.Deadline(); !hasDeadline { + var cancel context.CancelFunc + rebootCtx, cancel = context.WithTimeout(ctx, 30*time.Second) + defer cancel() } - // Don't wait for the command to complete; the system will be shutting down. + cmd := exec.CommandContext(rebootCtx, "systemctl", "reboot") + output, err := cmd.CombinedOutput() + if err != nil { + // If the context was canceled or timed out, surface that information explicitly. + if errors.Is(rebootCtx.Err(), context.DeadlineExceeded) { + logger.WithError(err).WithField("output", string(output)). + Error("systemctl reboot timed out") + return fmt.Errorf("systemctl reboot timed out: %w", err) + } + logger.WithError(err).WithField("output", string(output)). + Error("systemctl reboot failed") + return fmt.Errorf("systemctl reboot failed: %w", err) + } return nil } From ce20060e1b791e4be5b903927a39a8cbe9687706 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Mon, 23 Mar 2026 15:48:09 -0700 Subject: [PATCH 13/15] Update pkg/drift/remediation.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pkg/drift/remediation.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index 6b17f2c8..7bf63e9a 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -162,7 +162,7 @@ func detectAndRemediate( if err := runRebootRemediation(ctx, logger); err != nil { return fmt.Errorf("reboot remediation failed: %w", err) } - logger.Info("Node reboot initiated successfully") + logger.Info("Reboot remediation completed without error") return detectErr default: From 3c049252311527cab2de8fe948a91c0baa2121ed Mon Sep 17 00:00:00 2001 From: Runzhen Date: Mon, 23 Mar 2026 15:48:18 -0700 Subject: [PATCH 14/15] Update pkg/drift/remediation.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pkg/drift/remediation.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index 7bf63e9a..90be5a64 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -300,7 +300,7 @@ func runRebootRemediation( // Not running under systemd is an expected scenario (e.g., dev/test); treat as a no-op, not an error. return nil } - logger.WithError(err).Warn("Failed to check systemd service status; skipping reboot") + logger.WithError(err).Warn("Failed to check systemd service status; aborting reboot remediation") return fmt.Errorf("failed to check systemd service status: %w", err) } From 89e55e8125ca39fd77ddbf0a344139250785cdc3 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Wed, 25 Mar 2026 00:16:32 +0000 Subject: [PATCH 15/15] npd --- pkg/drift/node_reboot.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/drift/node_reboot.go b/pkg/drift/node_reboot.go index 17bae9c2..9d6874c9 100644 --- a/pkg/drift/node_reboot.go +++ b/pkg/drift/node_reboot.go @@ -26,6 +26,9 @@ func (d *RebootDetector) Detect( _ *spec.ManagedClusterSpec, statusSnap *status.NodeStatus, ) ([]Finding, error) { + if ctx.Err() != nil { + return nil, ctx.Err() + } if statusSnap == nil { return nil, nil