From 7b1994755ac468f53a9d406ca08c7f2cf09ea800 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 6 May 2026 18:10:57 -0700 Subject: [PATCH] Refactor restorer.restore to extract post-restore work - Extracted the inline goroutine function from the `restore` method into a new method `restorer.postRestore`. - Passed `*timing.Timer` to `postRestore` and moved `timer.Log()` to the end of `postRestore` to ensure asynchronous logging after all background work (including page loading) completes, avoiding spawning an extra goroutine. PiperOrigin-RevId: 911655209 --- runsc/boot/restore.go | 146 ++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 70 deletions(-) diff --git a/runsc/boot/restore.go b/runsc/boot/restore.go index aff2cece09..85a839acee 100644 --- a/runsc/boot/restore.go +++ b/runsc/boot/restore.go @@ -370,86 +370,92 @@ func (r *restorer) restore(l *Loader) error { r.stateFile.Close() + // Transfer ownership of the `timer` to a new goroutine. + // This is because `timer.Log` blocks until all timed tasks are finished, + // but some restore tasks may still run in the background, and we don't + // want to block this function until they finish. postRestoreThread := r.timer.Fork("postRestore") - go func() { - defer postRestoreThread.End() - postRestoreThread.Reached("scheduled") - if err := control.PostRestore(l.k, postRestoreThread); err != nil { - log.Warningf("Killing the sandbox after post restore work failed: %v", err) - l.k.Kill(linux.WaitStatusTerminationSignal(linux.SIGKILL)) - return - } - postRestoreThread.Reached("post restore done") + go r.postRestore(l.k, postRestoreThread, r.timer) + r.timer = nil - // Now that post restore work succeeded, increment the checkpoint gen - // manually. The count was saved while the previous kernel was being saved - // and checkpoint success was unknown at that time. Now we know the had - // checkpoint succeeded. Allow the application to proceed while pages may - // keep loading in the background. - l.k.IncCheckpointGenOnRestore() + return nil +} - // Wait for page loading to complete if happening in the background. - if r.asyncMFLoader != nil { - if err := r.asyncMFLoader.Wait(); err != nil { - log.Warningf("Killing the sandbox after MemoryFile page loading failed: %v", err) - l.k.Kill(linux.WaitStatusTerminationSignal(linux.SIGKILL)) - return - } - } +func (r *restorer) postRestore(k *kernel.Kernel, postRestoreThread *timing.Timeline, timer *timing.Timer) { + defer timer.Log() + defer postRestoreThread.End() - // Calculate the CPU time saved for restore. - t, err := state.CPUTime() - var s Savings - if err != nil { - log.Warningf("Failed to get CPU time usage for restore, err: %v", err) - } else { - log.Infof("Restore CPU usage: %s", t.String()) - savedTimeStr, ok := r.metadata[state.GvisorCPUUsageKey] - if !ok { - log.Warningf("Failed to retrieve CPU time usage from the metadata") - } else { - savedTime, err := time2.ParseDuration(savedTimeStr) - if err != nil { - log.Warningf("CPU time usage in metadata %v is invalid, err: %v", savedTimeStr, err) - } else { - s.CPUTimeSaved = savedTime - t - log.Infof("CPU time saved with restore: %v ms", s.CPUTimeSaved.Milliseconds()) - } - } - } + postRestoreThread.Reached("scheduled") + if err := control.PostRestore(k, postRestoreThread); err != nil { + log.Warningf("Killing the sandbox after post restore work failed: %v", err) + k.Kill(linux.WaitStatusTerminationSignal(linux.SIGKILL)) + return + } + postRestoreThread.Reached("post restore done") - // Calculate the walltime saved for restore. - startTime := starttime.Get() - curTime := time2.Now() - wt := curTime.Sub(startTime) - log.Infof("Restore wall time: %s", wt.String()) - savedWtStr, ok := r.metadata[state.GvisorWallTimeKey] - if !ok { - log.Warningf("Failed to retrieve walltime from the metadata") - } else { - savedWt, err := time2.ParseDuration(savedWtStr) - if err != nil { - log.Warningf("Walltime in metadata %v is invalid, err: %v", savedWtStr, err) - } else { - s.WallTimeSaved = savedWt - wt - log.Infof("Walltime saved with restore: %v ms", s.WallTimeSaved.Milliseconds()) - } + // Now that post restore work succeeded, increment the checkpoint gen + // manually. The count was saved while the previous kernel was being saved + // and checkpoint success was unknown at that time. Now we know the had + // checkpoint succeeded. Allow the application to proceed while pages may + // keep loading in the background. + k.IncCheckpointGenOnRestore() + + // Wait for page loading to complete if happening in the background. + if r.asyncMFLoader != nil { + if err := r.asyncMFLoader.Wait(); err != nil { + log.Warningf("Killing the sandbox after MemoryFile page loading failed: %v", err) + k.Kill(linux.WaitStatusTerminationSignal(linux.SIGKILL)) + return } + } - r.cm.onRestoreDone(s) - postRestoreThread.Reached("kernel notified") + var s Savings + if err := r.calculateCPUSavings(&s); err != nil { + log.Warningf("Failed to calculate CPU savings: %v", err) + } + if err := r.calculateWallTimeSavings(&s); err != nil { + log.Warningf("Failed to calculate walltime savings: %v", err) + } - log.Infof("Restore successful") - }() + r.cm.onRestoreDone(s) + postRestoreThread.Reached("kernel notified") + log.Infof("Restore successful") +} - // Transfer ownership of the `timer` to a new goroutine. - // This is because `timer.Log` blocks until all timed tasks are finished, - // but some restore tasks may still run in the background, and we don't - // want to block this function until they finish. - timer := r.timer - r.timer = nil - go timer.Log() +// Calculate the CPU time saved for restore. +func (r *restorer) calculateCPUSavings(s *Savings) error { + t, err := state.CPUTime() + if err != nil { + return fmt.Errorf("failed to get CPU time usage for restore, err: %w", err) + } + savedTimeStr, ok := r.metadata[state.GvisorCPUUsageKey] + if !ok { + return fmt.Errorf("failed to retrieve CPU time usage from the metadata") + } + savedTime, err := time2.ParseDuration(savedTimeStr) + if err != nil { + return fmt.Errorf("cpu time usage in metadata %v is invalid, err: %w", savedTimeStr, err) + } + + s.CPUTimeSaved = savedTime - t + log.Infof("CPU time saved with restore: %v ms, restore CPU time: %v ms", s.CPUTimeSaved.Milliseconds(), t.Milliseconds()) + return nil +} + +// Calculate the walltime saved for restore. +func (r *restorer) calculateWallTimeSavings(s *Savings) error { + savedWtStr, ok := r.metadata[state.GvisorWallTimeKey] + if !ok { + return fmt.Errorf("failed to retrieve walltime from the metadata") + } + savedWt, err := time2.ParseDuration(savedWtStr) + if err != nil { + return fmt.Errorf("walltime in metadata %v is invalid, err: %w", savedWtStr, err) + } + wt := time2.Since(starttime.Get()) + s.WallTimeSaved = savedWt - wt + log.Infof("Walltime saved with restore: %v ms, restore walltime: %v ms", s.WallTimeSaved.Milliseconds(), wt.Milliseconds()) return nil }