From f4c60133a5d6ec2ba7845227df923d9b3f9dc282 Mon Sep 17 00:00:00 2001 From: kpurdon Date: Fri, 24 Apr 2026 09:29:26 -0600 Subject: [PATCH 1/2] fix(shim): use per-container id for TaskOOM routing (fixes #12838) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On aarch64 (and any architecture where the containerd shim race between TaskOOM and TaskExit routinely resolves OOM-last), OOM-killed pods run by gVisor report kubelet reason=Error instead of reason=OOMKilled. Root cause: the shim binary is spawned by pkg/shim/v1/manager.go's newCommand() without the `-id` flag. containerd's shim framework then defaults `id` to "", which threads through to runscService.id = "". Every TaskOOM{ContainerID: s.id} therefore carries an empty string. containerd's CRI event handler routes TaskOOM by ContainerID through containerStore.Get(e.ContainerID). With an empty id, the truncindex lookup returns ErrEmptyPrefix (which is not a NotFound), so the CRI event handler wraps and returns the error. The container's Status.Reason is never set, so kubelet falls back to reason:Error. TaskExit is unaffected because containerd routes TaskExit by e.ID (init process id), not e.ContainerID. The prior fix in #12843 added a synchronous cgroup check at init exit, which does close the async race — but that path also publishes TaskOOM{ContainerID: s.id}, so on aarch64 the empty-id bug continues to produce reason=Error after #12843 merged. Don't rely on s.id. This change: * CreateWithFSRestore: register the cgroup with rfs.Create.ID (the per-container id from the Create request) instead of s.id. The id stored in the watcher's cgroups and lastOOM maps is now non-empty and correctly keyed, so both async (EventChan -> run) and sync (isOOM at exit) paths publish TaskOOM with a real ContainerID. * checkProcesses: find the container whose process matched the exit event, use that container's id for isOOM, and use it for both the TaskOOM and TaskExit ContainerID fields. No changes to oom_v2.go: with add() keyed by rfs.Create.ID, the existing async publish path already emits TaskOOM with the correct id via i.id. Validated end-to-end on AL2023 aarch64 (kernel 6.12, containerd 2.2.1, kubelet 1.34) with a forced-OOM pod: before this change pods show reason:Error and journald reports "can't find container for TaskOOM event: prefix can't be empty"; after, pods show reason:OOMKilled and the TaskOOM event carries the real container id. Assisted-by: Claude Code --- pkg/shim/v1/runsc/service.go | 88 ++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 29 deletions(-) diff --git a/pkg/shim/v1/runsc/service.go b/pkg/shim/v1/runsc/service.go index 05b2d5481e..5dcc9a3844 100644 --- a/pkg/shim/v1/runsc/service.go +++ b/pkg/shim/v1/runsc/service.go @@ -213,7 +213,13 @@ func (s *runscService) CreateWithFSRestore(ctx context.Context, rfs *extension.C if err != nil { return nil, fmt.Errorf("loading cgroup for %d: %w", pid, err) } - if err := s.oomPoller.add(s.id, cg); err != nil { + // Register the sandbox cgroup with the OOM poller using this + // container's id (typically the pause/sandbox id on first Create). + // Do not use s.id: it is empty because the shim binary is spawned + // without `-id` (see manager.go:newCommand), which would cause + // every TaskOOM event to be published with an empty ContainerID + // and silently dropped by containerd (ErrEmptyPrefix). + if err := s.oomPoller.add(rfs.Create.ID, cg); err != nil { return nil, fmt.Errorf("add cg to OOM monitor: %w", err) } } @@ -523,37 +529,61 @@ func (s *runscService) processExits(ctx context.Context) { } func (s *runscService) checkProcesses(ctx context.Context, e proc.Exit) { - for _, p := range s.allProcessesForAllContainers() { - if p.ID() == e.ID { - ip, isInit := p.(*proc.Init) - if isInit { - // Ensure all children are killed. - log.L.Debugf("Container init process exited, killing all container processes") - ip.KillAll(ctx) - } - p.SetExited(e.Status) - // On init process exit, synchronously check the cgroup for OOM - // kills before publishing the exit event. The async OOM - // notification via EventChan (cgroups v2) can lose the race - // against the container exit on some architectures (notably - // aarch64), causing kubelet to report reason=Error instead of - // reason=OOMKilled. Only check on init exit — exec processes - // share the sandbox cgroup and would produce spurious events. - if isInit && s.oomPoller.isOOM(s.id) { - s.send(&events.TaskOOM{ - ContainerID: s.id, - }) + // Find the container whose process exited. + s.mu.Lock() + var ( + exitingCID string + exitingP process.Process + ) + for cid, c := range s.containers { + for _, p := range c.All() { + if p.ID() == e.ID { + exitingCID = cid + exitingP = p + break } - s.send(&events.TaskExit{ - ContainerID: s.id, - ID: p.ID(), - Pid: uint32(p.Pid()), - ExitStatus: uint32(e.Status), - ExitedAt: p.ExitedAt(), - }) - return + } + if exitingP != nil { + break } } + s.mu.Unlock() + if exitingP == nil { + return + } + p := exitingP + + ip, isInit := p.(*proc.Init) + if isInit { + // Ensure all children are killed. + log.L.Debugf("Container init process exited, killing all container processes") + ip.KillAll(ctx) + } + p.SetExited(e.Status) + // On init process exit, synchronously check the cgroup for OOM kills + // before publishing the exit event. The async OOM notification via + // EventChan (cgroups v2) can lose the race against the container exit + // on some architectures (notably aarch64), causing kubelet to report + // reason=Error instead of reason=OOMKilled. Only check on init exit — + // exec processes share the sandbox cgroup and would produce spurious + // events. + // + // Use the per-container id for TaskOOM routing — not s.id, which is + // empty in the normal (non-grouping) shim spawn path because + // pkg/shim/v1/manager.go:newCommand() does not pass `-id`. With + // ContainerID="", containerd's CRI would drop the TaskOOM at + // containerStore.Get("") → ErrEmptyPrefix, and kubelet would report + // reason:Error instead of reason:OOMKilled. + if isInit && s.oomPoller.isOOM(exitingCID) { + s.send(&events.TaskOOM{ContainerID: exitingCID}) + } + s.send(&events.TaskExit{ + ContainerID: exitingCID, + ID: p.ID(), + Pid: uint32(p.Pid()), + ExitStatus: uint32(e.Status), + ExitedAt: p.ExitedAt(), + }) } func (s *runscService) send(event any) { From ef14594e12d2fe99cb0f1c3f82b5ad3ff09cda9c Mon Sep 17 00:00:00 2001 From: kpurdon Date: Mon, 4 May 2026 08:47:54 -0600 Subject: [PATCH 2/2] fix(shim): rename exitingCID to containerID for readability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback on #12996 — use the more idiomatic name. --- pkg/shim/v1/runsc/service.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/shim/v1/runsc/service.go b/pkg/shim/v1/runsc/service.go index 5dcc9a3844..80c1127b7b 100644 --- a/pkg/shim/v1/runsc/service.go +++ b/pkg/shim/v1/runsc/service.go @@ -532,13 +532,13 @@ func (s *runscService) checkProcesses(ctx context.Context, e proc.Exit) { // Find the container whose process exited. s.mu.Lock() var ( - exitingCID string - exitingP process.Process + containerID string + exitingP process.Process ) for cid, c := range s.containers { for _, p := range c.All() { if p.ID() == e.ID { - exitingCID = cid + containerID = cid exitingP = p break } @@ -574,11 +574,11 @@ func (s *runscService) checkProcesses(ctx context.Context, e proc.Exit) { // ContainerID="", containerd's CRI would drop the TaskOOM at // containerStore.Get("") → ErrEmptyPrefix, and kubelet would report // reason:Error instead of reason:OOMKilled. - if isInit && s.oomPoller.isOOM(exitingCID) { - s.send(&events.TaskOOM{ContainerID: exitingCID}) + if isInit && s.oomPoller.isOOM(containerID) { + s.send(&events.TaskOOM{ContainerID: containerID}) } s.send(&events.TaskExit{ - ContainerID: exitingCID, + ContainerID: containerID, ID: p.ID(), Pid: uint32(p.Pid()), ExitStatus: uint32(e.Status),