From f4c60133a5d6ec2ba7845227df923d9b3f9dc282 Mon Sep 17 00:00:00 2001
From: kpurdon <kylepurdon@gmail.com>
Date: Fri, 24 Apr 2026 09:29:26 -0600
Subject: [PATCH 1/2] fix(shim): use per-container id for TaskOOM routing
 (fixes #12838)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On aarch64 (and any architecture where the containerd shim race
between TaskOOM and TaskExit routinely resolves OOM-last), OOM-killed
pods run by gVisor report kubelet reason=Error instead of
reason=OOMKilled.

Root cause: the shim binary is spawned by pkg/shim/v1/manager.go's
newCommand() without the `-id` flag. containerd's shim framework then
defaults `id` to "", which threads through to runscService.id = "".
Every TaskOOM{ContainerID: s.id} therefore carries an empty string.

containerd's CRI event handler routes TaskOOM by ContainerID through
containerStore.Get(e.ContainerID). With an empty id, the truncindex
lookup returns ErrEmptyPrefix (which is not a NotFound), so the CRI
event handler wraps and returns the error. The container's
Status.Reason is never set, so kubelet falls back to reason:Error.

TaskExit is unaffected because containerd routes TaskExit by e.ID
(init process id), not e.ContainerID.

The prior fix in #12843 added a synchronous cgroup check at init
exit, which does close the async race — but that path also publishes
TaskOOM{ContainerID: s.id}, so on aarch64 the empty-id bug continues
to produce reason=Error after #12843 merged. Don't rely on s.id.

This change:

* CreateWithFSRestore: register the cgroup with rfs.Create.ID (the
  per-container id from the Create request) instead of s.id. The id
  stored in the watcher's cgroups and lastOOM maps is now non-empty
  and correctly keyed, so both async (EventChan -> run) and sync
  (isOOM at exit) paths publish TaskOOM with a real ContainerID.

* checkProcesses: find the container whose process matched the exit
  event, use that container's id for isOOM, and use it for both the
  TaskOOM and TaskExit ContainerID fields.

No changes to oom_v2.go: with add() keyed by rfs.Create.ID, the
existing async publish path already emits TaskOOM with the correct
id via i.id.

Validated end-to-end on AL2023 aarch64 (kernel 6.12, containerd
2.2.1, kubelet 1.34) with a forced-OOM pod: before this change pods
show reason:Error and journald reports "can't find container for
TaskOOM event: prefix can't be empty"; after, pods show
reason:OOMKilled and the TaskOOM event carries the real container id.

Assisted-by: Claude Code
---
 pkg/shim/v1/runsc/service.go | 88 ++++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 29 deletions(-)

diff --git a/pkg/shim/v1/runsc/service.go b/pkg/shim/v1/runsc/service.go
index 05b2d5481e..5dcc9a3844 100644
--- a/pkg/shim/v1/runsc/service.go
+++ b/pkg/shim/v1/runsc/service.go
@@ -213,7 +213,13 @@ func (s *runscService) CreateWithFSRestore(ctx context.Context, rfs *extension.C
 		if err != nil {
 			return nil, fmt.Errorf("loading cgroup for %d: %w", pid, err)
 		}
-		if err := s.oomPoller.add(s.id, cg); err != nil {
+		// Register the sandbox cgroup with the OOM poller using this
+		// container's id (typically the pause/sandbox id on first Create).
+		// Do not use s.id: it is empty because the shim binary is spawned
+		// without `-id` (see manager.go:newCommand), which would cause
+		// every TaskOOM event to be published with an empty ContainerID
+		// and silently dropped by containerd (ErrEmptyPrefix).
+		if err := s.oomPoller.add(rfs.Create.ID, cg); err != nil {
 			return nil, fmt.Errorf("add cg to OOM monitor: %w", err)
 		}
 	}
@@ -523,37 +529,61 @@ func (s *runscService) processExits(ctx context.Context) {
 }
 
 func (s *runscService) checkProcesses(ctx context.Context, e proc.Exit) {
-	for _, p := range s.allProcessesForAllContainers() {
-		if p.ID() == e.ID {
-			ip, isInit := p.(*proc.Init)
-			if isInit {
-				// Ensure all children are killed.
-				log.L.Debugf("Container init process exited, killing all container processes")
-				ip.KillAll(ctx)
-			}
-			p.SetExited(e.Status)
-			// On init process exit, synchronously check the cgroup for OOM
-			// kills before publishing the exit event. The async OOM
-			// notification via EventChan (cgroups v2) can lose the race
-			// against the container exit on some architectures (notably
-			// aarch64), causing kubelet to report reason=Error instead of
-			// reason=OOMKilled. Only check on init exit — exec processes
-			// share the sandbox cgroup and would produce spurious events.
-			if isInit && s.oomPoller.isOOM(s.id) {
-				s.send(&events.TaskOOM{
-					ContainerID: s.id,
-				})
+	// Find the container whose process exited.
+	s.mu.Lock()
+	var (
+		exitingCID string
+		exitingP   process.Process
+	)
+	for cid, c := range s.containers {
+		for _, p := range c.All() {
+			if p.ID() == e.ID {
+				exitingCID = cid
+				exitingP = p
+				break
 			}
-			s.send(&events.TaskExit{
-				ContainerID: s.id,
-				ID:          p.ID(),
-				Pid:         uint32(p.Pid()),
-				ExitStatus:  uint32(e.Status),
-				ExitedAt:    p.ExitedAt(),
-			})
-			return
+		}
+		if exitingP != nil {
+			break
 		}
 	}
+	s.mu.Unlock()
+	if exitingP == nil {
+		return
+	}
+	p := exitingP
+
+	ip, isInit := p.(*proc.Init)
+	if isInit {
+		// Ensure all children are killed.
+		log.L.Debugf("Container init process exited, killing all container processes")
+		ip.KillAll(ctx)
+	}
+	p.SetExited(e.Status)
+	// On init process exit, synchronously check the cgroup for OOM kills
+	// before publishing the exit event. The async OOM notification via
+	// EventChan (cgroups v2) can lose the race against the container exit
+	// on some architectures (notably aarch64), causing kubelet to report
+	// reason=Error instead of reason=OOMKilled. Only check on init exit —
+	// exec processes share the sandbox cgroup and would produce spurious
+	// events.
+	//
+	// Use the per-container id for TaskOOM routing — not s.id, which is
+	// empty in the normal (non-grouping) shim spawn path because
+	// pkg/shim/v1/manager.go:newCommand() does not pass `-id`. With
+	// ContainerID="", containerd's CRI would drop the TaskOOM at
+	// containerStore.Get("") → ErrEmptyPrefix, and kubelet would report
+	// reason:Error instead of reason:OOMKilled.
+	if isInit && s.oomPoller.isOOM(exitingCID) {
+		s.send(&events.TaskOOM{ContainerID: exitingCID})
+	}
+	s.send(&events.TaskExit{
+		ContainerID: exitingCID,
+		ID:          p.ID(),
+		Pid:         uint32(p.Pid()),
+		ExitStatus:  uint32(e.Status),
+		ExitedAt:    p.ExitedAt(),
+	})
 }
 
 func (s *runscService) send(event any) {

From ef14594e12d2fe99cb0f1c3f82b5ad3ff09cda9c Mon Sep 17 00:00:00 2001
From: kpurdon <kylepurdon@gmail.com>
Date: Mon, 4 May 2026 08:47:54 -0600
Subject: [PATCH 2/2] fix(shim): rename exitingCID to containerID for
 readability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review feedback on #12996 — use the more idiomatic name.
---
 pkg/shim/v1/runsc/service.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pkg/shim/v1/runsc/service.go b/pkg/shim/v1/runsc/service.go
index 5dcc9a3844..80c1127b7b 100644
--- a/pkg/shim/v1/runsc/service.go
+++ b/pkg/shim/v1/runsc/service.go
@@ -532,13 +532,13 @@ func (s *runscService) checkProcesses(ctx context.Context, e proc.Exit) {
 	// Find the container whose process exited.
 	s.mu.Lock()
 	var (
-		exitingCID string
-		exitingP   process.Process
+		containerID string
+		exitingP    process.Process
 	)
 	for cid, c := range s.containers {
 		for _, p := range c.All() {
 			if p.ID() == e.ID {
-				exitingCID = cid
+				containerID = cid
 				exitingP = p
 				break
 			}
@@ -574,11 +574,11 @@ func (s *runscService) checkProcesses(ctx context.Context, e proc.Exit) {
 	// ContainerID="", containerd's CRI would drop the TaskOOM at
 	// containerStore.Get("") → ErrEmptyPrefix, and kubelet would report
 	// reason:Error instead of reason:OOMKilled.
-	if isInit && s.oomPoller.isOOM(exitingCID) {
-		s.send(&events.TaskOOM{ContainerID: exitingCID})
+	if isInit && s.oomPoller.isOOM(containerID) {
+		s.send(&events.TaskOOM{ContainerID: containerID})
 	}
 	s.send(&events.TaskExit{
-		ContainerID: exitingCID,
+		ContainerID: containerID,
 		ID:          p.ID(),
 		Pid:         uint32(p.Pid()),
 		ExitStatus:  uint32(e.Status),