From d9f2a24a5ba6164404b943dee34501821367c271 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 2 Oct 2023 16:39:50 -0700
Subject: [PATCH 1/8] libct: replace runType with hasInit

The semantics of runType is slightly complicated, and the only place
where we need to distinguish between Created and Running is
refreshState.

Replace runType with simpler hasInit, simplifying its users (except the
refreshState, which now figures out on its own whether the container is
Created or Running).

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/container_linux.go | 31 ++++++++++++++-----------------
 libcontainer/state_linux.go     | 17 ++++++++---------
 2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 616cd1ede9f..d0faaa47ea7 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -115,7 +115,7 @@ func (c *Container) ignoreCgroupError(err error) error {
 	if err == nil {
 		return nil
 	}
-	if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() {
+	if errors.Is(err, os.ErrNotExist) && !c.hasInit() && !c.cgroupManager.Exists() {
 		return nil
 	}
 	return err
@@ -1006,34 +1006,31 @@ func (c *Container) refreshState() error {
 	if paused {
 		return c.state.transition(&pausedState{c: c})
 	}
-	t := c.runType()
-	switch t {
-	case Created:
+	if !c.hasInit() {
+		return c.state.transition(&stoppedState{c: c})
+	}
+	// The presence of exec fifo helps to distinguish between
+	// the created and the running states.
+	if _, err := os.Stat(filepath.Join(c.stateDir, execFifoFilename)); err == nil {
 		return c.state.transition(&createdState{c: c})
-	case Running:
-		return c.state.transition(&runningState{c: c})
 	}
-	return c.state.transition(&stoppedState{c: c})
+	return c.state.transition(&runningState{c: c})
 }
 
-func (c *Container) runType() Status {
+// hasInit tells whether the container init process exists.
+func (c *Container) hasInit() bool {
 	if c.initProcess == nil {
-		return Stopped
+		return false
 	}
 	pid := c.initProcess.pid()
 	stat, err := system.Stat(pid)
 	if err != nil {
-		return Stopped
+		return false
 	}
 	if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
-		return Stopped
-	}
-	// We'll create exec fifo and blocking on it after container is created,
-	// and delete it after start container.
-	if _, err := os.Stat(filepath.Join(c.stateDir, execFifoFilename)); err == nil {
-		return Created
+		return false
 	}
-	return Running
+	return true
 }
 
 func (c *Container) isPaused() (bool, error) {
diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go
index c6967e5f4f5..0f629922b56 100644
--- a/libcontainer/state_linux.go
+++ b/libcontainer/state_linux.go
@@ -103,7 +103,7 @@ func (r *runningState) status() Status {
 func (r *runningState) transition(s containerState) error {
 	switch s.(type) {
 	case *stoppedState:
-		if r.c.runType() == Running {
+		if r.c.hasInit() {
 			return ErrRunning
 		}
 		r.c.state = s
@@ -118,7 +118,7 @@ func (r *runningState) transition(s containerState) error {
 }
 
 func (r *runningState) destroy() error {
-	if r.c.runType() == Running {
+	if r.c.hasInit() {
 		return ErrRunning
 	}
 	return destroy(r.c)
@@ -170,14 +170,13 @@ func (p *pausedState) transition(s containerState) error {
 }
 
 func (p *pausedState) destroy() error {
-	t := p.c.runType()
-	if t != Running && t != Created {
-		if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
-			return err
-		}
-		return destroy(p.c)
+	if p.c.hasInit() {
+		return ErrPaused
+	}
+	if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
+		return err
 	}
-	return ErrPaused
+	return destroy(p.c)
 }
 
 // restoredState is the same as the running state but also has associated checkpoint

From 542cce01220a0896b56b6bbf40401acde3b368e4 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 30 Oct 2023 18:00:11 -0700
Subject: [PATCH 2/8] libct: Signal: slight refactor

Let's use c.hasInit and c.isPaused where needed instead of
c.curentStatus for simplicity.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/container_linux.go | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index d0faaa47ea7..d40eb2614fb 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -364,14 +364,8 @@ func (c *Container) start(process *Process) (retErr error) {
 func (c *Container) Signal(s os.Signal) error {
 	c.m.Lock()
 	defer c.m.Unlock()
-	status, err := c.currentStatus()
-	if err != nil {
-		return err
-	}
 	// To avoid a PID reuse attack, don't kill non-running container.
-	switch status {
-	case Running, Created, Paused:
-	default:
+	if !c.hasInit() {
 		return ErrNotRunning
 	}
 
@@ -382,6 +376,7 @@ func (c *Container) Signal(s os.Signal) error {
 	//
 	// OTOH, if PID namespace is shared, we should kill all pids to avoid
 	// leftover processes.
+	var err error
 	if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) {
 		err = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
 	} else {
@@ -390,11 +385,13 @@ func (c *Container) Signal(s os.Signal) error {
 	if err != nil {
 		return fmt.Errorf("unable to signal init: %w", err)
 	}
-	if status == Paused && s == unix.SIGKILL {
+	if s == unix.SIGKILL {
 		// For cgroup v1, killing a process in a frozen cgroup
 		// does nothing until it's thawed. Only thaw the cgroup
 		// for SIGKILL.
-		_ = c.cgroupManager.Freeze(configs.Thawed)
+		if paused, _ := c.isPaused(); paused {
+			_ = c.cgroupManager.Freeze(configs.Thawed)
+		}
 	}
 	return nil
 }

From dcf1b731f5aaa810ba67978685d712dc1ac0b248 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 30 Oct 2023 18:09:07 -0700
Subject: [PATCH 3/8] runc kill: fix sending KILL to non-pidns container

Commit f8ad20f made it impossible to kill leftover processes in a
stopped container that does not have its own PID namespace. In other
words, if a container init is gone, it is no longer possible to use
`runc kill` to kill the leftover processes.

Fix this by moving the check if container init exists to after the
special case of handling the container without own PID namespace.

While at it, fix the minor issue introduced by commit 9583b3d:
if signalAllProcesses is used, there is no need to thaw the
container (as freeze/thaw is either done in signalAllProcesses already,
or not needed at all).

Also, make signalAllProcesses return an error early if the container
cgroup does not exist (as it relies on it to do its job). This way, the
error message returned is more generic and easier to understand
("container not running" instead of "can't open file").

Finally, add a test case.

Fixes: f8ad20f
Fixes: 9583b3d
Co-authored-by: lifubang <lifubang@acmcoder.com>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/container_linux.go | 21 ++++++-----
 libcontainer/init_linux.go      |  3 ++
 tests/integration/kill.bats     | 66 +++++++++++++++++++++++++++------
 3 files changed, 68 insertions(+), 22 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index d40eb2614fb..515755276ad 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -364,10 +364,6 @@ func (c *Container) start(process *Process) (retErr error) {
 func (c *Container) Signal(s os.Signal) error {
 	c.m.Lock()
 	defer c.m.Unlock()
-	// To avoid a PID reuse attack, don't kill non-running container.
-	if !c.hasInit() {
-		return ErrNotRunning
-	}
 
 	// When a container has its own PID namespace, inside it the init PID
 	// is 1, and thus it is handled specially by the kernel. In particular,
@@ -375,14 +371,19 @@ func (c *Container) Signal(s os.Signal) error {
 	// all other processes in that PID namespace (see pid_namespaces(7)).
 	//
 	// OTOH, if PID namespace is shared, we should kill all pids to avoid
-	// leftover processes.
-	var err error
+	// leftover processes. Handle this special case here.
 	if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) {
-		err = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
-	} else {
-		err = c.initProcess.signal(s)
+		if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
+			return fmt.Errorf("unable to kill all processes: %w", err)
+		}
+		return nil
 	}
-	if err != nil {
+
+	// To avoid a PID reuse attack, don't kill non-running container.
+	if !c.hasInit() {
+		return ErrNotRunning
+	}
+	if err := c.initProcess.signal(s); err != nil {
 		return fmt.Errorf("unable to signal init: %w", err)
 	}
 	if s == unix.SIGKILL {
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index ec0f002a5fc..6ab0698eed8 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -673,6 +673,9 @@ func setupPersonality(config *configs.Config) error {
 // signalAllProcesses freezes then iterates over all the processes inside the
 // manager's cgroups sending the signal s to them.
 func signalAllProcesses(m cgroups.Manager, s unix.Signal) error {
+	if !m.Exists() {
+		return ErrNotRunning
+	}
 	// Use cgroup.kill, if available.
 	if s == unix.SIGKILL {
 		if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid.
diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats
index 8f2f4a7b4e8..ad116df0d6f 100644
--- a/tests/integration/kill.bats
+++ b/tests/integration/kill.bats
@@ -10,6 +10,7 @@ function teardown() {
 	teardown_bundle
 }
 
+# shellcheck disable=SC2030
 @test "kill detached busybox" {
 	# run busybox detached
 	runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox
@@ -35,16 +36,15 @@ function teardown() {
 	[ "$status" -eq 0 ]
 }
 
-# This is roughly the same as TestPIDHostInitProcessWait in libcontainer/integration.
-@test "kill KILL [host pidns]" {
-	# kill -a currently requires cgroup freezer.
+test_host_pidns_kill() {
 	requires cgroups_freezer
 
 	update_config '	  .linux.namespaces -= [{"type": "pid"}]'
 	set_cgroups_path
 	if [ $EUID -ne 0 ]; then
-		# kill --all requires working cgroups.
 		requires rootless_cgroup
+		# Can't mount real /proc when rootless + no pidns,
+		# so change it to a bind-mounted one from the host.
 		update_config '	  .mounts |= map((select(.type == "proc")
 					| .type = "none"
 					| .source = "/proc"
@@ -53,17 +53,28 @@ function teardown() {
 	fi
 
 	runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox
+	# shellcheck disable=SC2031
 	[ "$status" -eq 0 ]
+	cgpath=$(get_cgroup_path "pids")
+	init_pid=$(cat "$cgpath"/cgroup.procs)
 
 	# Start a few more processes.
 	for _ in 1 2 3 4 5; do
 		__runc exec -d test_busybox sleep 1h
-		[ "$status" -eq 0 ]
 	done
 
+	if [ -v KILL_INIT ]; then
+		# Now kill the container's init process. Since the container do
+		# not have own PID ns, its init is no special and the container
+		# will still be up and running (except for rootless container
+		# AND systemd cgroup driver AND systemd > v245, when systemd
+		# kills the container; see "kill KILL [host pidns + init gone]"
+		# below).
+		kill -9 "$init_pid"
+	fi
+
 	# Get the list of all container processes.
-	path=$(get_cgroup_path "pids")
-	pids=$(cat "$path"/cgroup.procs)
+	pids=$(cat "$cgpath"/cgroup.procs)
 	echo "pids: $pids"
 	# Sanity check -- make sure all processes exist.
 	for p in $pids; do
@@ -71,14 +82,45 @@ function teardown() {
 	done
 
 	runc kill test_busybox KILL
+	# shellcheck disable=SC2031
 	[ "$status" -eq 0 ]
 	wait_for_container 10 1 test_busybox stopped
 
 	# Make sure all processes are gone.
-	pids=$(cat "$path"/cgroup.procs) || true # OK if cgroup is gone
+	pids=$(cat "$cgpath"/cgroup.procs) || true # OK if cgroup is gone
 	echo "pids: $pids"
-	for p in $pids; do
-		run ! kill -0 "$p"
-		[[ "$output" = *"No such process" ]]
-	done
+	[ -z "$pids" ]
+}
+
+# This is roughly the same as TestPIDHostInitProcessWait in libcontainer/integration.
+# The differences are:
+#
+# 1. Here we use separate processes to create and to kill a container, so the
+#    processes inside a container are not children of "runc kill".
+#
+# 2. We hit different codepaths (nonChildProcess.signal rather than initProcess.signal).
+@test "kill KILL [host pidns]" {
+	unset KILL_INIT
+	test_host_pidns_kill
+}
+
+# Same as above plus:
+#
+# 3. Test runc kill on a container whose init process is gone.
+#
+# Issue 4047, case "runc kill".
+@test "kill KILL [host pidns + init gone]" {
+	# Apparently, for rootless test, when using systemd cgroup manager,
+	# newer versions of systemd clean up the container as soon as its init
+	# process is gone. This is all fine and dandy, except it prevents us to
+	# test this case, thus we skip the test.
+	#
+	# It is not entirely clear which systemd version got this feature:
+	# v245 works fine, and v249 does not.
+	if [ $EUID -ne 0 ] && [ -v RUNC_USE_SYSTEMD ] && [ "$(systemd_version)" -gt 245 ]; then
+		skip "rootless+systemd conflicts with systemd > 245"
+	fi
+	KILL_INIT=1
+	test_host_pidns_kill
+	unset KILL_INIT
 }

From 29283bb7db788fcafbc6cf413d06069b2d6b6bff Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Fri, 3 Nov 2023 18:41:53 -0700
Subject: [PATCH 4/8] runc delete -f: fix for no pidns + no init case

Commit f8ad20f moved the kill logic from container destroy to container
kill (which is the right thing to do).

Alas, it broke the use case of doing "runc delete -f" for a container
which does not have its own private PID namespace, when its init process
is gone. In this case, some processes may still be running, and runc
delete -f should kill them (the same way as "runc kill" does).

It does not do that because the container status is "stopped" (as runc
considers the container with no init process as stopped), and so we only
call "destroy" (which was doing the killing before).

The fix is easy: if --force is set, call killContainer no matter what.

Add a test case, similar to the one in the previous commit.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 delete.go                     | 11 ++++--
 tests/integration/delete.bats | 63 +++++++++++++++++++++++++++++++++++
 tests/integration/kill.bats   |  1 +
 3 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/delete.go b/delete.go
index 682101ccad8..e8b92ae5a92 100644
--- a/delete.go
+++ b/delete.go
@@ -66,6 +66,14 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
 			}
 			return err
 		}
+		// When --force is given, we kill all container processes and
+		// then destroy the container. This is done even for a stopped
+		// container, because (in case it does not have its own PID
+		// namespace) there may be some leftover processes in the
+		// container's cgroup.
+		if force {
+			return killContainer(container)
+		}
 		s, err := container.Status()
 		if err != nil {
 			return err
@@ -76,9 +84,6 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
 		case libcontainer.Created:
 			return killContainer(container)
 		default:
-			if force {
-				return killContainer(container)
-			}
 			return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s)
 		}
 
diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats
index e9b80b64d06..8cc53c19e01 100644
--- a/tests/integration/delete.bats
+++ b/tests/integration/delete.bats
@@ -62,6 +62,69 @@ function teardown() {
 	[ "$status" -eq 0 ]
 }
 
+# Issue 4047, case "runc delete -f".
+# See also: "kill KILL [host pidns + init gone]" test in kill.bats.
+@test "runc delete --force [host pidns + init gone]" {
+	requires cgroups_freezer
+
+	update_config '	  .linux.namespaces -= [{"type": "pid"}]'
+	set_cgroups_path
+	if [ $EUID -ne 0 ]; then
+		requires rootless_cgroup
+		# Apparently, for rootless test, when using systemd cgroup manager,
+		# newer versions of systemd clean up the container as soon as its init
+		# process is gone. This is all fine and dandy, except it prevents us to
+		# test this case, thus we skip the test.
+		#
+		# It is not entirely clear which systemd version got this feature:
+		# v245 works fine, and v249 does not.
+		if [ -v RUNC_USE_SYSTEMD ] && [ "$(systemd_version)" -gt 245 ]; then
+			skip "rootless+systemd conflicts with systemd > 245"
+		fi
+		# Can't mount real /proc when rootless + no pidns,
+		# so change it to a bind-mounted one from the host.
+		update_config '	  .mounts |= map((select(.type == "proc")
+					| .type = "none"
+					| .source = "/proc"
+					| .options = ["rbind", "nosuid", "nodev", "noexec"]
+				  ) // .)'
+	fi
+
+	runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox
+	[ "$status" -eq 0 ]
+	cgpath=$(get_cgroup_path "pids")
+	init_pid=$(cat "$cgpath"/cgroup.procs)
+
+	# Start a few more processes.
+	for _ in 1 2 3 4 5; do
+		__runc exec -d test_busybox sleep 1h
+	done
+
+	# Now kill the container's init process. Since the container do
+	# not have own PID ns, its init is no special and the container
+	# will still be up and running.
+	kill -9 "$init_pid"
+
+	# Get the list of all container processes.
+	pids=$(cat "$cgpath"/cgroup.procs)
+	echo "pids: $pids"
+	# Sanity check -- make sure all processes exist.
+	for p in $pids; do
+		kill -0 "$p"
+	done
+
+	runc delete -f test_busybox
+	[ "$status" -eq 0 ]
+
+	runc state test_busybox
+	[ "$status" -ne 0 ] # "Container does not exist"
+
+	# Make sure all processes are gone.
+	pids=$(cat "$cgpath"/cgroup.procs) || true # OK if cgroup is gone
+	echo "pids: $pids"
+	[ -z "$pids" ]
+}
+
 @test "runc delete --force [paused container]" {
 	runc run -d --console-socket "$CONSOLE_SOCKET" ct1
 	[ "$status" -eq 0 ]
diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats
index ad116df0d6f..3124bb0e572 100644
--- a/tests/integration/kill.bats
+++ b/tests/integration/kill.bats
@@ -109,6 +109,7 @@ test_host_pidns_kill() {
 # 3. Test runc kill on a container whose init process is gone.
 #
 # Issue 4047, case "runc kill".
+# See also: "runc delete --force [host pidns + init gone]" test in delete.bats.
 @test "kill KILL [host pidns + init gone]" {
 	# Apparently, for rootless test, when using systemd cgroup manager,
 	# newer versions of systemd clean up the container as soon as its init

From d3d7f7d85abfe7bd6f4ddbe45ad6b1c188dcf194 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Thu, 9 Nov 2023 12:08:01 -0800
Subject: [PATCH 5/8] libct/cg: improve cgroup removal logic

The current code is only doing retries in RemovePaths, which is only
used for cgroup v1 (cgroup v2 uses RemovePath, which makes no retries).

Let's remove all retry logic and logging from RemovePaths, together
with:

 - os.Stat check from RemovePaths (its usage probably made sense before
   commit 19be8e5ba56df7a91c7 but not after);

 - error/warning logging from RemovePaths (this was added by commit
   19be8e5ba56df7a91c7 in 2020 and so far we've seen no errors other
   than EBUSY, so reporting the actual error proved to be useless).

Add the retry logic to rmdir, and the second retry bool argument.
Decrease the initial delay and increase the number of retries from the
old implementation so it can take up to ~1 sec before returning EBUSY
(was about 0.3 sec).

Hopefully, as a result, we'll have less "failed to remove cgroup paths"
errors.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/cgroups/utils.go | 74 +++++++++++++++--------------------
 1 file changed, 32 insertions(+), 42 deletions(-)

diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go
index 965c607c682..186cbc6413f 100644
--- a/libcontainer/cgroups/utils.go
+++ b/libcontainer/cgroups/utils.go
@@ -217,10 +217,26 @@ func PathExists(path string) bool {
 	return true
 }
 
-func rmdir(path string) error {
+// rmdir tries to remove a directory, optionally retrying on EBUSY.
+func rmdir(path string, retry bool) error {
+	delay := time.Millisecond
+	tries := 10
+
+again:
 	err := unix.Rmdir(path)
-	if err == nil || err == unix.ENOENT {
+	switch err { // nolint:errorlint // unix errors are bare
+	case nil, unix.ENOENT:
 		return nil
+	case unix.EINTR:
+		goto again
+	case unix.EBUSY:
+		if retry && tries > 0 {
+			time.Sleep(delay)
+			delay *= 2
+			tries--
+			goto again
+
+		}
 	}
 	return &os.PathError{Op: "rmdir", Path: path, Err: err}
 }
@@ -228,68 +244,42 @@ func rmdir(path string) error {
 // RemovePath aims to remove cgroup path. It does so recursively,
 // by removing any subdirectories (sub-cgroups) first.
 func RemovePath(path string) error {
-	// try the fast path first
-	if err := rmdir(path); err == nil {
+	// Try the fast path first.
+	if err := rmdir(path, false); err == nil {
 		return nil
 	}
 
 	infos, err := os.ReadDir(path)
-	if err != nil {
-		if os.IsNotExist(err) {
-			err = nil
-		}
+	if err != nil && !os.IsNotExist(err) {
 		return err
 	}
 	for _, info := range infos {
 		if info.IsDir() {
-			// We should remove subcgroups dir first
+			// We should remove subcgroup first.
 			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
 				break
 			}
 		}
 	}
 	if err == nil {
-		err = rmdir(path)
+		err = rmdir(path, true)
 	}
 	return err
 }
 
 // RemovePaths iterates over the provided paths removing them.
-// We trying to remove all paths five times with increasing delay between tries.
-// If after all there are not removed cgroups - appropriate error will be
-// returned.
 func RemovePaths(paths map[string]string) (err error) {
-	const retries = 5
-	delay := 10 * time.Millisecond
-	for i := 0; i < retries; i++ {
-		if i != 0 {
-			time.Sleep(delay)
-			delay *= 2
-		}
-		for s, p := range paths {
-			if err := RemovePath(p); err != nil {
-				// do not log intermediate iterations
-				switch i {
-				case 0:
-					logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
-				case retries - 1:
-					logrus.WithError(err).Error("Failed to remove cgroup")
-				}
-			}
-			_, err := os.Stat(p)
-			// We need this strange way of checking cgroups existence because
-			// RemoveAll almost always returns error, even on already removed
-			// cgroups
-			if os.IsNotExist(err) {
-				delete(paths, s)
-			}
-		}
-		if len(paths) == 0 {
-			//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
-			paths = make(map[string]string)
-			return nil
+	for s, p := range paths {
+		if err := RemovePath(p); err == nil {
+			delete(paths, s)
 		}
 	}
+	if len(paths) == 0 {
+		//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
+		// TODO: switch to clear once Go < 1.21 is not supported.
+		paths = make(map[string]string)
+		return nil
+	}
 	return fmt.Errorf("Failed to remove paths: %v", paths)
 }
 

From 7396ca90fa47d0458da4188061b24ca1bff465bf Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 6 Nov 2023 15:38:11 -0800
Subject: [PATCH 6/8] runc delete: do not ignore error from destroy

If container.Destroy() has failed, runc destroy still return 0, which is
wrong and can result in other issues down the line.

Let's always return error from destroy in runc delete.

For runc checkpoint and runc run, we still treat it as a warning.

Co-authored-by: Zhang Tianyang <burning9699@gmail.com>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 checkpoint.go                   |  4 +++-
 delete.go                       |  7 ++-----
 libcontainer/container_linux.go |  5 ++++-
 utils_linux.go                  | 10 +++-------
 4 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/checkpoint.go b/checkpoint.go
index a8a27f248bc..1f5f5e73975 100644
--- a/checkpoint.go
+++ b/checkpoint.go
@@ -70,7 +70,9 @@ checkpointed.`,
 		err = container.Checkpoint(options)
 		if err == nil && !(options.LeaveRunning || options.PreDump) {
 			// Destroy the container unless we tell CRIU to keep it.
-			destroy(container)
+			if err := container.Destroy(); err != nil {
+				logrus.Warn(err)
+			}
 		}
 		return err
 	},
diff --git a/delete.go b/delete.go
index e8b92ae5a92..fc8133438ea 100644
--- a/delete.go
+++ b/delete.go
@@ -18,8 +18,7 @@ func killContainer(container *libcontainer.Container) error {
 	for i := 0; i < 100; i++ {
 		time.Sleep(100 * time.Millisecond)
 		if err := container.Signal(unix.Signal(0)); err != nil {
-			destroy(container)
-			return nil
+			return container.Destroy()
 		}
 	}
 	return errors.New("container init still running")
@@ -80,13 +79,11 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
 		}
 		switch s {
 		case libcontainer.Stopped:
-			destroy(container)
+			return container.Destroy()
 		case libcontainer.Created:
 			return killContainer(container)
 		default:
 			return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s)
 		}
-
-		return nil
 	},
 }
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 515755276ad..f36abaf1032 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -874,7 +874,10 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
 func (c *Container) Destroy() error {
 	c.m.Lock()
 	defer c.m.Unlock()
-	return c.state.destroy()
+	if err := c.state.destroy(); err != nil {
+		return fmt.Errorf("unable to destroy container: %w", err)
+	}
+	return nil
 }
 
 // Pause pauses the container, if its state is RUNNING or CREATED, changing
diff --git a/utils_linux.go b/utils_linux.go
index 633d6d68ca6..5de00628493 100644
--- a/utils_linux.go
+++ b/utils_linux.go
@@ -88,12 +88,6 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) {
 	return lp, nil
 }
 
-func destroy(container *libcontainer.Container) {
-	if err := container.Destroy(); err != nil {
-		logrus.Error(err)
-	}
-}
-
 // setupIO modifies the given process config according to the options.
 func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) {
 	if createTTY {
@@ -303,7 +297,9 @@ func (r *runner) run(config *specs.Process) (int, error) {
 
 func (r *runner) destroy() {
 	if r.shouldDestroy {
-		destroy(r.container)
+		if err := r.container.Destroy(); err != nil {
+			logrus.Warn(err)
+		}
 	}
 }
 

From ab3cd8d73e61847e16fbc2fd1ce054301dc240d4 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 6 Nov 2023 15:54:37 -0800
Subject: [PATCH 7/8] runc delete, container.Destroy: kill all processes

(For a container with no private PID namespace, that is).

When runc delete (or container.Destroy) is called on a stopped
container without private PID namespace and there are processes
in its cgroup, kill those.

Add a test case.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/state_linux.go   | 11 +++++++++++
 tests/integration/delete.bats | 21 ++++++++++++++++++---
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go
index 0f629922b56..a725feafb33 100644
--- a/libcontainer/state_linux.go
+++ b/libcontainer/state_linux.go
@@ -35,6 +35,17 @@ type containerState interface {
 }
 
 func destroy(c *Container) error {
+	// Usually, when a container init is gone, all other processes in its
+	// cgroup are killed by the kernel. This is not the case for a shared
+	// PID namespace container, which may have some processes left after
+	// its init is killed or exited.
+	//
+	// As the container without init process running is considered stopped,
+	// and destroy is supposed to remove all the container resources, we need
+	// to kill those processes here.
+	if !c.config.Namespaces.IsPrivate(configs.NEWPID) {
+		_ = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
+	}
 	err := c.cgroupManager.Destroy()
 	if c.intelRdtManager != nil {
 		if ierr := c.intelRdtManager.Destroy(); err == nil {
diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats
index 8cc53c19e01..a1a21748536 100644
--- a/tests/integration/delete.bats
+++ b/tests/integration/delete.bats
@@ -62,9 +62,19 @@ function teardown() {
 	[ "$status" -eq 0 ]
 }
 
-# Issue 4047, case "runc delete -f".
-# See also: "kill KILL [host pidns + init gone]" test in kill.bats.
+# Issue 4047, case "runc delete".
+@test "runc delete [host pidns + init gone]" {
+	test_runc_delete_host_pidns
+}
+
+# Issue 4047, case "runc delete --force" (different code path).
+# shellcheck disable=SC2030
 @test "runc delete --force [host pidns + init gone]" {
+	test_runc_delete_host_pidns --force
+}
+
+# See also: "kill KILL [host pidns + init gone]" test in kill.bats.
+function test_runc_delete_host_pidns() {
 	requires cgroups_freezer
 
 	update_config '	  .linux.namespaces -= [{"type": "pid"}]'
@@ -91,6 +101,7 @@ function teardown() {
 	fi
 
 	runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox
+	# shellcheck disable=SC2031
 	[ "$status" -eq 0 ]
 	cgpath=$(get_cgroup_path "pids")
 	init_pid=$(cat "$cgpath"/cgroup.procs)
@@ -113,10 +124,14 @@ function teardown() {
 		kill -0 "$p"
 	done
 
-	runc delete -f test_busybox
+	# Must kill those processes and remove container.
+	# shellcheck disable=SC2031
+	runc delete "$@" test_busybox
+	# shellcheck disable=SC2031
 	[ "$status" -eq 0 ]
 
 	runc state test_busybox
+	# shellcheck disable=SC2031
 	[ "$status" -ne 0 ] # "Container does not exist"
 
 	# Make sure all processes are gone.

From a6f4081766a0f405bb9b5e798a4930c1f434c6b1 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 13 Nov 2023 15:39:21 -0800
Subject: [PATCH 8/8] libct: Destroy: don't proceed in case of errors

For some reason, container destroy operation removes container's state
directory even if cgroup removal fails (and then still returns an
error). It has been that way since commit 5c246d038fc47b, which added
cgroup removal.

This is problematic because once the container state dir is removed, we
no longer know container's cgroup and thus can't remove it.

Let's return the error early and fail if cgroup can't be removed.

Same for other operations: do not proceed if we fail.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/state_linux.go | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go
index a725feafb33..0b1b8716a76 100644
--- a/libcontainer/state_linux.go
+++ b/libcontainer/state_linux.go
@@ -46,19 +46,19 @@ func destroy(c *Container) error {
 	if !c.config.Namespaces.IsPrivate(configs.NEWPID) {
 		_ = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
 	}
-	err := c.cgroupManager.Destroy()
+	if err := c.cgroupManager.Destroy(); err != nil {
+		return fmt.Errorf("unable to remove container's cgroup: %w", err)
+	}
 	if c.intelRdtManager != nil {
-		if ierr := c.intelRdtManager.Destroy(); err == nil {
-			err = ierr
+		if err := c.intelRdtManager.Destroy(); err != nil {
+			return fmt.Errorf("unable to remove container's IntelRDT group: %w", err)
 		}
 	}
-	if rerr := os.RemoveAll(c.stateDir); err == nil {
-		err = rerr
+	if err := os.RemoveAll(c.stateDir); err != nil {
+		return fmt.Errorf("unable to remove container state dir: %w", err)
 	}
 	c.initProcess = nil
-	if herr := runPoststopHooks(c); err == nil {
-		err = herr
-	}
+	err := runPoststopHooks(c)
 	c.state = &stoppedState{c: c}
 	return err
 }