diff --git a/checkpoint.go b/checkpoint.go index a8a27f248bc..1f5f5e73975 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -70,7 +70,9 @@ checkpointed.`, err = container.Checkpoint(options) if err == nil && !(options.LeaveRunning || options.PreDump) { // Destroy the container unless we tell CRIU to keep it. - destroy(container) + if err := container.Destroy(); err != nil { + logrus.Warn(err) + } } return err }, diff --git a/delete.go b/delete.go index 682101ccad8..fc8133438ea 100644 --- a/delete.go +++ b/delete.go @@ -18,8 +18,7 @@ func killContainer(container *libcontainer.Container) error { for i := 0; i < 100; i++ { time.Sleep(100 * time.Millisecond) if err := container.Signal(unix.Signal(0)); err != nil { - destroy(container) - return nil + return container.Destroy() } } return errors.New("container init still running") @@ -66,22 +65,25 @@ status of "ubuntu01" as "stopped" the following will delete resources held for } return err } + // When --force is given, we kill all container processes and + // then destroy the container. This is done even for a stopped + // container, because (in case it does not have its own PID + // namespace) there may be some leftover processes in the + // container's cgroup. + if force { + return killContainer(container) + } s, err := container.Status() if err != nil { return err } switch s { case libcontainer.Stopped: - destroy(container) + return container.Destroy() case libcontainer.Created: return killContainer(container) default: - if force { - return killContainer(container) - } return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s) } - - return nil }, } diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go index 965c607c682..186cbc6413f 100644 --- a/libcontainer/cgroups/utils.go +++ b/libcontainer/cgroups/utils.go @@ -217,10 +217,26 @@ func PathExists(path string) bool { return true } -func rmdir(path string) error { +// rmdir tries to remove a directory, optionally retrying on EBUSY. +func rmdir(path string, retry bool) error { + delay := time.Millisecond + tries := 10 + +again: err := unix.Rmdir(path) - if err == nil || err == unix.ENOENT { + switch err { // nolint:errorlint // unix errors are bare + case nil, unix.ENOENT: return nil + case unix.EINTR: + goto again + case unix.EBUSY: + if retry && tries > 0 { + time.Sleep(delay) + delay *= 2 + tries-- + goto again + + } } return &os.PathError{Op: "rmdir", Path: path, Err: err} } @@ -228,68 +244,42 @@ func rmdir(path string) error { // RemovePath aims to remove cgroup path. It does so recursively, // by removing any subdirectories (sub-cgroups) first. func RemovePath(path string) error { - // try the fast path first - if err := rmdir(path); err == nil { + // Try the fast path first. + if err := rmdir(path, false); err == nil { return nil } infos, err := os.ReadDir(path) - if err != nil { - if os.IsNotExist(err) { - err = nil - } + if err != nil && !os.IsNotExist(err) { return err } for _, info := range infos { if info.IsDir() { - // We should remove subcgroups dir first + // We should remove subcgroup first. if err = RemovePath(filepath.Join(path, info.Name())); err != nil { break } } } if err == nil { - err = rmdir(path) + err = rmdir(path, true) } return err } // RemovePaths iterates over the provided paths removing them. -// We trying to remove all paths five times with increasing delay between tries. -// If after all there are not removed cgroups - appropriate error will be -// returned. func RemovePaths(paths map[string]string) (err error) { - const retries = 5 - delay := 10 * time.Millisecond - for i := 0; i < retries; i++ { - if i != 0 { - time.Sleep(delay) - delay *= 2 - } - for s, p := range paths { - if err := RemovePath(p); err != nil { - // do not log intermediate iterations - switch i { - case 0: - logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)") - case retries - 1: - logrus.WithError(err).Error("Failed to remove cgroup") - } - } - _, err := os.Stat(p) - // We need this strange way of checking cgroups existence because - // RemoveAll almost always returns error, even on already removed - // cgroups - if os.IsNotExist(err) { - delete(paths, s) - } - } - if len(paths) == 0 { - //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 - paths = make(map[string]string) - return nil + for s, p := range paths { + if err := RemovePath(p); err == nil { + delete(paths, s) } } + if len(paths) == 0 { + //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 + // TODO: switch to clear once Go < 1.21 is not supported. + paths = make(map[string]string) + return nil + } return fmt.Errorf("Failed to remove paths: %v", paths) } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 616cd1ede9f..f36abaf1032 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -115,7 +115,7 @@ func (c *Container) ignoreCgroupError(err error) error { if err == nil { return nil } - if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() { + if errors.Is(err, os.ErrNotExist) && !c.hasInit() && !c.cgroupManager.Exists() { return nil } return err @@ -364,16 +364,6 @@ func (c *Container) start(process *Process) (retErr error) { func (c *Container) Signal(s os.Signal) error { c.m.Lock() defer c.m.Unlock() - status, err := c.currentStatus() - if err != nil { - return err - } - // To avoid a PID reuse attack, don't kill non-running container. - switch status { - case Running, Created, Paused: - default: - return ErrNotRunning - } // When a container has its own PID namespace, inside it the init PID // is 1, and thus it is handled specially by the kernel. In particular, @@ -381,20 +371,28 @@ func (c *Container) Signal(s os.Signal) error { // all other processes in that PID namespace (see pid_namespaces(7)). // // OTOH, if PID namespace is shared, we should kill all pids to avoid - // leftover processes. + // leftover processes. Handle this special case here. if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) { - err = signalAllProcesses(c.cgroupManager, unix.SIGKILL) - } else { - err = c.initProcess.signal(s) + if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil { + return fmt.Errorf("unable to kill all processes: %w", err) + } + return nil } - if err != nil { + + // To avoid a PID reuse attack, don't kill non-running container. + if !c.hasInit() { + return ErrNotRunning + } + if err := c.initProcess.signal(s); err != nil { return fmt.Errorf("unable to signal init: %w", err) } - if status == Paused && s == unix.SIGKILL { + if s == unix.SIGKILL { // For cgroup v1, killing a process in a frozen cgroup // does nothing until it's thawed. Only thaw the cgroup // for SIGKILL. - _ = c.cgroupManager.Freeze(configs.Thawed) + if paused, _ := c.isPaused(); paused { + _ = c.cgroupManager.Freeze(configs.Thawed) + } } return nil } @@ -876,7 +874,10 @@ func (c *Container) newInitConfig(process *Process) *initConfig { func (c *Container) Destroy() error { c.m.Lock() defer c.m.Unlock() - return c.state.destroy() + if err := c.state.destroy(); err != nil { + return fmt.Errorf("unable to destroy container: %w", err) + } + return nil } // Pause pauses the container, if its state is RUNNING or CREATED, changing @@ -1006,34 +1007,31 @@ func (c *Container) refreshState() error { if paused { return c.state.transition(&pausedState{c: c}) } - t := c.runType() - switch t { - case Created: + if !c.hasInit() { + return c.state.transition(&stoppedState{c: c}) + } + // The presence of exec fifo helps to distinguish between + // the created and the running states. + if _, err := os.Stat(filepath.Join(c.stateDir, execFifoFilename)); err == nil { return c.state.transition(&createdState{c: c}) - case Running: - return c.state.transition(&runningState{c: c}) } - return c.state.transition(&stoppedState{c: c}) + return c.state.transition(&runningState{c: c}) } -func (c *Container) runType() Status { +// hasInit tells whether the container init process exists. +func (c *Container) hasInit() bool { if c.initProcess == nil { - return Stopped + return false } pid := c.initProcess.pid() stat, err := system.Stat(pid) if err != nil { - return Stopped + return false } if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead { - return Stopped - } - // We'll create exec fifo and blocking on it after container is created, - // and delete it after start container. - if _, err := os.Stat(filepath.Join(c.stateDir, execFifoFilename)); err == nil { - return Created + return false } - return Running + return true } func (c *Container) isPaused() (bool, error) { diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index ec0f002a5fc..6ab0698eed8 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -673,6 +673,9 @@ func setupPersonality(config *configs.Config) error { // signalAllProcesses freezes then iterates over all the processes inside the // manager's cgroups sending the signal s to them. func signalAllProcesses(m cgroups.Manager, s unix.Signal) error { + if !m.Exists() { + return ErrNotRunning + } // Use cgroup.kill, if available. if s == unix.SIGKILL { if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid. diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go index c6967e5f4f5..0b1b8716a76 100644 --- a/libcontainer/state_linux.go +++ b/libcontainer/state_linux.go @@ -35,19 +35,30 @@ type containerState interface { } func destroy(c *Container) error { - err := c.cgroupManager.Destroy() + // Usually, when a container init is gone, all other processes in its + // cgroup are killed by the kernel. This is not the case for a shared + // PID namespace container, which may have some processes left after + // its init is killed or exited. + // + // As the container without init process running is considered stopped, + // and destroy is supposed to remove all the container resources, we need + // to kill those processes here. + if !c.config.Namespaces.IsPrivate(configs.NEWPID) { + _ = signalAllProcesses(c.cgroupManager, unix.SIGKILL) + } + if err := c.cgroupManager.Destroy(); err != nil { + return fmt.Errorf("unable to remove container's cgroup: %w", err) + } if c.intelRdtManager != nil { - if ierr := c.intelRdtManager.Destroy(); err == nil { - err = ierr + if err := c.intelRdtManager.Destroy(); err != nil { + return fmt.Errorf("unable to remove container's IntelRDT group: %w", err) } } - if rerr := os.RemoveAll(c.stateDir); err == nil { - err = rerr + if err := os.RemoveAll(c.stateDir); err != nil { + return fmt.Errorf("unable to remove container state dir: %w", err) } c.initProcess = nil - if herr := runPoststopHooks(c); err == nil { - err = herr - } + err := runPoststopHooks(c) c.state = &stoppedState{c: c} return err } @@ -103,7 +114,7 @@ func (r *runningState) status() Status { func (r *runningState) transition(s containerState) error { switch s.(type) { case *stoppedState: - if r.c.runType() == Running { + if r.c.hasInit() { return ErrRunning } r.c.state = s @@ -118,7 +129,7 @@ func (r *runningState) transition(s containerState) error { } func (r *runningState) destroy() error { - if r.c.runType() == Running { + if r.c.hasInit() { return ErrRunning } return destroy(r.c) @@ -170,14 +181,13 @@ func (p *pausedState) transition(s containerState) error { } func (p *pausedState) destroy() error { - t := p.c.runType() - if t != Running && t != Created { - if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { - return err - } - return destroy(p.c) + if p.c.hasInit() { + return ErrPaused + } + if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err } - return ErrPaused + return destroy(p.c) } // restoredState is the same as the running state but also has associated checkpoint diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats index e9b80b64d06..a1a21748536 100644 --- a/tests/integration/delete.bats +++ b/tests/integration/delete.bats @@ -62,6 +62,84 @@ function teardown() { [ "$status" -eq 0 ] } +# Issue 4047, case "runc delete". +@test "runc delete [host pidns + init gone]" { + test_runc_delete_host_pidns +} + +# Issue 4047, case "runc delete --force" (different code path). +# shellcheck disable=SC2030 +@test "runc delete --force [host pidns + init gone]" { + test_runc_delete_host_pidns --force +} + +# See also: "kill KILL [host pidns + init gone]" test in kill.bats. +function test_runc_delete_host_pidns() { + requires cgroups_freezer + + update_config ' .linux.namespaces -= [{"type": "pid"}]' + set_cgroups_path + if [ $EUID -ne 0 ]; then + requires rootless_cgroup + # Apparently, for rootless test, when using systemd cgroup manager, + # newer versions of systemd clean up the container as soon as its init + # process is gone. This is all fine and dandy, except it prevents us to + # test this case, thus we skip the test. + # + # It is not entirely clear which systemd version got this feature: + # v245 works fine, and v249 does not. + if [ -v RUNC_USE_SYSTEMD ] && [ "$(systemd_version)" -gt 245 ]; then + skip "rootless+systemd conflicts with systemd > 245" + fi + # Can't mount real /proc when rootless + no pidns, + # so change it to a bind-mounted one from the host. + update_config ' .mounts |= map((select(.type == "proc") + | .type = "none" + | .source = "/proc" + | .options = ["rbind", "nosuid", "nodev", "noexec"] + ) // .)' + fi + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + # shellcheck disable=SC2031 + [ "$status" -eq 0 ] + cgpath=$(get_cgroup_path "pids") + init_pid=$(cat "$cgpath"/cgroup.procs) + + # Start a few more processes. + for _ in 1 2 3 4 5; do + __runc exec -d test_busybox sleep 1h + done + + # Now kill the container's init process. Since the container do + # not have own PID ns, its init is no special and the container + # will still be up and running. + kill -9 "$init_pid" + + # Get the list of all container processes. + pids=$(cat "$cgpath"/cgroup.procs) + echo "pids: $pids" + # Sanity check -- make sure all processes exist. + for p in $pids; do + kill -0 "$p" + done + + # Must kill those processes and remove container. + # shellcheck disable=SC2031 + runc delete "$@" test_busybox + # shellcheck disable=SC2031 + [ "$status" -eq 0 ] + + runc state test_busybox + # shellcheck disable=SC2031 + [ "$status" -ne 0 ] # "Container does not exist" + + # Make sure all processes are gone. + pids=$(cat "$cgpath"/cgroup.procs) || true # OK if cgroup is gone + echo "pids: $pids" + [ -z "$pids" ] +} + @test "runc delete --force [paused container]" { runc run -d --console-socket "$CONSOLE_SOCKET" ct1 [ "$status" -eq 0 ] diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats index 8f2f4a7b4e8..3124bb0e572 100644 --- a/tests/integration/kill.bats +++ b/tests/integration/kill.bats @@ -10,6 +10,7 @@ function teardown() { teardown_bundle } +# shellcheck disable=SC2030 @test "kill detached busybox" { # run busybox detached runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox @@ -35,16 +36,15 @@ function teardown() { [ "$status" -eq 0 ] } -# This is roughly the same as TestPIDHostInitProcessWait in libcontainer/integration. -@test "kill KILL [host pidns]" { - # kill -a currently requires cgroup freezer. +test_host_pidns_kill() { requires cgroups_freezer update_config ' .linux.namespaces -= [{"type": "pid"}]' set_cgroups_path if [ $EUID -ne 0 ]; then - # kill --all requires working cgroups. requires rootless_cgroup + # Can't mount real /proc when rootless + no pidns, + # so change it to a bind-mounted one from the host. update_config ' .mounts |= map((select(.type == "proc") | .type = "none" | .source = "/proc" @@ -53,17 +53,28 @@ function teardown() { fi runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + # shellcheck disable=SC2031 [ "$status" -eq 0 ] + cgpath=$(get_cgroup_path "pids") + init_pid=$(cat "$cgpath"/cgroup.procs) # Start a few more processes. for _ in 1 2 3 4 5; do __runc exec -d test_busybox sleep 1h - [ "$status" -eq 0 ] done + if [ -v KILL_INIT ]; then + # Now kill the container's init process. Since the container do + # not have own PID ns, its init is no special and the container + # will still be up and running (except for rootless container + # AND systemd cgroup driver AND systemd > v245, when systemd + # kills the container; see "kill KILL [host pidns + init gone]" + # below). + kill -9 "$init_pid" + fi + # Get the list of all container processes. - path=$(get_cgroup_path "pids") - pids=$(cat "$path"/cgroup.procs) + pids=$(cat "$cgpath"/cgroup.procs) echo "pids: $pids" # Sanity check -- make sure all processes exist. for p in $pids; do @@ -71,14 +82,46 @@ function teardown() { done runc kill test_busybox KILL + # shellcheck disable=SC2031 [ "$status" -eq 0 ] wait_for_container 10 1 test_busybox stopped # Make sure all processes are gone. - pids=$(cat "$path"/cgroup.procs) || true # OK if cgroup is gone + pids=$(cat "$cgpath"/cgroup.procs) || true # OK if cgroup is gone echo "pids: $pids" - for p in $pids; do - run ! kill -0 "$p" - [[ "$output" = *"No such process" ]] - done + [ -z "$pids" ] +} + +# This is roughly the same as TestPIDHostInitProcessWait in libcontainer/integration. +# The differences are: +# +# 1. Here we use separate processes to create and to kill a container, so the +# processes inside a container are not children of "runc kill". +# +# 2. We hit different codepaths (nonChildProcess.signal rather than initProcess.signal). +@test "kill KILL [host pidns]" { + unset KILL_INIT + test_host_pidns_kill +} + +# Same as above plus: +# +# 3. Test runc kill on a container whose init process is gone. +# +# Issue 4047, case "runc kill". +# See also: "runc delete --force [host pidns + init gone]" test in delete.bats. +@test "kill KILL [host pidns + init gone]" { + # Apparently, for rootless test, when using systemd cgroup manager, + # newer versions of systemd clean up the container as soon as its init + # process is gone. This is all fine and dandy, except it prevents us to + # test this case, thus we skip the test. + # + # It is not entirely clear which systemd version got this feature: + # v245 works fine, and v249 does not. + if [ $EUID -ne 0 ] && [ -v RUNC_USE_SYSTEMD ] && [ "$(systemd_version)" -gt 245 ]; then + skip "rootless+systemd conflicts with systemd > 245" + fi + KILL_INIT=1 + test_host_pidns_kill + unset KILL_INIT } diff --git a/utils_linux.go b/utils_linux.go index 633d6d68ca6..5de00628493 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -88,12 +88,6 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) { return lp, nil } -func destroy(container *libcontainer.Container) { - if err := container.Destroy(); err != nil { - logrus.Error(err) - } -} - // setupIO modifies the given process config according to the options. func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) { if createTTY { @@ -303,7 +297,9 @@ func (r *runner) run(config *specs.Process) (int, error) { func (r *runner) destroy() { if r.shouldDestroy { - destroy(r.container) + if err := r.container.Destroy(); err != nil { + logrus.Warn(err) + } } }