diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go index 82ef0fdf1b5..c51d60ce4c5 100644 --- a/libcontainer/mount_linux.go +++ b/libcontainer/mount_linux.go @@ -139,7 +139,7 @@ func (e *mountError) Unwrap() error { // mount is a simple unix.Mount wrapper, returning an error with more context // in case it failed. -func mount(source, target, fstype string, flags uintptr, data string) error { +func mount(source, target, fstype string, flags uintptr, data string) error { //nolint:unparam // mount(2) wrapper return mountViaFds(source, nil, target, "", fstype, flags, data) } diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index ef764df5f90..720b3dbdf13 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -419,7 +419,7 @@ func mountCgroupV2(m mountEntry, c *mountConfig) error { // Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted // with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`). err = utils.WithProcfdFile(m.dstFile, func(procfd string) error { - return maskPaths([]string{procfd}, c.label) + return maskPaths(c.root, []string{procfd}, c.label) }) } return err @@ -1328,12 +1328,64 @@ func verifyDevNull(f *os.File) error { }) } +// mountFunc matches mountViaFds; tests replace it with a recorder. +type mountFunc func(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error + +// isProcFdPath reports whether path is one of the procfs fd aliases runc uses +// for mount targets. These aliases should not be kept as reusable sources. +// +// The accepted forms are (where and are decimal integers): +// +// /proc/thread-self/fd/... +// /proc/self/fd/... +// /proc//fd/... +// /proc/self/task//fd/... +// /proc//task//fd/... +func isProcFdPath(path string) bool { + parts := strings.Split(strings.Trim(pathrs.LexicallyCleanPath(path), "/"), "/") + if len(parts) < 3 || parts[0] != "proc" { + return false + } + if parts[1] == "thread-self" { + return parts[2] == "fd" + } + if parts[2] == "fd" && (parts[1] == "self" || isProcPidOrTid(parts[1])) { + return true + } + if len(parts) < 5 || parts[2] != "task" || parts[4] != "fd" { + return false + } + return isProcPidOrTid(parts[3]) && (parts[1] == "self" || isProcPidOrTid(parts[1])) +} + +func isProcPidOrTid(value string) bool { + _, err := strconv.ParseUint(value, 10, 64) + return err == nil +} + +func maskPathsAfterChroot(paths []string, mountLabel string) error { + if len(paths) == 0 { + return nil + } + rootFd, err := os.OpenFile("/", unix.O_DIRECTORY|unix.O_CLOEXEC|unix.O_PATH, 0) + if err != nil { + return fmt.Errorf("open rootfs handle for masked paths: %w", err) + } + defer rootFd.Close() + return maskPaths(rootFd, paths, mountLabel) +} + +func maskPaths(rootFd *os.File, paths []string, mountLabel string) error { + return maskPathsWithMount(rootFd, paths, mountLabel, mountViaFds) +} + // maskPaths masks the top of the specified paths inside a container to avoid // security issues from processes reading information from non-namespace aware // mounts ( proc/kcore ). // For files, maskPath bind mounts /dev/null over the top of the specified path. -// For directories, maskPath mounts read-only tmpfs over the top of the specified path. -func maskPaths(paths []string, mountLabel string) error { +// For directories, maskPath uses a shared read-only tmpfs: the first directory +// gets a tmpfs mount, and later directories bind-mount the same tmpfs. +func maskPathsWithMount(rootFd *os.File, paths []string, mountLabel string, mountFn mountFunc) error { devNull, err := os.OpenFile("/dev/null", unix.O_PATH, 0) if err != nil { return fmt.Errorf("can't mask paths: %w", err) @@ -1346,6 +1398,15 @@ func maskPaths(paths []string, mountLabel string) error { procSelfFd, closer := utils.ProcThreadSelf("fd/") defer closer() + var sharedDirMask *os.File + defer func() { + if sharedDirMask != nil { + _ = sharedDirMask.Close() + } + }() + var sharedDirMaskSrc *mountSource + maskedDirs := make(map[string]struct{}) + for _, path := range paths { // Open the target path; skip if it doesn't exist. dstFh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) @@ -1362,14 +1423,38 @@ func maskPaths(paths []string, mountLabel string) error { } var dstType string if st.IsDir() { - // Destination is a directory: bind mount a ro tmpfs over it. + // Directory targets use a read-only tmpfs. dstType = "dir" - err = mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel)) + cleanPath := pathrs.LexicallyCleanPath(path) + if _, ok := maskedDirs[cleanPath]; ok { + dstFh.Close() + continue + } + maskedDirs[cleanPath] = struct{}{} + + dstFd := filepath.Join(procSelfFd, strconv.Itoa(int(dstFh.Fd()))) + if sharedDirMaskSrc == nil { + err = mountFn("tmpfs", nil, path, dstFd, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel)) + // Procfs fd aliases are not durable rootfs paths; let the + // next ordinary directory become the reusable source. + if err == nil && !isProcFdPath(path) { + sharedDirMask, err = reopenAfterMount(rootFd, dstFh, unix.O_PATH|unix.O_CLOEXEC) + if err != nil { + err = fmt.Errorf("can't reopen shared directory mask: %w", err) + } + if err == nil { + sharedDirMaskSrc = &mountSource{Type: mountSourcePlain, file: sharedDirMask} + } + } + } else { + // clone_mnt copies MNT_READONLY from the source vfsmount. + err = mountFn("", sharedDirMaskSrc, path, dstFd, "", unix.MS_BIND, "") + } } else { // Destination is a file: mount it to /dev/null. dstType = "path" dstFd := filepath.Join(procSelfFd, strconv.Itoa(int(dstFh.Fd()))) - err = mountViaFds("", devNullSrc, path, dstFd, "", unix.MS_BIND, "") + err = mountFn("", devNullSrc, path, dstFd, "", unix.MS_BIND, "") } dstFh.Close() if err != nil { diff --git a/libcontainer/rootfs_linux_test.go b/libcontainer/rootfs_linux_test.go index 03bfbb8d121..16294b554e4 100644 --- a/libcontainer/rootfs_linux_test.go +++ b/libcontainer/rootfs_linux_test.go @@ -1,9 +1,12 @@ package libcontainer import ( + "os" + "path/filepath" "testing" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" "golang.org/x/sys/unix" ) @@ -208,3 +211,152 @@ func TestNeedsSetupDevStrangeSourceDest(t *testing.T) { t.Fatal("expected needsSetupDev to be true, got false") } } + +type recordedMount struct { + source string + srcFileName string + srcFileType mountSourceType + target string + dstFd string + fstype string + flags uintptr + data string +} + +func recordMounts(calls *[]recordedMount) mountFunc { + return func(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error { + call := recordedMount{ + source: source, + target: target, + dstFd: dstFd, + fstype: fstype, + flags: flags, + data: data, + } + if srcFile != nil { + call.srcFileName = srcFile.file.Name() + call.srcFileType = srcFile.Type + } + *calls = append(*calls, call) + return nil + } +} + +func TestMaskPathsWithSharedDirMask(t *testing.T) { + root := t.TempDir() + dir1 := filepath.Join(root, "dir1") + dir2 := filepath.Join(root, "dir2") + file := filepath.Join(root, "file") + missing := filepath.Join(root, "missing") + rootFd, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC|unix.O_PATH, 0) + if err != nil { + t.Fatal(err) + } + defer rootFd.Close() + for _, dir := range []string{dir1, dir2} { + if err := os.Mkdir(dir, 0o755); err != nil { + t.Fatal(err) + } + } + if err := os.WriteFile(file, []byte("secret"), 0o644); err != nil { + t.Fatal(err) + } + + var calls []recordedMount + mountLabel := "system_u:object_r:container_file_t:s0" + if err := maskPathsWithMount(rootFd, []string{ + missing, + dir1, + dir2, + filepath.Join(dir1, "."), + file, + }, mountLabel, recordMounts(&calls)); err != nil { + t.Fatal(err) + } + + if len(calls) != 3 { + t.Fatalf("expected 3 mount calls, got %d: %#v", len(calls), calls) + } + + if call := calls[0]; call.source != "tmpfs" || call.fstype != "tmpfs" || call.flags != unix.MS_RDONLY || + call.target != dir1 || call.dstFd == "" || call.data != `context="system_u:object_r:container_file_t:s0"` { + t.Fatalf("unexpected shared tmpfs mount call: %#v", call) + } + if call := calls[1]; call.srcFileType != mountSourcePlain || + call.target != dir2 || call.dstFd == "" || call.fstype != "" || call.flags != unix.MS_BIND || call.data != "" { + t.Fatalf("unexpected shared tmpfs bind mount call: %#v", call) + } + if call := calls[2]; call.srcFileName != "/dev/null" || call.srcFileType != mountSourcePlain || + call.target != file || call.dstFd == "" || call.fstype != "" || call.flags != unix.MS_BIND || call.data != "" { + t.Fatalf("unexpected file mask mount call: %#v", call) + } +} + +func TestIsProcFdPath(t *testing.T) { + for _, path := range []string{ + "/proc/thread-self/fd/7", + "/proc/self/fd/7", + "/proc/self/task/123/fd/7", + "/proc/self/task/123/fd/../fd/7", + "/proc/123/fd/7", + "/proc/123/task/456/fd/7", + } { + if !isProcFdPath(path) { + t.Errorf("expected %q to be a procfd path", path) + } + } + for _, path := range []string{ + "/proc/acpi", + "/proc/self/fdinfo/7", + "/proc/self/task/123/fdinfo/7", + "/proc/self/task/foo/fd/7", + "/proc/foo/fd/7", + "/proc/123/task/foo/fd/7", + "/sys/devices/system/cpu/cpu0/thermal_throttle", + } { + if isProcFdPath(path) { + t.Errorf("expected %q not to be a procfd path", path) + } + } +} + +func TestMaskPathsDoesNotReuseProcFdMaskAsSharedSource(t *testing.T) { + root := t.TempDir() + dir1 := filepath.Join(root, "dir1") + dir2 := filepath.Join(root, "dir2") + dir3 := filepath.Join(root, "dir3") + rootFd, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC|unix.O_PATH, 0) + if err != nil { + t.Fatal(err) + } + defer rootFd.Close() + for _, dir := range []string{dir1, dir2, dir3} { + if err := os.Mkdir(dir, 0o755); err != nil { + t.Fatal(err) + } + } + dir1File, err := os.OpenFile(dir1, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + t.Fatal(err) + } + defer dir1File.Close() + procFd, closer := utils.ProcThreadSelfFd(dir1File.Fd()) + defer closer() + + var calls []recordedMount + if err := maskPathsWithMount(rootFd, []string{procFd, dir2, dir3}, "", recordMounts(&calls)); err != nil { + t.Fatal(err) + } + if len(calls) != 3 { + t.Fatalf("expected 3 mount calls, got %d: %#v", len(calls), calls) + } + for _, call := range calls[:2] { + if call.fstype != "tmpfs" || call.flags != unix.MS_RDONLY { + t.Fatalf("expected procfd source to force separate tmpfs mounts, got %#v", calls) + } + } + if call := calls[2]; call.srcFileType != mountSourcePlain || + call.target != dir3 || call.fstype != "" || call.flags != unix.MS_BIND { + t.Fatalf("expected third directory to bind mount from second directory, got %#v", call) + } +} diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 570472bd41f..2e4a3ced008 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -142,7 +142,7 @@ func (l *linuxStandardInit) Init() error { } } - if err := maskPaths(l.config.Config.MaskPaths, l.config.Config.MountLabel); err != nil { + if err := maskPathsAfterChroot(l.config.Config.MaskPaths, l.config.Config.MountLabel); err != nil { return err } pdeath, err := system.GetParentDeathSignal() diff --git a/tests/integration/mask.bats b/tests/integration/mask.bats index 5783332ea18..b83c26547ba 100644 --- a/tests/integration/mask.bats +++ b/tests/integration/mask.bats @@ -6,11 +6,11 @@ function setup() { setup_busybox # Create fake rootfs. - mkdir rootfs/testdir + mkdir rootfs/testdir rootfs/testdir2 rootfs/testdir3 echo "Forbidden information!" >rootfs/testfile # add extra masked paths - update_config '(.. | select(.maskedPaths? != null)) .maskedPaths += ["/testdir", "/testfile"]' + update_config '(.. | select(.maskedPaths? != null)) .maskedPaths += ["/testdir", "/testdir2", "/testdir3", "/testfile"]' } function teardown() { @@ -55,6 +55,34 @@ function teardown() { [[ "${output}" == *"Operation not permitted"* ]] } +@test "mask paths [directories share tmpfs]" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # shellcheck disable=SC2016 + runc exec test_busybox sh -euc ' + set -- $(stat -c %d /testdir /testdir2 /testdir3) + [ "$1" = "$2" ] + [ "$2" = "$3" ] + ' + [ "$status" -eq 0 ] + + runc exec test_busybox touch /testdir2/foo + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] +} + +@test "mask paths [directory with read-only rootfs]" { + update_config '.root.readonly = true' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox ls /testdir + [ "$status" -eq 0 ] + [ -z "$output" ] +} + @test "mask paths [prohibit symlink /proc]" { ln -s /symlink rootfs/proc runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox