From dc0dc672fb12505747637e72a9d1adc92eb19c64 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Mon, 11 May 2026 20:16:25 +0000 Subject: [PATCH 01/22] linux: fix NULL pointer dereference The check was inverted as it dereferenced src_nofollow when the pointer was NULL instead of when it was non-NULL. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index fcb62dbfea..d3ad248f20 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -4313,7 +4313,7 @@ is_bind_mount (runtime_spec_schema_defs_mount *mnt, bool *recursive, bool *src_n bool ret = false; size_t i; - if (src_nofollow == NULL) + if (src_nofollow != NULL) *src_nofollow = false; for (i = 0; i < mnt->options_len; i++) From ef23680bb65598f336029ca93b24de1894a9bc90 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 14:04:46 +0000 Subject: [PATCH 02/22] linux: open procfd early and store in private_data Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index d3ad248f20..322cd33b18 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -137,6 +137,8 @@ struct private_data_s unsigned long rootfs_propagation; bool deny_setgroups; + int procfd; + const char *rootfs; int rootfsfd; @@ -170,6 +172,8 @@ cleanup_private_data (void *private_data) if (p->rootfsfd >= 0) TEMP_FAILURE_RETRY (close (p->rootfsfd)); + if (p->procfd >= 0) + TEMP_FAILURE_RETRY (close (p->procfd)); if (p->maskdir_fd >= 0) TEMP_FAILURE_RETRY (close (p->maskdir_fd)); if (p->mount_fds) @@ -193,6 +197,7 @@ get_private_data (struct libcrun_container_s *container) struct private_data_s *p = xmalloc0 (sizeof (*p)); container->private_data = p; p->rootfsfd = -1; + p->procfd = -1; p->notify_socket_tree_fd = -1; 
p->maskdir_fd = -1; container->cleanup_private_data = cleanup_private_data; @@ -891,6 +896,24 @@ fsopen_mount (const char *type, const char *labeltype, const char *label) #endif } +static int +get_procfd (struct private_data_s *data, libcrun_error_t *err) +{ + int fd; + + if (data->procfd >= 0) + return data->procfd; + + fd = fsopen_mount ("proc", NULL, NULL); + if (fd < 0) + fd = open ("/proc", O_DIRECTORY | O_CLOEXEC); + if (UNLIKELY (fd < 0)) + return crun_make_error (err, errno, "open `/proc`"); + + data->procfd = fd; + return fd; +} + static int fs_move_mount_to (int fd, int dirfd, const char *name) { @@ -2981,6 +3004,7 @@ int libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, const char *rootfs, libcrun_error_t *err) { int ret; + if (get_private_data (container)->unshare_flags & CLONE_NEWNS) { if (no_pivot) From ef066840e38a9eada34ec07fe1d29d1440eec4eb Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 14:05:21 +0000 Subject: [PATCH 03/22] linux: use fsetxattr with procfd in do_mount Replace setxattr via /proc/self/fd path with openat(procfd) + fsetxattr. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 15 ++++++++++----- src/libcrun/utils.h | 6 ++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 322cd33b18..1cee6298d8 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -1320,12 +1320,17 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, #ifdef HAVE_FGETXATTR if (label_how == LABEL_XATTR) { - proc_fd_path_t proc_file; - - get_proc_self_fd_path (proc_file, fd); + int procfd = get_procfd (get_private_data (container), err); + if (procfd >= 0) + { + proc_fd_path_t proc_self_path; + cleanup_close int xfd = -1; - /* We need to go through the proc_file since fd itself is opened as O_PATH. 
*/ - (void) setxattr (proc_file, "security.selinux", label, strlen (label), 0); + get_self_fd_path (proc_self_path, fd); + xfd = openat (procfd, proc_self_path, O_RDONLY | O_CLOEXEC); + if (xfd >= 0) + (void) fsetxattr (xfd, "security.selinux", label, strlen (label), 0); + } } #endif diff --git a/src/libcrun/utils.h b/src/libcrun/utils.h index c916378da1..67485aa9d6 100644 --- a/src/libcrun/utils.h +++ b/src/libcrun/utils.h @@ -502,6 +502,12 @@ get_proc_self_fd_path (proc_fd_path_t path, int fd) get_proc_fd_path (path, 0, fd); } +static inline void +get_self_fd_path (proc_fd_path_t path, int fd) +{ + snprintf (path, sizeof (proc_fd_path_t), "self/fd/%d", fd); +} + static inline int validate_options (unsigned int specified_options, unsigned int supported_options, libcrun_error_t *err) { From 022c7c5575ab02a4b6aadd75b2c8d0afca95fbfa Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 14:06:02 +0000 Subject: [PATCH 04/22] linux: use fchmodat/fchownat with procfd in libcrun_create_dev Replace chmod/chown via /proc/self/fd path with fchmodat/fchownat using the trusted procfd directory fd. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 1cee6298d8..33544821f5 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -1752,6 +1752,9 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, else { proc_fd_path_t fd_buffer; + int procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; dev = makedev (device->major, device->minor); @@ -1772,15 +1775,15 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, if (UNLIKELY (fd < 0)) return fd; - get_proc_self_fd_path (fd_buffer, fd); + get_self_fd_path (fd_buffer, fd); - ret = chmod (fd_buffer, device->mode); + ret = fchmodat (procfd, fd_buffer, device->mode, 0); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chmod `%s`", device->path); + return crun_make_error (err, errno, "fchmodat `%s`", device->path); - ret = chown (fd_buffer, device->uid, device->gid); /* lgtm [cpp/toctou-race-condition] */ + ret = fchownat (procfd, fd_buffer, device->uid, device->gid, 0); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chown `%s`", device->path); + return crun_make_error (err, errno, "fchownat `%s`", device->path); } else { @@ -1827,15 +1830,15 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, if (UNLIKELY (fd < 0)) return crun_error_wrap (err, "openat `%s`", device->path); - get_proc_self_fd_path (fd_buffer, fd); + get_self_fd_path (fd_buffer, fd); - ret = chmod (fd_buffer, device->mode); + ret = fchmodat (procfd, fd_buffer, device->mode, 0); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chmod `%s`", device->path); + return crun_make_error (err, errno, "fchmodat `%s`", device->path); - ret = chown (fd_buffer, device->uid, device->gid); /* lgtm [cpp/toctou-race-condition] */ + 
ret = fchownat (procfd, fd_buffer, device->uid, device->gid, 0); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chown `%s`", device->path); + return crun_make_error (err, errno, "fchownat `%s`", device->path); } } return 0; From 42b16e0c213d31b913aa8f85d15272130bdd2914 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 14:06:28 +0000 Subject: [PATCH 05/22] linux: use procfd to read /proc/self/cgroup in do_mount_cgroup_v1 Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/cgroup.h | 6 +++++- src/libcrun/linux.c | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/libcrun/cgroup.h b/src/libcrun/cgroup.h index 74aa6ca0bb..0e1acfb3ee 100644 --- a/src/libcrun/cgroup.h +++ b/src/libcrun/cgroup.h @@ -26,8 +26,12 @@ # define CGROUP_ROOT "/sys/fs/cgroup" #endif +#ifndef SELF_CGROUP +# define SELF_CGROUP "self/cgroup" +#endif + #ifndef PROC_SELF_CGROUP -# define PROC_SELF_CGROUP "/proc/self/cgroup" +# define PROC_SELF_CGROUP "/proc/" SELF_CGROUP #endif enum diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 33544821f5..6b3a41fa9d 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -1520,7 +1520,13 @@ do_mount_cgroup_v1 (libcrun_container_t *container, const char *source, int targ return tmpfsdirfd; targetfd = tmpfsdirfd; - ret = read_all_file (PROC_SELF_CGROUP, &content, NULL, err); + { + int procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; + + ret = read_all_file_at (procfd, SELF_CGROUP, &content, NULL, err); + } if (UNLIKELY (ret < 0)) return ret; From 73d731974251962e26102a88dfdfd90c530f41c7 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 14:06:54 +0000 Subject: [PATCH 06/22] linux: use procfd to stat source_mountfd in process_single_mount Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 
deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 6b3a41fa9d..44bb892135 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -2336,18 +2336,19 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, proc_fd_path_t proc_buf; const char *path = mount->source; + if ((extra_flags & OPTION_COPY_SYMLINK) && (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_DEST_NOFOLLOW))) + return crun_make_error (err, 0, "`copy-symlink` is mutually exclusive with `src-nofollow` and `dest-nofollow`"); + /* If copy-symlink is provided, ignore the pre-opened file descriptor since its source was resolved. */ if (source_mountfd >= 0 && ! (extra_flags & OPTION_COPY_SYMLINK)) { - get_proc_self_fd_path (proc_buf, source_mountfd); + get_self_fd_path (proc_buf, source_mountfd); path = proc_buf; - } - if ((extra_flags & OPTION_COPY_SYMLINK) && (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_DEST_NOFOLLOW))) - return crun_make_error (err, 0, "`copy-symlink` is mutually exclusive with `src-nofollow` and `dest-nofollow`"); - - /* Do not resolve the symlink only when src-nofollow and copy-symlink are used. */ - ret = get_file_type (&src_mode, (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_COPY_SYMLINK)) ? true : false, path); + ret = get_file_type_at (source_mountfd, &src_mode, true, NULL); + } + else + ret = get_file_type (&src_mode, (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_COPY_SYMLINK)) ? true : false, path); if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "cannot stat `%s`", path); From 89a7fbb0518e00e1899c38e32ef5c2f0500ec30a Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 14:07:29 +0000 Subject: [PATCH 07/22] linux: use procfd in do_masked_or_readonly_path Use procfd-relative paths for get_bind_mount and fstatfs instead of /proc/self/fd string paths. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 44bb892135..0ed03d6ba9 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -1085,17 +1085,21 @@ do_masked_or_readonly_path (libcrun_container_t *container, const char *rel_path if (readonly) { + int procfd = get_procfd (get_private_data (container), err); proc_fd_path_t source_buffer; cleanup_close int mountfd = -1; - get_proc_self_fd_path (source_buffer, pathfd); + if (UNLIKELY (procfd < 0)) + return procfd; + + get_self_fd_path (source_buffer, pathfd); /* Try open_tree + mount_setattr to apply MS_RDONLY atomically. Only when keep_flags is false, since keep_flags needs statfs to inherit parent mount flags. */ if (! keep_flags) { - mountfd = get_bind_mount (-1, source_buffer, true, true, false, MS_PRIVATE, err); + mountfd = get_bind_mount (procfd, source_buffer, true, true, false, MS_PRIVATE, err); if (mountfd >= 0) ret = fs_move_mount_to (mountfd, pathfd, NULL); else @@ -1107,21 +1111,29 @@ do_masked_or_readonly_path (libcrun_container_t *container, const char *rel_path if (keep_flags || ret < 0) { + proc_fd_path_t abs_source; + mount_flags = MS_BIND | MS_PRIVATE | MS_RDONLY | MS_REC; if (keep_flags) { - ret = statfs (source_buffer, &sfs); + cleanup_close int sfd = openat (procfd, source_buffer, O_PATH | O_CLOEXEC); + if (UNLIKELY (sfd < 0)) + return crun_make_error (err, errno, "openat procfd for `%s`", rel_path); + + ret = fstatfs (sfd, &sfs); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "statfs `%s`", source_buffer); + return crun_make_error (err, errno, "fstatfs `%s`", rel_path); mount_flags = mount_flags | sfs.f_flags; - // Parent might contain `MS_REMOUNT` but the new readonly path is not - // actually mounted. 
Specifically in the case of `/proc` this will end - // up with EINVAL therefore remove `MS_REMOUNT` if it's getting - // inherited from the parent. + /* Parent might contain MS_REMOUNT but the new readonly path is not + actually mounted. Specifically in the case of /proc this will end + up with EINVAL therefore remove MS_REMOUNT if it is getting + inherited from the parent. */ mount_flags = mount_flags & ~MS_REMOUNT; } - ret = do_mount (container, source_buffer, pathfd, rel_path, NULL, mount_flags, NULL, + + get_proc_self_fd_path (abs_source, pathfd); + ret = do_mount (container, abs_source, pathfd, rel_path, NULL, mount_flags, NULL, LABEL_NONE, err); if (UNLIKELY (ret < 0)) return ret; From 30d88cd1152ee4851e0c412ca2961a2056c07416 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 13 May 2026 11:14:44 +0000 Subject: [PATCH 08/22] linux: use procfd in do_masked_or_readonly_path keep_flags fallback Use get_bind_mount with the cached procfd and relative self/fd path instead of formatting an absolute /proc/self/fd path and going through do_mount. Apply the extra mount flags from fstatfs via do_mount_setattr on the detached mount before moving it into place. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 0ed03d6ba9..dcd0e28014 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -1111,8 +1111,6 @@ do_masked_or_readonly_path (libcrun_container_t *container, const char *rel_path if (keep_flags || ret < 0) { - proc_fd_path_t abs_source; - mount_flags = MS_BIND | MS_PRIVATE | MS_RDONLY | MS_REC; if (keep_flags) { @@ -1132,11 +1130,29 @@ do_masked_or_readonly_path (libcrun_container_t *container, const char *rel_path mount_flags = mount_flags & ~MS_REMOUNT; } - get_proc_self_fd_path (abs_source, pathfd); - ret = do_mount (container, abs_source, pathfd, rel_path, NULL, mount_flags, NULL, - LABEL_NONE, err); - if (UNLIKELY (ret < 0)) - return ret; + close_and_reset (&mountfd); + mountfd = get_bind_mount (procfd, source_buffer, true, true, false, MS_PRIVATE, err); + if (mountfd >= 0) + { + ret = do_mount_setattr (true, rel_path, mountfd, 0, mount_flags & ~(MS_BIND | MS_PRIVATE), err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = fs_move_mount_to (mountfd, pathfd, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "move mount for readonly path `%s`", rel_path); + } + else + { + proc_fd_path_t abs_source; + + crun_error_release (err); + get_proc_self_fd_path (abs_source, pathfd); + ret = do_mount (container, abs_source, pathfd, rel_path, NULL, mount_flags, NULL, + LABEL_NONE, err); + if (UNLIKELY (ret < 0)) + return ret; + } } } else From aea44d3887f2ffe73cac1525fefaa40e6c9d8114 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 14:08:02 +0000 Subject: [PATCH 09/22] linux: use procfd in get_shared_empty_dir_cached and mount_masked_dir Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 
insertions(+), 15 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index dcd0e28014..3012604ded 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -979,6 +979,7 @@ get_shared_empty_dir_cached (libcrun_container_t *container, char **proc_fd_path struct private_data_s *private_data = get_private_data (container); cleanup_close int fd = -1; cleanup_free char *empty_dir_path = NULL; + proc_fd_path_t fd_path; int ret; /* Fast path: return cached proc fd path if already set up */ @@ -998,8 +999,8 @@ get_shared_empty_dir_cached (libcrun_container_t *container, char **proc_fd_path if (fd < 0) return crun_make_error (err, errno, "open directory `%s`", empty_dir_path); - /* Cache the /proc/self/fd path for fast mounting */ - xasprintf (&private_data->maskdir_proc_path, "/proc/self/fd/%d", fd); + get_self_fd_path (fd_path, fd); + private_data->maskdir_proc_path = xstrdup (fd_path); private_data->maskdir_fd = fd; fd = -1; /* Don't auto-close */ @@ -1012,14 +1013,16 @@ static int mount_masked_dir (libcrun_container_t *container, int pathfd, const char *rel_path, libcrun_error_t *err) { struct private_data_s *private_data = get_private_data (container); + cleanup_close int mountfd = -1; + proc_fd_path_t abs_source; char *proc_fd_path = NULL; libcrun_error_t tmp_err = NULL; + int procfd; int ret; if (private_data->maskdir_bind_failed) goto fallback_to_tmpfs; - /* Get cached /proc/self/fd path (fast after first call) */ ret = get_shared_empty_dir_cached (container, &proc_fd_path, &tmp_err); if (ret < 0) { @@ -1029,20 +1032,21 @@ mount_masked_dir (libcrun_container_t *container, int pathfd, const char *rel_pa goto fallback_to_tmpfs; } - { - cleanup_close int mountfd = -1; + procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; - mountfd = get_bind_mount (-1, proc_fd_path, false, true, false, MS_PRIVATE, &tmp_err); - if (mountfd >= 0) - { - ret = fs_move_mount_to (mountfd, pathfd, NULL); - if (LIKELY (ret 
== 0)) - return 0; - } - crun_error_release (&tmp_err); - } + mountfd = get_bind_mount (procfd, proc_fd_path, false, true, false, MS_PRIVATE, &tmp_err); + if (mountfd >= 0) + { + ret = fs_move_mount_to (mountfd, pathfd, NULL); + if (LIKELY (ret == 0)) + return 0; + } + crun_error_release (&tmp_err); - ret = do_mount (container, proc_fd_path, pathfd, rel_path, NULL, MS_BIND | MS_RDONLY, NULL, LABEL_MOUNT, &tmp_err); + get_proc_self_fd_path (abs_source, private_data->maskdir_fd); + ret = do_mount (container, abs_source, pathfd, rel_path, NULL, MS_BIND | MS_RDONLY, NULL, LABEL_MOUNT, &tmp_err); if (LIKELY (ret >= 0)) return ret; From cf41f37e0a729c52326e302edd7209d1756b1df6 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 14:09:06 +0000 Subject: [PATCH 10/22] linux: use procfd to read unified cgroup path Replace libcrun_get_cgroup_process() with direct read via procfd to avoid dependency on /proc being mounted. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/cgroup-utils.c | 74 +++++++++++++++++++++++++------------- src/libcrun/cgroup-utils.h | 2 ++ src/libcrun/linux.c | 5 ++- 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/src/libcrun/cgroup-utils.c b/src/libcrun/cgroup-utils.c index b894e24ae1..87780f4d18 100644 --- a/src/libcrun/cgroup-utils.c +++ b/src/libcrun/cgroup-utils.c @@ -210,35 +210,13 @@ libcrun_get_cgroup_mode (libcrun_error_t *err) return cgroup_mode; } -int -libcrun_get_cgroup_process (pid_t pid, char **path, bool absolute, libcrun_error_t *err) +static int +get_cgroup_process_from_content (char *content, int cgroup_mode, char **path, bool absolute, libcrun_error_t *err) { - cleanup_free char *content = NULL; - char proc_cgroup_file[64]; char *cg_path = NULL; - size_t content_size; char *controller; char *saveptr; - int cgroup_mode; bool has_data; - int ret; - - cgroup_mode = libcrun_get_cgroup_mode (err); - if (UNLIKELY (cgroup_mode < 0)) - return cgroup_mode; - - if (pid == 0) - 
strcpy (proc_cgroup_file, PROC_SELF_CGROUP); - else - { - int len = snprintf (proc_cgroup_file, sizeof (proc_cgroup_file), "/proc/%d/cgroup", pid); - if (UNLIKELY (len >= (int) sizeof (proc_cgroup_file))) - return crun_make_error (err, 0, "internal error: static buffer too small"); - } - - ret = read_all_file (proc_cgroup_file, &content, &content_size, err); - if (UNLIKELY (ret < 0)) - return ret; for (has_data = read_proc_cgroup (content, &saveptr, NULL, &controller, &cg_path); has_data; @@ -266,6 +244,54 @@ libcrun_get_cgroup_process (pid_t pid, char **path, bool absolute, libcrun_error return 0; } +int +libcrun_get_cgroup_process_at (int dirfd, char **path, bool absolute, libcrun_error_t *err) +{ + cleanup_free char *content = NULL; + size_t content_size; + int cgroup_mode; + int ret; + + cgroup_mode = libcrun_get_cgroup_mode (err); + if (UNLIKELY (cgroup_mode < 0)) + return cgroup_mode; + + ret = read_all_file_at (dirfd, SELF_CGROUP, &content, &content_size, err); + if (UNLIKELY (ret < 0)) + return ret; + + return get_cgroup_process_from_content (content, cgroup_mode, path, absolute, err); +} + +int +libcrun_get_cgroup_process (pid_t pid, char **path, bool absolute, libcrun_error_t *err) +{ + cleanup_free char *content = NULL; + char proc_cgroup_file[64]; + size_t content_size; + int cgroup_mode; + int ret; + + cgroup_mode = libcrun_get_cgroup_mode (err); + if (UNLIKELY (cgroup_mode < 0)) + return cgroup_mode; + + if (pid == 0) + strcpy (proc_cgroup_file, PROC_SELF_CGROUP); + else + { + int len = snprintf (proc_cgroup_file, sizeof (proc_cgroup_file), "/proc/%d/cgroup", pid); + if (UNLIKELY (len >= (int) sizeof (proc_cgroup_file))) + return crun_make_error (err, 0, "internal error: static buffer too small"); + } + + ret = read_all_file (proc_cgroup_file, &content, &content_size, err); + if (UNLIKELY (ret < 0)) + return ret; + + return get_cgroup_process_from_content (content, cgroup_mode, path, absolute, err); +} + static int read_pids_cgroup (int dfd, bool 
recurse, pid_t **pids, size_t *n_pids, size_t *allocated, libcrun_error_t *err) { diff --git a/src/libcrun/cgroup-utils.h b/src/libcrun/cgroup-utils.h index 1cec12472d..3ec24d295c 100644 --- a/src/libcrun/cgroup-utils.h +++ b/src/libcrun/cgroup-utils.h @@ -28,6 +28,8 @@ int libcrun_cgroups_create_symlinks (int dirfd, libcrun_error_t *err); int libcrun_get_cgroup_process (pid_t pid, char **path, bool absolute, libcrun_error_t *err); +int libcrun_get_cgroup_process_at (int dirfd, char **path, bool absolute, libcrun_error_t *err); + int libcrun_get_cgroup_mode (libcrun_error_t *err); int libcrun_get_cgroup_dirfd (struct libcrun_cgroup_status *status, const char *sub_cgroup, libcrun_error_t *err); diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 3012604ded..fcea0674c7 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -2910,9 +2910,12 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont if (cgroup_mode == CGROUP_MODE_UNIFIED) { char *unified_cgroup_path = NULL; + int procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; /* Read the cgroup path before we enter the cgroupns. */ - ret = libcrun_get_cgroup_process (0, &unified_cgroup_path, true, err); + ret = libcrun_get_cgroup_process_at (procfd, &unified_cgroup_path, true, err); if (UNLIKELY (ret < 0)) return ret; From e5067199138fd1729df7e93007f00873c668627c Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 15:11:03 +0000 Subject: [PATCH 11/22] linux: try mount_setattr in do_remount Use mount_setattr() when available before falling back to the mount() syscall. mount_setattr() is fd-based and does not need /proc/self/fd paths. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 40 ++++++++++++++++++++++++++++++++++++++-- src/libcrun/syscalls.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index fcea0674c7..81004eff0c 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -278,6 +278,32 @@ syscall_pidfd_send_signal (int pidfd, int sig, siginfo_t *info, unsigned int fla #endif } +/* Translate MS_* mount flags to MOUNT_ATTR_* values for mount_setattr(). */ +static uint64_t +ms_flags_to_mount_attr (uint64_t ms_flags) +{ + uint64_t attr = 0; + + if (ms_flags & MS_RDONLY) + attr |= MOUNT_ATTR_RDONLY; + if (ms_flags & MS_NOSUID) + attr |= MOUNT_ATTR_NOSUID; + if (ms_flags & MS_NODEV) + attr |= MOUNT_ATTR_NODEV; + if (ms_flags & MS_NOEXEC) + attr |= MOUNT_ATTR_NOEXEC; + if (ms_flags & MS_NOATIME) + attr |= MOUNT_ATTR_NOATIME; + if (ms_flags & MS_STRICTATIME) + attr |= MOUNT_ATTR_STRICTATIME; + if (ms_flags & MS_NODIRATIME) + attr |= MOUNT_ATTR_NODIRATIME; + if (ms_flags & MS_NOSYMFOLLOW) + attr |= MOUNT_ATTR_NOSYMFOLLOW; + + return attr; +} + static int do_mount_setattr (bool recursive, const char *target, int targetfd, uint64_t clear, uint64_t set, libcrun_error_t *err) { @@ -290,8 +316,11 @@ do_mount_setattr (bool recursive, const char *target, int targetfd, uint64_t cle clear &= ~MS_BIND; attr.propagation = set & ALL_PROPAGATIONS_NO_REC; - attr.attr_set = set & (~ALL_PROPAGATIONS); - attr.attr_clr = clear & (~ALL_PROPAGATIONS); + attr.attr_set = ms_flags_to_mount_attr (set & (~ALL_PROPAGATIONS)); + attr.attr_clr = ms_flags_to_mount_attr (clear & (~ALL_PROPAGATIONS)); + + if (attr.attr_set & MOUNT_ATTR__ATIME) + attr.attr_clr |= MOUNT_ATTR__ATIME; ret = syscall_mount_setattr (targetfd, "", (recursive ? 
AT_RECURSIVE : 0) | AT_EMPTY_PATH, &attr); if (UNLIKELY (ret < 0)) @@ -771,6 +800,13 @@ do_remount (int targetfd, const char *target, unsigned long flags, const char *d if (targetfd >= 0) { + unsigned long set_flags = flags & ~(MS_REMOUNT | MS_BIND); + unsigned long clear_flags = (MS_RDONLY | MS_NOSUID | MS_NODEV | MS_NOEXEC) & ~set_flags; + ret = do_mount_setattr (false, target, targetfd, clear_flags, flags & ~MS_REMOUNT, err); + if (LIKELY (ret == 0)) + return 0; + crun_error_release (err); + get_proc_self_fd_path (target_buffer, targetfd); real_target = target_buffer; } diff --git a/src/libcrun/syscalls.h b/src/libcrun/syscalls.h index 9c90ef8606..35c5368afd 100644 --- a/src/libcrun/syscalls.h +++ b/src/libcrun/syscalls.h @@ -54,10 +54,42 @@ # define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ #endif +#ifndef MOUNT_ATTR_NOSUID +# define MOUNT_ATTR_NOSUID 0x00000002 +#endif + +#ifndef MOUNT_ATTR_NODEV +# define MOUNT_ATTR_NODEV 0x00000004 +#endif + +#ifndef MOUNT_ATTR_NOEXEC +# define MOUNT_ATTR_NOEXEC 0x00000008 +#endif + +#ifndef MOUNT_ATTR__ATIME +# define MOUNT_ATTR__ATIME 0x00000070 +#endif + +#ifndef MOUNT_ATTR_NOATIME +# define MOUNT_ATTR_NOATIME 0x00000010 +#endif + +#ifndef MOUNT_ATTR_STRICTATIME +# define MOUNT_ATTR_STRICTATIME 0x00000020 +#endif + +#ifndef MOUNT_ATTR_NODIRATIME +# define MOUNT_ATTR_NODIRATIME 0x00000080 +#endif + #ifndef MOUNT_ATTR_IDMAP # define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */ #endif +#ifndef MOUNT_ATTR_NOSYMFOLLOW +# define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 +#endif + /* close_range flags */ #ifndef CLOSE_RANGE_CLOEXEC # define CLOSE_RANGE_CLOEXEC (1U << 2) From 53adb9a1c5719568c4d0ff3aecc5c2426868ac26 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 12 May 2026 07:46:18 +0000 Subject: [PATCH 12/22] linux: try mount_setattr in make_parent_mount_private Use mount_setattr() when available before falling back to the mount() syscall. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 81004eff0c..2206192aca 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -2849,10 +2849,16 @@ make_parent_mount_private (const char *rootfs, libcrun_error_t *err) /* prevent a potential infinite loop. */ while (n_slashes-- > 0) { + libcrun_error_t tmp_err = NULL; int ret; errno = 0; cleanup_close int parentfd = -1; + ret = do_mount_setattr (false, rootfs, rootfsfd, 0, MS_PRIVATE, &tmp_err); + if (ret == 0) + return 0; + crun_error_release (&tmp_err); + get_proc_self_fd_path (proc_path, rootfsfd); ret = mount (NULL, proc_path, NULL, MS_PRIVATE, NULL); if (ret == 0) From a98380bed173fed6df24932c689daef24421bb66 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 12 May 2026 07:46:40 +0000 Subject: [PATCH 13/22] linux: use fstat to detect root in make_parent_mount_private Replace the slash-counting loop bound with fstat-based root detection. The old code counted '/' characters in the path string to cap iterations; for relative paths like "rootfs" (zero slashes) the loop ran only once, which could be insufficient. Compare inode/device of each directory with its parent instead: when ".." resolves to the same inode we have reached the filesystem root. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 2206192aca..40fdc4ab4b 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -2833,25 +2833,22 @@ static int make_parent_mount_private (const char *rootfs, libcrun_error_t *err) { cleanup_close int rootfsfd = -1; + struct stat prev_st; proc_fd_path_t proc_path; - size_t n_slashes = 1; - const char *it; - - for (it = rootfs; *it; it++) - if (*it == '/') - n_slashes++; /* rootfs could be a relative path. */ rootfsfd = open (rootfs, O_PATH | O_CLOEXEC); if (UNLIKELY (rootfsfd < 0)) return crun_make_error (err, errno, "open `%s`", rootfs); - /* prevent a potential infinite loop. */ - while (n_slashes-- > 0) + if (UNLIKELY (fstat (rootfsfd, &prev_st) < 0)) + return crun_make_error (err, errno, "fstat `%s`", rootfs); + + for (;;) { + struct stat cur_st; libcrun_error_t tmp_err = NULL; int ret; - errno = 0; cleanup_close int parentfd = -1; ret = do_mount_setattr (false, rootfs, rootfsfd, 0, MS_PRIVATE, &tmp_err); @@ -2874,11 +2871,18 @@ make_parent_mount_private (const char *rootfs, libcrun_error_t *err) return crun_make_error (err, saved_errno, "make `%s` private: cannot open component", rootfs); } + if (UNLIKELY (fstat (parentfd, &cur_st) < 0)) + return crun_make_error (err, errno, "fstat parent of `%s`", rootfs); + + /* Reached the root of the filesystem: ".." points to itself. */ + if (cur_st.st_dev == prev_st.st_dev && cur_st.st_ino == prev_st.st_ino) + break; + + prev_st = cur_st; close_and_reset (&rootfsfd); rootfsfd = get_and_reset (&parentfd); } - /* should never get this far. 
*/ return crun_make_error (err, 0, "make `%s` private", rootfs); } From 9259e891acd25e49ae96cce8b595eb1a46be73e7 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 5 May 2026 15:20:10 +0000 Subject: [PATCH 14/22] linux: use new mount API in do_mount when available Try fsopen_mount()+fs_move_mount_to() for filesystem mounts and get_bind_mount()+fs_move_mount_to() for bind mounts before falling back to mount(). For propagation changes, try mount_setattr() first. This removes the dependency on /proc/self/fd paths for the initial mount and propagation steps when the new mount API is available. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 205 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 187 insertions(+), 18 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 40fdc4ab4b..c04852e78f 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -143,6 +143,7 @@ struct private_data_s int rootfsfd; int notify_socket_tree_fd; + int old_root_fd; struct libcrun_fd_map *mount_fds; struct libcrun_fd_map *dev_fds; @@ -176,6 +177,8 @@ cleanup_private_data (void *private_data) TEMP_FAILURE_RETRY (close (p->procfd)); if (p->maskdir_fd >= 0) TEMP_FAILURE_RETRY (close (p->maskdir_fd)); + if (p->old_root_fd >= 0) + TEMP_FAILURE_RETRY (close (p->old_root_fd)); if (p->mount_fds) cleanup_close_mapp (&(p->mount_fds)); if (p->dev_fds) @@ -199,6 +202,7 @@ get_private_data (struct libcrun_container_s *container) p->rootfsfd = -1; p->procfd = -1; p->notify_socket_tree_fd = -1; + p->old_root_fd = -1; p->maskdir_fd = -1; container->cleanup_private_data = cleanup_private_data; } @@ -750,7 +754,26 @@ get_mount_flags_or_option (const char *name, int current_flags, unsigned long *e __attribute__ ((unused)) cleanup_free char *prev = NULL; unsigned long flags = get_mount_flags (name, current_flags, &found, extra_flags, rec_clear, rec_set); if (found) - return flags; + { + /* MS_SYNCHRONOUS and MS_DIRSYNC 
cannot be set through fsmount() + attr_flags. Pass them also as data so that fsconfig(SET_FLAG) + can set SB_SYNCHRONOUS / SB_DIRSYNC on the superblock. */ + const char *data_flag = NULL; + if ((flags & MS_SYNCHRONOUS) && ! (current_flags & MS_SYNCHRONOUS)) + data_flag = "sync"; + else if ((flags & MS_DIRSYNC) && ! (current_flags & MS_DIRSYNC)) + data_flag = "dirsync"; + + if (data_flag) + { + prev = *option; + if (*option && **option) + xasprintf (option, "%s,%s", *option, data_flag); + else + *option = xstrdup (data_flag); + } + return flags; + } prev = *option; if (*option && **option) @@ -903,7 +926,7 @@ open_mount_target (libcrun_container_t *container, const char *target_rel, libcr /* Attempt to open a mount of the specified type. */ static int -fsopen_mount (const char *type, const char *labeltype, const char *label) +fsopen_mount (const char *type, const char *source_name, const char *labeltype, const char *label, const char *data) { #ifdef HAVE_NEW_MOUNT_API cleanup_close int fsfd = -1; @@ -913,6 +936,11 @@ fsopen_mount (const char *type, const char *labeltype, const char *label) if (UNLIKELY (fsfd < 0)) return fsfd; + /* Best-effort: pseudo file systems (tmpfs, proc, ...) ignore "source", + so a failure here is expected and harmless. Real failures surface + at FSCONFIG_CMD_CREATE below. */ + (void) syscall_fsconfig (fsfd, FSCONFIG_SET_STRING, "source", source_name ? 
source_name : type, 0); + if (labeltype) { ret = syscall_fsconfig (fsfd, FSCONFIG_SET_STRING, labeltype, label, 0); @@ -920,6 +948,28 @@ fsopen_mount (const char *type, const char *labeltype, const char *label) return ret; } + if (data && data[0]) + { + cleanup_free char *data_copy = xstrdup (data); + char *saveptr = NULL; + char *token; + + for (token = strtok_r (data_copy, ",", &saveptr); token; token = strtok_r (NULL, ",", &saveptr)) + { + char *eq = strchr (token, '='); + if (eq) + { + *eq = '\0'; + ret = syscall_fsconfig (fsfd, FSCONFIG_SET_STRING, token, eq + 1, 0); + } + else + ret = syscall_fsconfig (fsfd, FSCONFIG_SET_FLAG, token, NULL, 0); + + if (UNLIKELY (ret < 0)) + return ret; + } + } + ret = syscall_fsconfig (fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); if (UNLIKELY (ret < 0)) return ret; @@ -927,6 +977,8 @@ fsopen_mount (const char *type, const char *labeltype, const char *label) return syscall_fsmount (fsfd, FSMOUNT_CLOEXEC, 0); #else (void) type; + (void) source_name; + (void) data; errno = ENOSYS; return -1; #endif @@ -940,7 +992,7 @@ get_procfd (struct private_data_s *data, libcrun_error_t *err) if (data->procfd >= 0) return data->procfd; - fd = fsopen_mount ("proc", NULL, NULL); + fd = fsopen_mount ("proc", NULL, NULL, NULL, NULL); if (fd < 0) fd = open ("/proc", O_DIRECTORY | O_CLOEXEC); if (UNLIKELY (fd < 0)) @@ -1268,6 +1320,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, int label_how, libcrun_error_t *err) { cleanup_free char *data_with_label = NULL; + const char *context_type = NULL; cleanup_close int ms_move_fd = -1; const char *real_target = target; bool single_instance = false; @@ -1293,7 +1346,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, if (label_how == LABEL_MOUNT) { - const char *context_type = get_selinux_context_type (container, err); + context_type = get_selinux_context_type (container, err); if (UNLIKELY (context_type == NULL)) return -1; @@ -1324,7 +1377,61 @@ 
do_mount (libcrun_container_t *container, const char *source, int targetfd, { unsigned long flags = mountflags & ~(ALL_PROPAGATIONS_NO_REC | MS_RDONLY); - ret = mount (source, real_target, fstype, flags, data); + ret = -1; + + if (targetfd >= 0) + { + if (mountflags & MS_BIND) + { + cleanup_close int bindfd = -1; + libcrun_error_t tmp_err = NULL; + + bindfd = get_bind_mount (AT_FDCWD, source, + (mountflags & MS_REC) != 0, + false, false, 0, &tmp_err); + if (bindfd < 0) + { + int old_root_fd = get_private_data (container)->old_root_fd; + + if (old_root_fd >= 0 && source && source[0] == '/') + { + crun_error_release (&tmp_err); + bindfd = get_bind_mount (old_root_fd, (source[1] == '\0') ? "." : source + 1, + (mountflags & MS_REC) != 0, + false, false, 0, &tmp_err); + } + } + if (bindfd >= 0) + ret = fs_move_mount_to (bindfd, targetfd, NULL); + if (ret < 0) + crun_error_release (&tmp_err); + } + else + { + cleanup_close int newfs = fsopen_mount (fstype, source, context_type, label, data); + if (newfs >= 0) + ret = fs_move_mount_to (newfs, targetfd, NULL); + } + } + + if (ret < 0) + { + ret = mount (source, real_target, fstype, flags, data); + if (ret < 0 && (mountflags & MS_BIND) && source && source[0] == '/') + { + int old_root_fd = get_private_data (container)->old_root_fd; + + if (old_root_fd >= 0) + { + cleanup_free char *proc_source_buf = NULL; + proc_fd_path_t old_root_path; + + get_proc_self_fd_path (old_root_path, old_root_fd); + xasprintf (&proc_source_buf, "%s/%s", old_root_path, source + 1); + ret = mount (proc_source_buf, real_target, fstype, flags, data); + } + } + } if (UNLIKELY (ret < 0)) { int saved_errno = errno; @@ -1340,17 +1447,44 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, if (ret > 0) { cleanup_close int mountfd = -1; + int sys_old_root_fd = get_private_data (container)->old_root_fd; - if (! 
has_mount_for (container, "/sys/fs/cgroup")) + if (sys_old_root_fd >= 0) + mountfd = get_bind_mount (sys_old_root_fd, "sys", true, false, false, MS_PRIVATE, err); + if (mountfd < 0) { - ret = mount ("/sys", real_target, NULL, MS_BIND | MS_REC, NULL); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "bind mount `/sys` from the host"); + crun_error_release (err); + mountfd = get_bind_mount (AT_FDCWD, "/sys", true, false, false, MS_PRIVATE, err); + } + if (! has_mount_for (container, "/sys/fs/cgroup")) + { + if (mountfd >= 0) + { + ret = fs_move_mount_to (mountfd, targetfd, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "move mount `/sys` to `%s`", real_target); + } + else + { + crun_error_release (err); + if (sys_old_root_fd >= 0) + { + cleanup_free char *sys_path = NULL; + proc_fd_path_t sys_root_path; + + get_proc_self_fd_path (sys_root_path, sys_old_root_fd); + xasprintf (&sys_path, "%s/sys", sys_root_path); + ret = mount (sys_path, real_target, NULL, MS_BIND | MS_REC, NULL); + } + else + ret = mount ("/sys", real_target, NULL, MS_BIND | MS_REC, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "bind mount `/sys` from the host"); + } return do_masked_or_readonly_path (container, "/sys/fs/cgroup", false, false, err); } - mountfd = get_bind_mount (-1, "/sys", true, true, false, MS_PRIVATE, err); if (UNLIKELY (mountfd < 0)) return mountfd; @@ -1377,7 +1511,27 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, /* We are replacing the rootfs, reopen it. 
*/ if (is_empty_string (target)) { - int tmp = dup (fd); + int procfd = get_procfd (get_private_data (container), err); + int tmp; + if (UNLIKELY (procfd < 0)) + return procfd; + + { + cleanup_close int nsfd = openat (procfd, "self/ns/mnt", O_RDONLY | O_CLOEXEC); + if (UNLIKELY (nsfd < 0)) + return crun_make_error (err, errno, "open `/proc/self/ns/mnt`"); + + ret = setns (nsfd, CLONE_NEWNS); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "setns `CLONE_NEWNS`"); + } + + close_and_reset (&fd); + fd = open (get_private_data (container)->rootfs, O_PATH | O_CLOEXEC); + if (UNLIKELY (fd < 0)) + return crun_make_error (err, errno, "reopen rootfs after mount on /"); + + tmp = dup (fd); if (UNLIKELY (tmp < 0)) return crun_make_error (err, errno, "dup"); @@ -1410,9 +1564,24 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, if (mountflags & ALL_PROPAGATIONS_NO_REC) { - ret = mount (NULL, real_target, NULL, mountflags & ALL_PROPAGATIONS, NULL); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "set propagation for `%s`", target); + bool propagation_done = false; + + if (targetfd >= 0) + { + libcrun_error_t tmp_err = NULL; + ret = do_mount_setattr (false, target, targetfd, 0, mountflags & ALL_PROPAGATIONS, &tmp_err); + if (LIKELY (ret == 0)) + propagation_done = true; + else + crun_error_release (&tmp_err); + } + + if (! propagation_done) + { + ret = mount (NULL, real_target, NULL, mountflags & ALL_PROPAGATIONS, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "set propagation for `%s`", target); + } } if (mountflags & (MS_BIND | MS_RDONLY)) @@ -1430,7 +1599,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, if ((remount_flags & MS_RDONLY) == 0) { - ret = do_remount (fd, real_target, remount_flags, data, err); + ret = do_remount (fd >= 0 ? 
fd : targetfd, real_target, remount_flags, data, err); if (UNLIKELY (ret < 0)) return ret; } @@ -4812,7 +4981,7 @@ prepare_and_send_dev_mounts (libcrun_container_t *container, int sync_socket_hos } } - devs_mountfd = fsopen_mount ("tmpfs", context_type, label); + devs_mountfd = fsopen_mount ("tmpfs", NULL, context_type, label, NULL); if (UNLIKELY (devs_mountfd < 0)) { ret = crun_make_error (err, errno, "fsopen_mount `tmpfs`"); @@ -5053,9 +5222,9 @@ init_container (libcrun_container_t *container, int sync_socket_container, struc An error will be generated later if it is not possible to join the namespace. */ if (init_status->join_pidns && strcmp (def->mounts[i]->type, "proc") == 0) - fd = fsopen_mount (def->mounts[i]->type, NULL, NULL); + fd = fsopen_mount (def->mounts[i]->type, def->mounts[i]->source, NULL, NULL, NULL); if (init_status->join_ipcns && strcmp (def->mounts[i]->type, "mqueue") == 0) - fd = fsopen_mount (def->mounts[i]->type, NULL, NULL); + fd = fsopen_mount (def->mounts[i]->type, def->mounts[i]->source, NULL, NULL, NULL); if (fd >= 0) { From 5672105871691bc1904c1c4b9b5afb93a893f950 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 May 2026 08:56:57 +0000 Subject: [PATCH 15/22] linux: pre-open needed devices in parent for userns containers In user namespace containers, devices under /dev are created via bind mounts from the host. Pre-open the 6 standard devices (/dev/null, /dev/zero, /dev/full, /dev/tty, /dev/random, /dev/urandom) using open_tree(OPEN_TREE_CLONE) in the parent process after clone(), and send them to the child via the existing sync socket fd-passing mechanism. This reuses the same send_mounts/receive_mounts pattern already used for custom devices (dev_fds) and bind mount sources (mount_fds). This is a preparatory change for OPEN_TREE_NAMESPACE support, where host paths are not accessible after setns() into the new mount namespace. When open_tree() is not available or fails (e.g. 
rootless without CAP_SYS_ADMIN), the fds remain -1 and the child falls back to the existing bind mount path. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 65 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index c04852e78f..69d9058268 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -147,6 +147,7 @@ struct private_data_s struct libcrun_fd_map *mount_fds; struct libcrun_fd_map *dev_fds; + struct libcrun_fd_map *needed_devs_fds; /* Used to save stdin, stdout, stderr during checkpointing to descriptors.json * and needed during restore. */ @@ -183,6 +184,8 @@ cleanup_private_data (void *private_data) cleanup_close_mapp (&(p->mount_fds)); if (p->dev_fds) cleanup_close_mapp (&(p->dev_fds)); + if (p->needed_devs_fds) + cleanup_close_mapp (&(p->needed_devs_fds)); free (p->unified_cgroup_path); free (p->host_notify_socket_path); @@ -2112,10 +2115,14 @@ create_missing_devs (libcrun_container_t *container, bool binds, libcrun_error_t runtime_spec_schema_config_schema *def = container->container_def; const char *rootfs = get_private_data (container)->rootfs; cleanup_close_map struct libcrun_fd_map *dev_fds = NULL; + cleanup_close_map struct libcrun_fd_map *needed_fds = NULL; dev_fds = get_private_data (container)->dev_fds; get_private_data (container)->dev_fds = NULL; + needed_fds = get_private_data (container)->needed_devs_fds; + get_private_data (container)->needed_devs_fds = NULL; + if (! def || ! def->linux) return 0; @@ -2142,12 +2149,16 @@ create_missing_devs (libcrun_container_t *container, bool binds, libcrun_error_t return ret; } - for (it = needed_devs; it->path; it++) + for (it = needed_devs, i = 0; it->path; it++, i++) { + cleanup_close int srcfd = (needed_fds && i < needed_fds->nfds) ? needed_fds->fds[i] : -1; + /* make sure the parent directory exists only on the first iteration. 
*/ - ret = libcrun_create_dev (container, devfd, -1, it, binds, it == needed_devs, err); + ret = libcrun_create_dev (container, devfd, srcfd, it, binds, it == needed_devs, err); if (UNLIKELY (ret < 0)) return ret; + if (srcfd >= 0) + needed_fds->fds[i] = -1; } for (i = 0; symlinks[i].target; i++) @@ -4582,6 +4593,21 @@ get_devices_fd_map (libcrun_container_t *container) return dev_fds; } +#define NUM_NEEDED_DEVS (sizeof (needed_devs) / sizeof (needed_devs[0]) - 1) + +static struct libcrun_fd_map * +get_needed_devs_fd_map (libcrun_container_t *container) +{ + struct libcrun_fd_map *fds = get_private_data (container)->needed_devs_fds; + + if (fds == NULL) + { + fds = make_libcrun_fd_map (NUM_NEEDED_DEVS); + get_private_data (container)->needed_devs_fds = fds; + } + return fds; +} + static struct libcrun_fd_map * get_fd_map (libcrun_container_t *container) { @@ -5041,6 +5067,33 @@ prepare_and_send_dev_mounts (libcrun_container_t *container, int sync_socket_hos return ret; } +static int +prepare_and_send_needed_dev_mounts (libcrun_container_t *container, int sync_socket_host, libcrun_error_t *err) +{ + cleanup_close_map struct libcrun_fd_map *fds = NULL; + bool has_userns = (get_private_data (container)->unshare_flags & CLONE_NEWUSER) ? true : false; + size_t how_many = 0; + size_t i; + + fds = make_libcrun_fd_map (NUM_NEEDED_DEVS); + + if (! 
has_userns || geteuid () > 0) + return send_mounts (sync_socket_host, fds, 0, NUM_NEEDED_DEVS, err); + + for (i = 0; i < NUM_NEEDED_DEVS && needed_devs[i].path; i++) + { + int fd = syscall_open_tree (AT_FDCWD, needed_devs[i].path, + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + if (fd >= 0) + { + fds->fds[i] = fd; + how_many++; + } + } + + return send_mounts (sync_socket_host, fds, how_many, NUM_NEEDED_DEVS, err); +} + static int prepare_and_send_mounts (libcrun_container_t *container, pid_t pid, int sync_socket_host, libcrun_error_t *err) { @@ -5058,6 +5111,10 @@ prepare_and_send_mounts (libcrun_container_t *container, pid_t pid, int sync_soc if (UNLIKELY (ret < 0)) return ret; + ret = prepare_and_send_needed_dev_mounts (container, sync_socket_host, err); + if (UNLIKELY (ret < 0)) + return ret; + return 0; } @@ -5361,6 +5418,10 @@ init_container (libcrun_container_t *container, int sync_socket_container, struc if (UNLIKELY (ret < 0)) return ret; + ret = receive_mounts (get_needed_devs_fd_map (container), sync_socket_container, err); + if (UNLIKELY (ret < 0)) + return ret; + ret = libcrun_container_setgroups (container, container->container_def->process, err); if (UNLIKELY (ret < 0)) return ret; From f67b00ce318984f8ab0f387233bd8aa567a837e1 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 May 2026 09:37:13 +0000 Subject: [PATCH 16/22] linux: use fchmodat/fchownat in libcrun_create_dev Replace the openat(procfd) + fchmod/fchown approach with direct fchmodat/fchownat calls. These operate on the directory entry without opening the device file, avoiding ENXIO errors on character devices like /dev/tty that fail to open when no controlling terminal exists. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 75 +++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 47 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 69d9058268..f075d410b0 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -1918,6 +1918,28 @@ relative_path_under_dev (const char *path) return NULL; } +static int +mknod_and_set_attrs (int dirfd, const char *name, struct device_s *device, mode_t type, dev_t dev, libcrun_error_t *err) +{ + int ret; + + ret = mknodat (dirfd, name, device->mode | type, dev); + if (UNLIKELY (ret < 0 && errno == EEXIST)) + return 0; + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "mknodat `%s`", device->path); + + ret = fchmodat (dirfd, name, device->mode, 0); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "fchmod `%s`", device->path); + + ret = fchownat (dirfd, name, device->uid, device->gid, AT_SYMLINK_NOFOLLOW); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "fchown `%s`", device->path); + + return 1; +} + int libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, struct device_s *device, bool binds, bool ensure_parent_dir, @@ -1927,7 +1949,6 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, dev_t dev; mode_t type = (device->type[0] == 'b') ? S_IFBLK : ((device->type[0] == 'p') ? 
S_IFIFO : S_IFCHR); const char *fullname = device->path; - cleanup_close int fd = -1; const char *rootfs = get_private_data (container)->rootfs; if (is_empty_string (fullname)) return crun_make_error (err, EINVAL, "device path is empty"); @@ -1997,11 +2018,6 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, } else { - proc_fd_path_t fd_buffer; - int procfd = get_procfd (get_private_data (container), err); - if (UNLIKELY (procfd < 0)) - return procfd; - dev = makedev (device->major, device->minor); /* Check whether the path is directly under /dev. Since we already have an open fd to /dev and mknodat(2) @@ -2010,26 +2026,9 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, */ if (rel_dev) { - ret = mknodat (devfd, rel_dev, device->mode | type, dev); - /* We don't fail when the file already exists. */ - if (UNLIKELY (ret < 0 && errno == EEXIST)) - return 0; - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "mknodat `%s`", device->path); - - fd = safe_openat (devfd, rootfs, rel_dev, O_PATH | O_CLOEXEC, 0, err); - if (UNLIKELY (fd < 0)) - return fd; - - get_self_fd_path (fd_buffer, fd); - - ret = fchmodat (procfd, fd_buffer, device->mode, 0); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "fchmodat `%s`", device->path); - - ret = fchownat (procfd, fd_buffer, device->uid, device->gid, 0); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "fchownat `%s`", device->path); + ret = mknod_and_set_attrs (devfd, rel_dev, device, type, dev, err); + if (ret <= 0) + return ret; } else { @@ -2064,27 +2063,9 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, return dirfd; } - ret = mknodat (dirfd, basename, device->mode | type, dev); - - /* We don't fail when the file already exists. 
*/ - if (UNLIKELY (ret < 0 && errno == EEXIST)) - return 0; - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "mknodat `%s`", device->path); - - fd = safe_openat (dirfd, rootfs, basename, O_PATH | O_CLOEXEC, 0, err); - if (UNLIKELY (fd < 0)) - return crun_error_wrap (err, "openat `%s`", device->path); - - get_self_fd_path (fd_buffer, fd); - - ret = fchmodat (procfd, fd_buffer, device->mode, 0); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "fchmodat `%s`", device->path); - - ret = fchownat (procfd, fd_buffer, device->uid, device->gid, 0); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "fchownat `%s`", device->path); + ret = mknod_and_set_attrs (dirfd, basename, device, type, dev, err); + if (ret <= 0) + return ret; } } return 0; From 0450679d8b2112b53848945fd93414339cbd4ffe Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Fri, 8 May 2026 14:29:52 +0200 Subject: [PATCH 17/22] linux: change signature for open_mount_of_type and rename it Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index f075d410b0..ceef8caf56 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -4664,23 +4664,23 @@ get_idmapped_option (runtime_spec_schema_defs_mount *mnt, bool *recursive) } static int -open_mount_of_type (runtime_spec_schema_defs_mount *mnt, int *out_fd, libcrun_error_t *err) +open_mount_type (const char *type, int *out_fd, libcrun_error_t *err) { cleanup_close int fsopen_fd = -1; cleanup_close int newfs_fd = -1; int ret; - fsopen_fd = syscall_fsopen (mnt->type, FSOPEN_CLOEXEC); + fsopen_fd = syscall_fsopen (type, FSOPEN_CLOEXEC); if (UNLIKELY (fsopen_fd < 0)) - return crun_make_error (err, errno, "fsopen `%s`", mnt->type); + return crun_make_error (err, errno, "fsopen `%s`", type); ret = syscall_fsconfig (fsopen_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); if (UNLIKELY (ret < 0)) - return 
crun_make_error (err, errno, "fsconfig create `%s`", mnt->type); + return crun_make_error (err, errno, "fsconfig create `%s`", type); newfs_fd = syscall_fsmount (fsopen_fd, FSMOUNT_CLOEXEC, 0); if (UNLIKELY (newfs_fd < 0)) - return crun_make_error (err, errno, "fsmount `%s`", mnt->type); + return crun_make_error (err, errno, "fsmount `%s`", type); *out_fd = get_and_reset (&newfs_fd); return 0; @@ -4748,7 +4748,7 @@ maybe_get_idmapped_mount (libcrun_container_t *container, runtime_spec_schema_co } else { - ret = open_mount_of_type (mnt, &newfs_fd, err); + ret = open_mount_type (mnt->type, &newfs_fd, err); if (UNLIKELY (ret < 0)) return ret; } @@ -6785,7 +6785,7 @@ libcrun_make_runtime_mounts (libcrun_container_t *container, libcrun_container_s } else { - ret = open_mount_of_type (mounts[i], &(fds->fds[i]), err); + ret = open_mount_type (mounts[i]->type, &(fds->fds[i]), err); if (UNLIKELY (ret < 0)) return ret; } From c50e946ab20d55c8a40c3066b0f023c7db71f834 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 13 May 2026 14:43:12 +0000 Subject: [PATCH 18/22] tests: check directory type instead of nlink for masked paths The masked-dir-nlink test checked that st_nlink >= 2 to verify a masked path is a directory. This fails on btrfs where directories have st_nlink=1 (btrfs does not count . and .. in nlink). Add an 'isdir' command to the test init binary and use S_ISDIR to verify the masked path is a directory, which works regardless of filesystem type. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 1 - tests/init.c | 26 +++++++++++ tests/test_mounts.py | 105 ++++++++++++++++++++++++++++++++++++++++++- tests/test_paths.py | 28 +++++++++++- 4 files changed, 157 insertions(+), 3 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index ceef8caf56..f6519016a6 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -4908,7 +4908,6 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy if (UNLIKELY (mount_fd < 0)) crun_error_release (err); } - if (mount_fd >= 0) how_many++; diff --git a/tests/init.c b/tests/init.c index 4eb85b8dbe..a00017d552 100644 --- a/tests/init.c +++ b/tests/init.c @@ -679,6 +679,32 @@ main (int argc, char **argv) return 0; } + if (strcmp (argv[1], "nlink") == 0) + { + struct stat st; + + if (argc < 3) + error (EXIT_FAILURE, 0, "'nlink' requires a path argument"); + if (stat (argv[2], &st) < 0) + error (EXIT_FAILURE, errno, "stat %s", argv[2]); + + printf ("%lu\n", (unsigned long) st.st_nlink); + return 0; + } + + if (strcmp (argv[1], "isdir") == 0) + { + struct stat st; + + if (argc < 3) + error (EXIT_FAILURE, 0, "'isdir' requires a path argument"); + if (stat (argv[2], &st) < 0) + error (EXIT_FAILURE, errno, "stat %s", argv[2]); + + printf ("%d\n", S_ISDIR (st.st_mode) ? 
1 : 0); + return 0; + } + if (strcmp (argv[1], "id") == 0) { int ret; diff --git a/tests/test_mounts.py b/tests/test_mounts.py index 2b08b6e28d..d44ce2a02a 100755 --- a/tests/test_mounts.py +++ b/tests/test_mounts.py @@ -156,7 +156,6 @@ def test_mount_tmpfs_to_rootfs(): conf = base_config() conf['process']['args'] = ['/init', 'true'] add_all_namespaces(conf) - tmpdir = tempfile.mkdtemp() mounts = [ {"destination": "/", "type": "tmpfs", "source": "tmpfs", "options": ["tmpcopyup"]}, @@ -1066,6 +1065,107 @@ def test_mount_propagation_slave(): logger.info("mountinfo output: %s", out) return -1 +def test_mount_tmpfs_size(): + """Verify tmpfs mounts with size= data option work via fsopen_mount.""" + conf = base_config() + conf['process']['args'] = ['/init', 'cat', '/proc/self/mountinfo'] + add_all_namespaces(conf) + conf['mounts'].append({ + "destination": "/sized-tmpfs", + "type": "tmpfs", + "source": "tmpfs", + "options": ["nosuid", "nodev", "size=64k"], + }) + out, _ = run_and_get_output(conf, hide_stderr=True) + with tempfile.NamedTemporaryFile(mode='w', delete=True) as f: + f.write(out) + f.flush() + t = libmount.Table(f.name) + m = t.find_target('/sized-tmpfs') + if m is None: + logger.info("tmpfs size test: /sized-tmpfs not found in mountinfo") + return -1 + if m.fs_options is None or 'size=64k' not in m.fs_options: + logger.info("tmpfs size test: expected 'size=64k' in fs_options, got: %s", m.fs_options) + return -1 + return 0 + +def test_mount_tmpfs_size_userns(): + """Verify tmpfs with data options works inside a user namespace. + + This exercises the fsopen_mount pre-mount path: before pivot_root, + crun uses fsopen/fsconfig/fsmount to create a detached tmpfs with + the specified data, then moves it into place via move_mount. 
+ """ + conf = base_config() + conf['process']['args'] = ['/init', 'cat', '/proc/self/mountinfo'] + add_all_namespaces(conf, userns=True) + conf['mounts'].append({ + "destination": "/sized-tmpfs", + "type": "tmpfs", + "source": "tmpfs", + "options": ["nosuid", "nodev", "mode=1777", "size=128k"], + }) + out, _ = run_and_get_output(conf, hide_stderr=True) + with tempfile.NamedTemporaryFile(mode='w', delete=True) as f: + f.write(out) + f.flush() + t = libmount.Table(f.name) + m = t.find_target('/sized-tmpfs') + if m is None: + logger.info("tmpfs userns test: /sized-tmpfs not found in mountinfo") + return -1 + if m.fs_options is None or 'size=128k' not in m.fs_options: + logger.info("tmpfs userns test: expected 'size=128k' in fs_options, got: %s", m.fs_options) + return -1 + return 0 + +def test_mount_overlay_fs(): + """Verify overlay mounts with lowerdir/upperdir/workdir work via fsopen_mount. + + Creates overlay directories on the host and mounts them into the + container. The premount loop in setup_mount_namespace uses + fsopen/fsconfig/fsmount to create a detached overlay before + pivot_root (while host paths are still reachable), then moves + the mount into place. 
+ """ + if is_rootless(): + return (77, "requires root privileges") + + overlay_base = tempfile.mkdtemp(prefix="crun-overlay-test-") + lower = os.path.join(overlay_base, "lower") + upper = os.path.join(overlay_base, "upper") + work = os.path.join(overlay_base, "work") + try: + for d in [lower, upper, work]: + os.makedirs(d, exist_ok=True) + with open(os.path.join(lower, "from-lower"), "w") as f: + f.write("lower-data") + + def prepare(rootfs): + os.makedirs(os.path.join(rootfs, "overlay-mnt"), exist_ok=True) + + conf = base_config() + conf['process']['args'] = ['/init', 'cat', '/overlay-mnt/from-lower'] + add_all_namespaces(conf) + conf['mounts'].append({ + "destination": "/overlay-mnt", + "type": "overlay", + "source": "overlay", + "options": [ + "lowerdir=%s" % lower, + "upperdir=%s" % upper, + "workdir=%s" % work, + ], + }) + out, _ = run_and_get_output(conf, hide_stderr=True, callback_prepare_rootfs=prepare) + if "lower-data" not in out: + logger.info("overlay test: expected 'lower-data', got: %s", out) + return -1 + finally: + shutil.rmtree(overlay_base, ignore_errors=True) + return 0 + all_tests = { "mount-ro" : test_mount_ro, "mount-rro" : test_mount_rro, @@ -1106,6 +1206,9 @@ def test_mount_propagation_slave(): "mount-propagation-private": test_mount_propagation_private, "mount-no-leak-to-host": test_mount_no_leak_to_host, "mount-propagation-slave": test_mount_propagation_slave, + "mount-tmpfs-size": test_mount_tmpfs_size, + "mount-tmpfs-size-userns": test_mount_tmpfs_size_userns, + "mount-overlay-fs": test_mount_overlay_fs, } if __name__ == "__main__": diff --git a/tests/test_paths.py b/tests/test_paths.py index 1edc2b04dc..7a6e4fdcf5 100755 --- a/tests/test_paths.py +++ b/tests/test_paths.py @@ -15,6 +15,7 @@ # You should have received a copy of the GNU General Public License # along with crun. If not, see . 
+import os from tests_utils import * def test_readonly_paths(): @@ -39,10 +40,35 @@ def test_masked_paths(): if len(out) > 0: return -1 return 0 - + +def test_masked_paths_no_run_crun(): + def prepare(rootfs): + os.makedirs(os.path.join(rootfs, "run"), exist_ok=True) + conf = base_config() + conf['process']['args'] = ['/init', 'ls', '/run'] + conf['linux']['maskedPaths'] = ['/proc/acpi', '/proc/kcore'] + add_all_namespaces(conf) + out, _ = run_and_get_output(conf, hide_stderr=True, callback_prepare_rootfs=prepare) + if 'crun' in out: + return -1 + return 0 + +def test_masked_dir_nlink(): + conf = base_config() + conf['process']['args'] = ['/init', 'isdir', '/proc/acpi'] + conf['linux']['maskedPaths'] = ['/proc/acpi'] + add_all_namespaces(conf) + out, _ = run_and_get_output(conf, hide_stderr=True) + is_dir = int(out.strip()) + if is_dir != 1: + return -1 + return 0 + all_tests = { "readonly-paths" : test_readonly_paths, "masked-paths" : test_masked_paths, + "masked-paths-no-run-crun" : test_masked_paths_no_run_crun, + "masked-dir-nlink" : test_masked_dir_nlink, } if __name__ == "__main__": From 2da2413abba0293ab51cf33b525562b63b84d3e8 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Mon, 25 May 2026 14:29:03 +0000 Subject: [PATCH 19/22] tests: add procless container tests Verify that containers can run without /proc, /sys, or cgroup mounts in the OCI spec. This exercises the procfd-based code paths that no longer depend on /proc/self/fd. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- tests/test_mounts.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_mounts.py b/tests/test_mounts.py index d44ce2a02a..3d992b2c14 100755 --- a/tests/test_mounts.py +++ b/tests/test_mounts.py @@ -1166,6 +1166,25 @@ def prepare(rootfs): shutil.rmtree(overlay_base, ignore_errors=True) return 0 +def test_no_proc(): + """Container without /proc in the OCI spec should still work.""" + conf = base_config() + conf['process']['args'] = ['/init', 'true'] + add_all_namespaces(conf) + conf['mounts'] = [m for m in conf['mounts'] if m.get('destination') != '/proc'] + run_and_get_output(conf, hide_stderr=True) + return 0 + +def test_no_proc_sysfs_cgroup(): + """Container without /proc, /sys, or cgroup mounts should still work.""" + conf = base_config() + conf['process']['args'] = ['/init', 'true'] + add_all_namespaces(conf) + skip = {'/proc', '/sys', '/sys/fs/cgroup'} + conf['mounts'] = [m for m in conf['mounts'] if m.get('destination') not in skip] + run_and_get_output(conf, hide_stderr=True) + return 0 + all_tests = { "mount-ro" : test_mount_ro, "mount-rro" : test_mount_rro, @@ -1209,6 +1228,8 @@ def prepare(rootfs): "mount-tmpfs-size": test_mount_tmpfs_size, "mount-tmpfs-size-userns": test_mount_tmpfs_size_userns, "mount-overlay-fs": test_mount_overlay_fs, + "mount-no-proc": test_no_proc, + "mount-no-proc-sysfs-cgroup": test_no_proc_sysfs_cgroup, } if __name__ == "__main__": From 5506b6e492174629cdbebfa50f42d1b21c29f2d9 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Mon, 11 May 2026 15:32:02 +0000 Subject: [PATCH 20/22] linux: move pivot_root before container mounts Use the new mount API (fsopen/fsconfig/fsmount) to create detached mounts for each non-bind filesystem type before pivot_root. After the pivot, place them via move_mount so the kernel mnt_already_visible check is satisfied. set_mounts then mounts fresh instances on top with the correct OCI flags. 
This fixes mounting proc/sysfs/cgroup in containers that use a user namespace, where the kernel denies these mounts unless a mount of the same type is already visible. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/container.c | 21 +- src/libcrun/linux.c | 597 +++++++++++++++++++++++++++++++--------- src/libcrun/linux.h | 2 +- src/libcrun/utils.c | 14 +- tests/test_mounts.py | 37 ++- 5 files changed, 519 insertions(+), 152 deletions(-) diff --git a/src/libcrun/container.c b/src/libcrun/container.c index 5fd045f46f..9c1f9f96ae 100644 --- a/src/libcrun/container.c +++ b/src/libcrun/container.c @@ -1339,11 +1339,6 @@ container_init_setup (void *args, pid_t own_pid, char *notify_socket, if (UNLIKELY (ret < 0)) return ret; - /* sync 2 and 3 are sent as part of libcrun_set_mounts. */ - ret = libcrun_set_mounts (entrypoint_args, container, rootfs, send_sync_cb, &sync_socket, err); - if (UNLIKELY (ret < 0)) - return ret; - if (def->hooks && def->hooks->create_container_len) { libcrun_error_t tmp_err = NULL; @@ -1357,6 +1352,15 @@ container_init_setup (void *args, pid_t own_pid, char *notify_socket, return ret; } + ret = libcrun_do_pivot_root (container, entrypoint_args->context->no_pivot, &rootfs, err); + if (UNLIKELY (ret < 0)) + return ret; + + /* sync 2 and 3 are sent as part of libcrun_set_mounts. 
*/ + ret = libcrun_set_mounts (entrypoint_args, container, rootfs, send_sync_cb, &sync_socket, err); + if (UNLIKELY (ret < 0)) + return ret; + ret = libcrun_finalize_mounts (entrypoint_args, container, rootfs, err); if (UNLIKELY (ret < 0)) return ret; @@ -1376,13 +1380,6 @@ container_init_setup (void *args, pid_t own_pid, char *notify_socket, if (UNLIKELY (ret < 0)) crun_error_write_warning_and_release (entrypoint_args->context->output_handler_arg, &err); - if (rootfs) - { - ret = libcrun_do_pivot_root (container, entrypoint_args->context->no_pivot, rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; - } - ret = libcrun_reopen_dev_null (err); if (UNLIKELY (ret < 0)) return ret; diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index f6519016a6..019993cb6c 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -149,6 +149,9 @@ struct private_data_s struct libcrun_fd_map *dev_fds; struct libcrun_fd_map *needed_devs_fds; + char **copy_symlink_targets; + size_t n_copy_symlink_targets; + /* Used to save stdin, stdout, stderr during checkpointing to descriptors.json * and needed during restore. 
*/ char *external_descriptors; @@ -158,6 +161,7 @@ struct private_data_s char *maskdir_proc_path; bool maskdir_bind_failed; bool maskdir_warned; + bool joined_mount_ns; }; struct linux_namespace_s @@ -187,6 +191,13 @@ cleanup_private_data (void *private_data) if (p->needed_devs_fds) cleanup_close_mapp (&(p->needed_devs_fds)); + if (p->copy_symlink_targets) + { + size_t i; + for (i = 0; i < p->n_copy_symlink_targets; i++) + free (p->copy_symlink_targets[i]); + free (p->copy_symlink_targets); + } free (p->unified_cgroup_path); free (p->host_notify_socket_path); free (p->container_notify_socket_path); @@ -820,8 +831,8 @@ make_remount (int targetfd, const char *target, unsigned long flags, const char static int do_remount (int targetfd, const char *target, unsigned long flags, const char *data, libcrun_error_t *err) { - int ret; proc_fd_path_t target_buffer; + int ret; const char *real_target = target; if (targetfd >= 0) @@ -1005,6 +1016,16 @@ get_procfd (struct private_data_s *data, libcrun_error_t *err) return fd; } +static int +get_old_root_fd (struct private_data_s *data) +{ + if (data->old_root_fd >= 0) + return data->old_root_fd; + + data->old_root_fd = open ("/", O_PATH | O_CLOEXEC); + return data->old_root_fd; +} + static int fs_move_mount_to (int fd, int dirfd, const char *name) { @@ -1076,7 +1097,8 @@ get_shared_empty_dir_cached (libcrun_container_t *container, char **proc_fd_path /* Fast path: return cached proc fd path if already set up */ if (private_data->maskdir_proc_path != NULL) { - *proc_fd_path = private_data->maskdir_proc_path; + if (proc_fd_path) + *proc_fd_path = private_data->maskdir_proc_path; return 0; } @@ -1096,16 +1118,41 @@ get_shared_empty_dir_cached (libcrun_container_t *container, char **proc_fd_path private_data->maskdir_fd = fd; fd = -1; /* Don't auto-close */ - *proc_fd_path = private_data->maskdir_proc_path; + if (proc_fd_path) + *proc_fd_path = private_data->maskdir_proc_path; return 0; } +static void +reclone_maskdir_fd 
(libcrun_container_t *container) +{ + struct private_data_s *pd = get_private_data (container); + cleanup_close int new_fd = -1; + + if (pd->maskdir_fd < 0) + return; + + new_fd = syscall_open_tree (pd->maskdir_fd, "", + AT_EMPTY_PATH | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + if (new_fd >= 0) + { + proc_fd_path_t fd_path; + + close (pd->maskdir_fd); + pd->maskdir_fd = new_fd; + new_fd = -1; + + get_self_fd_path (fd_path, pd->maskdir_fd); + free (pd->maskdir_proc_path); + pd->maskdir_proc_path = xstrdup (fd_path); + } +} + static int mount_masked_dir (libcrun_container_t *container, int pathfd, const char *rel_path, libcrun_error_t *err) { struct private_data_s *private_data = get_private_data (container); cleanup_close int mountfd = -1; - proc_fd_path_t abs_source; char *proc_fd_path = NULL; libcrun_error_t tmp_err = NULL; int procfd; @@ -1134,18 +1181,9 @@ mount_masked_dir (libcrun_container_t *container, int pathfd, const char *rel_pa if (LIKELY (ret == 0)) return 0; } - crun_error_release (&tmp_err); - - get_proc_self_fd_path (abs_source, private_data->maskdir_fd); - ret = do_mount (container, abs_source, pathfd, rel_path, NULL, MS_BIND | MS_RDONLY, NULL, LABEL_MOUNT, &tmp_err); - if (LIKELY (ret >= 0)) - return ret; - /* Bind mount failed - mark as failed and fall back for all future mounts */ private_data->maskdir_bind_failed = true; - libcrun_warning ("bind mount failed for %s to %s: %s, falling back to tmpfs", - proc_fd_path, rel_path, tmp_err->msg); - warn_tmpfs_fallback_once (private_data, tmp_err->msg); + warn_tmpfs_fallback_once (private_data, tmp_err ? 
tmp_err->msg : "unknown error"); crun_error_release (&tmp_err); fallback_to_tmpfs: @@ -1323,11 +1361,11 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, int label_how, libcrun_error_t *err) { cleanup_free char *data_with_label = NULL; + proc_fd_path_t target_buffer; const char *context_type = NULL; cleanup_close int ms_move_fd = -1; const char *real_target = target; bool single_instance = false; - proc_fd_path_t target_buffer; bool needs_remount = false; cleanup_close int fd = -1; const char *label = NULL; @@ -1341,9 +1379,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, if (targetfd >= 0) { get_proc_self_fd_path (target_buffer, targetfd); - real_target = target_buffer; - needs_remount = true; } @@ -1394,7 +1430,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, false, false, 0, &tmp_err); if (bindfd < 0) { - int old_root_fd = get_private_data (container)->old_root_fd; + int old_root_fd = get_old_root_fd (get_private_data (container)); if (old_root_fd >= 0 && source && source[0] == '/') { @@ -1422,7 +1458,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, ret = mount (source, real_target, fstype, flags, data); if (ret < 0 && (mountflags & MS_BIND) && source && source[0] == '/') { - int old_root_fd = get_private_data (container)->old_root_fd; + int old_root_fd = get_old_root_fd (get_private_data (container)); if (old_root_fd >= 0) { @@ -1450,7 +1486,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, if (ret > 0) { cleanup_close int mountfd = -1; - int sys_old_root_fd = get_private_data (container)->old_root_fd; + int sys_old_root_fd = get_old_root_fd (get_private_data (container)); if (sys_old_root_fd >= 0) mountfd = get_bind_mount (sys_old_root_fd, "sys", true, false, false, MS_PRIVATE, err); @@ -1530,7 +1566,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, } close_and_reset (&fd); - 
fd = open (get_private_data (container)->rootfs, O_PATH | O_CLOEXEC); + fd = openat (procfd, "self/root", O_PATH | O_CLOEXEC); if (UNLIKELY (fd < 0)) return crun_make_error (err, errno, "reopen rootfs after mount on /"); @@ -1560,8 +1596,6 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, #endif targetfd = fd; - get_proc_self_fd_path (target_buffer, targetfd); - real_target = target_buffer; } } @@ -1628,15 +1662,15 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, static void try_umount (int targetfd, const char *target) { - const char *real_target = target; proc_fd_path_t target_buffer; + const char *real_target = target; if (targetfd >= 0) { - /* Best effort cleanup for the tmpfs. */ get_proc_self_fd_path (target_buffer, targetfd); real_target = target_buffer; } + umount2 (real_target, MNT_DETACH); } @@ -2217,9 +2251,9 @@ do_masked_and_readonly_paths (libcrun_container_t *container, libcrun_error_t *e static int do_pivot (libcrun_container_t *container, const char *rootfs, libcrun_error_t *err) { - int ret; cleanup_close int oldrootfd = -1; cleanup_close int newrootfd = -1; + int ret; oldrootfd = open ("/", O_DIRECTORY | O_PATH | O_CLOEXEC); if (UNLIKELY (oldrootfd < 0)) @@ -2241,6 +2275,8 @@ do_pivot (libcrun_container_t *container, const char *rootfs, libcrun_error_t *e if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "fchdir `%s`", rootfs); + reclone_maskdir_fd (container); + ret = do_mount (container, NULL, -1, ".", NULL, MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); if (UNLIKELY (ret < 0)) return ret; @@ -2425,18 +2461,28 @@ safe_create_symlink (int rootfsfd, const char *rootfs, const char *target, const static int handle_copy_symlink (libcrun_container_t *container, const char *rootfs, - runtime_spec_schema_defs_mount *mount, libcrun_error_t *err) + runtime_spec_schema_defs_mount *mount, + const char *link_target, libcrun_error_t *err) { cleanup_free char *target = NULL; - ssize_t len; - 
/* Copy the origin symlink instead of performing the mount operation. */ - len = safe_readlinkat (AT_FDCWD, mount->source, &target, 0, err); - if (UNLIKELY (len < 0)) - return len; + if (link_target == NULL) + { + int old_root_fd = get_old_root_fd (get_private_data (container)); + const char *source = mount->source; + ssize_t len; + + if (old_root_fd >= 0 && source[0] == '/') + len = safe_readlinkat (old_root_fd, source + 1, &target, 0, err); + else + len = safe_readlinkat (AT_FDCWD, source, &target, 0, err); + if (UNLIKELY (len < 0)) + return len; + link_target = target; + } return safe_create_symlink (get_private_data (container)->rootfsfd, rootfs, - target, mount->destination, err); + link_target, mount->destination, err); } static int @@ -2480,6 +2526,12 @@ handle_tmpcopyup (libcrun_container_t *container, const char *rootfs, const char { int destfd, ret; cleanup_close int tmpfd = copy_from_fd; + + target = consume_slashes (target); + + if (is_empty_string (target)) + return crun_make_error (err, 0, "tmpcopyup on `/` is not supported"); + destfd = safe_openat (get_private_data (container)->rootfsfd, rootfs, target, O_CLOEXEC | O_DIRECTORY, 0, err); if (UNLIKELY (destfd < 0)) @@ -2488,7 +2540,6 @@ handle_tmpcopyup (libcrun_container_t *container, const char *rootfs, const char // copy_recursive_fd_to_fd closes tmpfd and destfd ret = copy_recursive_fd_to_fd (tmpfd, destfd, target, target, err); tmpfd = -1; - return ret; } @@ -2576,6 +2627,24 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, ret = get_file_type_at (source_mountfd, &src_mode, true, NULL); } + else if ((extra_flags & OPTION_COPY_SYMLINK) + && get_private_data (container)->copy_symlink_targets + && mount_index < get_private_data (container)->n_copy_symlink_targets + && get_private_data (container)->copy_symlink_targets[mount_index]) + { + src_mode = S_IFLNK; + ret = 0; + } + else if (path[0] == '/') + { + int old_root_fd = get_old_root_fd (get_private_data (container)); + 
bool nofollow = (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_COPY_SYMLINK)) ? true : false; + + if (old_root_fd >= 0) + ret = get_file_type_at (old_root_fd, &src_mode, nofollow, (path[1] == '\0') ? NULL : path + 1); + else + ret = get_file_type_at (AT_FDCWD, &src_mode, nofollow, path); + } else ret = get_file_type (&src_mode, (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_COPY_SYMLINK)) ? true : false, path); if (UNLIKELY (ret < 0)) @@ -2583,7 +2652,16 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, if (S_ISLNK (src_mode) && (extra_flags & OPTION_DEST_NOFOLLOW) && source_mountfd < 0) { - ret = get_bind_mount (AT_FDCWD, mount->source, true, true, extra_flags & OPTION_SRC_NOFOLLOW, MS_PRIVATE, err); + int bind_dirfd = AT_FDCWD; + const char *bind_source = mount->source; + int old_root_fd = get_old_root_fd (get_private_data (container)); + + if (old_root_fd >= 0 && mount->source && mount->source[0] == '/') + { + bind_dirfd = old_root_fd; + bind_source = (mount->source[1] == '\0') ? "." : mount->source + 1; + } + ret = get_bind_mount (bind_dirfd, bind_source, true, true, extra_flags & OPTION_SRC_NOFOLLOW, MS_PRIVATE, err); if (UNLIKELY (ret < 0)) return ret; @@ -2595,7 +2673,13 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, if (S_ISLNK (src_mode) && (extra_flags & OPTION_COPY_SYMLINK)) { - ret = handle_copy_symlink (container, rootfs, mount, err); + const char *cached_target = NULL; + + if (get_private_data (container)->copy_symlink_targets + && mount_index < get_private_data (container)->n_copy_symlink_targets) + cached_target = get_private_data (container)->copy_symlink_targets[mount_index]; + + ret = handle_copy_symlink (container, rootfs, mount, cached_target, err); if (UNLIKELY (ret < 0)) return ret; @@ -2652,15 +2736,27 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, /* Check if there is already a mount for the requested file system. */ if (! 
mounted && source_mountfd >= 0) { - ret = fs_move_mount_to (source_mountfd, targetfd, NULL); if (LIKELY (ret == 0)) { - /* Force no MS_BIND flag to not attempt again the bind mount. */ - ret = do_mount (container, NULL, source_mountfd, target, NULL, flags & ~MS_BIND, data, LABEL_NONE, err); - if (UNLIKELY (ret < 0)) - return ret; + unsigned long remaining_flags = flags & ~MS_BIND; + if (remaining_flags) + { + ret = do_mount (container, NULL, source_mountfd, target, NULL, remaining_flags, data, LABEL_NONE, err); + if (UNLIKELY (ret < 0)) + return ret; + } mounted = true; + + if (is_empty_string (target)) + { + int new_rootfsfd = open (rootfs, O_PATH | O_CLOEXEC); + if (UNLIKELY (new_rootfsfd < 0)) + return crun_make_error (err, errno, "reopen rootfs after mount on /"); + + TEMP_FAILURE_RETRY (close (get_private_data (container)->rootfsfd)); + get_private_data (container)->rootfsfd = new_rootfsfd; + } } } @@ -2866,7 +2962,7 @@ get_notify_fd (libcrun_context_t *context, libcrun_container_t *container, int * #ifdef HAVE_SYSTEMD static int -do_notify_socket (libcrun_container_t *container, const char *rootfs, libcrun_error_t *err) +do_notify_socket (libcrun_container_t *container, libcrun_error_t *err) { int ret; const char *notify_socket = container->context->notify_socket; @@ -2886,7 +2982,7 @@ do_notify_socket (libcrun_container_t *container, const char *rootfs, libcrun_er if (UNLIKELY (ret < 0)) return ret; - ret = append_paths (&container_notify_socket_path, err, rootfs, notify_socket, "notify", NULL); + ret = append_paths (&container_notify_socket_path, err, "/", notify_socket, "notify", NULL); if (UNLIKELY (ret < 0)) return ret; @@ -3035,7 +3131,7 @@ make_parent_mount_private (const char *rootfs, libcrun_error_t *err) if (UNLIKELY (fstat (parentfd, &cur_st) < 0)) return crun_make_error (err, errno, "fstat parent of `%s`", rootfs); - /* Reached the root of the filesystem: ".." points to itself. */ + /* Reached the root of the file system: ".." points to itself. 
*/ if (cur_st.st_dev == prev_st.st_dev && cur_st.st_ino == prev_st.st_ino) break; @@ -3051,37 +3147,12 @@ int libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, set_mounts_cb_t cb, void *cb_data, libcrun_error_t *err) { runtime_spec_schema_config_schema *def = container->container_def; - unsigned long rootfs_propagation = 0; - int cgroup_mode; int is_user_ns = 0; int ret = 0; if (rootfs == NULL || def->mounts == NULL) return 0; - if (def->linux && def->linux->rootfs_propagation) - rootfs_propagation = get_mount_flags (def->linux->rootfs_propagation, 0, NULL, NULL, NULL, NULL); - - if ((rootfs_propagation & ALL_PROPAGATIONS_NO_REC) == 0) - rootfs_propagation = MS_REC | MS_PRIVATE; - - get_private_data (container)->rootfs_propagation = rootfs_propagation; - - if (get_private_data (container)->unshare_flags & CLONE_NEWNS) - { - ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); - if (UNLIKELY (ret < 0)) - return ret; - - ret = make_parent_mount_private (rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; - - ret = do_mount (container, rootfs, -1, rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); - if (UNLIKELY (ret < 0)) - return ret; - } - ret = open (rootfs, O_PATH | O_CLOEXEC); if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "open `%s`", rootfs); @@ -3108,35 +3179,7 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont get_private_data (container)->remounts = r; } - if (! container->context->force_no_cgroup) - { - cgroup_mode = libcrun_get_cgroup_mode (err); - if (UNLIKELY (cgroup_mode < 0)) - return cgroup_mode; - - if (cgroup_mode == CGROUP_MODE_UNIFIED) - { - char *unified_cgroup_path = NULL; - int procfd = get_procfd (get_private_data (container), err); - if (UNLIKELY (procfd < 0)) - return procfd; - - /* Read the cgroup path before we enter the cgroupns. 
*/ - ret = libcrun_get_cgroup_process_at (procfd, &unified_cgroup_path, true, err); - if (UNLIKELY (ret < 0)) - return ret; - - get_private_data (container)->unified_cgroup_path = unified_cgroup_path; - } - } - - ret = libcrun_container_enter_cgroup_ns (container, err); - if (UNLIKELY (ret < 0)) - return ret; - - ret = do_mounts (container, rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; + /* Mounts are already done before pivot_root in setup_mount_namespace. */ is_user_ns = (get_private_data (container)->unshare_flags & CLONE_NEWUSER); if (! is_user_ns) @@ -3257,42 +3300,340 @@ move_root (const char *rootfs, libcrun_error_t *err) return 0; } -int -libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, const char *rootfs, libcrun_error_t *err) +static struct libcrun_fd_map * +get_fd_map (libcrun_container_t *container) +{ + struct libcrun_fd_map *mount_fds = get_private_data (container)->mount_fds; + + if (mount_fds == NULL) + { + runtime_spec_schema_config_schema *def = container->container_def; + mount_fds = make_libcrun_fd_map (def->mounts_len); + get_private_data (container)->mount_fds = mount_fds; + } + return mount_fds; +} + +static int +open_mount_of_type (libcrun_container_t *container, + runtime_spec_schema_defs_mount *mount, + libcrun_error_t *err) { + runtime_spec_schema_config_schema *def = container->container_def; + cleanup_free char *mnt_data = NULL; + unsigned long mnt_flags = 0, mnt_extra = 0; + uint64_t mnt_rec_clear = 0, mnt_rec_set = 0; + const char *label_type = NULL; + const char *label_val = NULL; + const char *fstype; + int mnt_fd; + size_t j; + + if (mount->type == NULL) + return -1; + + fstype = mount->type; + + for (j = 0; j < mount->options_len; j++) + mnt_flags |= get_mount_flags_or_option (mount->options[j], + mnt_flags, &mnt_extra, &mnt_data, + &mnt_rec_clear, &mnt_rec_set); + + if (def->linux && def->linux->mount_label) + { + bool is_sysfs_or_proc = strcmp (fstype, "sysfs") == 0 + || strcmp (fstype, "proc") == 0; 
+ int label_how = get_mount_label_how (fstype, is_sysfs_or_proc); + if (label_how == LABEL_MOUNT) + { + label_type = get_selinux_context_type (container, err); + if (label_type == NULL) + crun_error_release (err); + else + label_val = def->linux->mount_label; + } + } + + if (strcmp (fstype, "cgroup") == 0) + { + int cgroup_mode = libcrun_get_cgroup_mode (err); + if (cgroup_mode == CGROUP_MODE_UNIFIED) + fstype = "cgroup2"; + crun_error_release (err); + } + + mnt_fd = fsopen_mount (fstype, mount->source, label_type, label_val, mnt_data); + if (mnt_fd < 0 && label_type) + mnt_fd = fsopen_mount (fstype, mount->source, NULL, NULL, mnt_data); + + return mnt_fd; +} + +static int +setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **rootfs, libcrun_error_t *err) +{ + runtime_spec_schema_config_schema *def = container->container_def; + unsigned long rootfs_propagation = 0; + libcrun_error_t tmp_err = NULL; + size_t i; int ret; - if (get_private_data (container)->unshare_flags & CLONE_NEWNS) + if (def->linux && def->linux->rootfs_propagation) + rootfs_propagation = get_mount_flags (def->linux->rootfs_propagation, 0, NULL, NULL, NULL, NULL); + + if ((rootfs_propagation & ALL_PROPAGATIONS_NO_REC) == 0) + rootfs_propagation = MS_REC | MS_PRIVATE; + + get_private_data (container)->rootfs_propagation = rootfs_propagation; + + ret = libcrun_container_enter_cgroup_ns (container, err); + if (UNLIKELY (ret < 0)) + return ret; + + if (! 
get_private_data (container)->joined_mount_ns) + { + ret = unshare (CLONE_NEWNS); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "unshare `CLONE_NEWNS`"); + } + + ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = make_parent_mount_private (*rootfs, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = do_mount (container, *rootfs, -1, *rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + /* Pre-create mounts and cache paths before pivot_root, + while the host file system is still reachable. */ + for (i = 0; i < def->mounts_len; i++) { - if (no_pivot) + struct libcrun_fd_map *mount_fds = get_fd_map (container); + cleanup_free char *dest_path = NULL; + cleanup_close int mnt_fd = -1; + bool recursive_bind, nofollow; + struct stat st; + + if (mount_fds && mount_fds->fds[i] >= 0) + continue; + + /* Pre-read symlink targets for copy-symlink mounts while the host file system is still reachable. 
*/ + if (def->mounts[i]->source != NULL + && def->mounts[i]->source[0] == '/') { - ret = move_root (rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; + cleanup_free char *mnt_data = NULL; + unsigned long mnt_flags = 0; + unsigned long mnt_extra = 0; + uint64_t mnt_rec_clear = 0, mnt_rec_set = 0; + size_t j; + + for (j = 0; j < def->mounts[i]->options_len; j++) + mnt_flags |= get_mount_flags_or_option (def->mounts[i]->options[j], + mnt_flags, &mnt_extra, &mnt_data, + &mnt_rec_clear, &mnt_rec_set); + + if (mnt_extra & OPTION_COPY_SYMLINK) + { + cleanup_free char *target = NULL; + libcrun_error_t tmp_err = NULL; + ssize_t len; + + len = safe_readlinkat (AT_FDCWD, def->mounts[i]->source, &target, 0, &tmp_err); + if (UNLIKELY (len < 0)) + crun_error_release (&tmp_err); + + if (get_private_data (container)->copy_symlink_targets == NULL) + { + get_private_data (container)->copy_symlink_targets + = xmalloc0 (def->mounts_len * sizeof (char *)); + get_private_data (container)->n_copy_symlink_targets = def->mounts_len; + } + get_private_data (container)->copy_symlink_targets[i] = target; + target = NULL; + } } - else + + if (is_bind_mount (def->mounts[i], &recursive_bind, &nofollow)) + { + if (mount_fds && def->mounts[i]->source != NULL) + { + libcrun_error_t tmp_err = NULL; + + mount_fds->fds[i] = get_bind_mount (AT_FDCWD, def->mounts[i]->source, + recursive_bind, false, nofollow, + MS_PRIVATE, &tmp_err); + if (mount_fds->fds[i] < 0) + crun_error_release (&tmp_err); + } + continue; + } + + mnt_fd = open_mount_of_type (container, def->mounts[i], err); + if (mnt_fd < 0) + { + crun_error_release (err); + continue; + } + + ret = append_paths (&dest_path, err, *rootfs, def->mounts[i]->destination, NULL); + if (UNLIKELY (ret < 0)) + return ret; + if (stat (dest_path, &st) == 0) { - ret = do_pivot (container, rootfs, err); + proc_fd_path_t fd_path; + int procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; + + get_self_fd_path 
(fd_path, mnt_fd); + ret = fchmodat (procfd, fd_path, st.st_mode & 07777, 0); + (void) ret; + } + + mount_fds->fds[i] = mnt_fd; + mnt_fd = -1; + } + + /* Mount everything before pivot_root while host paths are still reachable. + Use the pre-created fd when available, fall back to mount(). */ + ret = open (*rootfs, O_PATH | O_CLOEXEC); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "open `%s`", *rootfs); + get_private_data (container)->rootfsfd = ret; + get_private_data (container)->rootfs = *rootfs; + + if (! container->context->force_no_cgroup) + { + int cgroup_mode = libcrun_get_cgroup_mode (err); + if (UNLIKELY (cgroup_mode < 0)) + return cgroup_mode; + + if (cgroup_mode == CGROUP_MODE_UNIFIED) + { + char *unified_cgroup_path = NULL; + int procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; + + ret = libcrun_get_cgroup_process_at (procfd, &unified_cgroup_path, true, err); if (UNLIKELY (ret < 0)) return ret; + + get_private_data (container)->unified_cgroup_path = unified_cgroup_path; + } + } + + ret = do_mounts (container, *rootfs, err); + if (UNLIKELY (ret < 0)) + { + close_and_reset (&get_private_data (container)->rootfsfd); + get_private_data (container)->rootfs = NULL; + return ret; + } + + close_and_reset (&get_private_data (container)->rootfsfd); + get_private_data (container)->rootfs = NULL; + + /* Pre-open needed device fds for rootless containers. + The parent skips open_tree for rootless (EPERM), but the child + has CAP_SYS_ADMIN in its user namespace. 
*/ + if (get_private_data (container)->needed_devs_fds) + { + struct libcrun_fd_map *dev_fds = get_private_data (container)->needed_devs_fds; + + for (i = 0; needed_devs[i].path; i++) + { + if (i < dev_fds->nfds && dev_fds->fds[i] >= 0) + continue; + + int fd = syscall_open_tree (AT_FDCWD, needed_devs[i].path, + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + if (fd >= 0 && i < dev_fds->nfds) + dev_fds->fds[i] = fd; + else if (fd >= 0) + close (fd); } + } + + if (get_private_data (container)->notify_socket_tree_fd < 0 + && get_private_data (container)->host_notify_socket_path) + { + int fd = syscall_open_tree (AT_FDCWD, get_private_data (container)->host_notify_socket_path, + OPEN_TREE_CLONE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd >= 0) + get_private_data (container)->notify_socket_tree_fd = fd; + } + + get_old_root_fd (get_private_data (container)); + + /* After pivot_root, /proc is not yet mounted. */ + ret = get_procfd (get_private_data (container), &tmp_err); + if (ret < 0) + crun_error_release (&tmp_err); + + check_running_in_user_namespace (&tmp_err); + crun_error_release (&tmp_err); + + ret = get_shared_empty_dir_cached (container, NULL, &tmp_err); + if (ret < 0) + { + crun_error_release (&tmp_err); + get_private_data (container)->maskdir_bind_failed = true; + } - ret = do_mount (container, NULL, -1, "/", NULL, get_private_data (container)->rootfs_propagation, NULL, - LABEL_MOUNT, err); + if (no_pivot) + { + ret = move_root (*rootfs, err); if (UNLIKELY (ret < 0)) return ret; } else { - ret = chroot (rootfs); + ret = do_pivot (container, *rootfs, err); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chroot to `%s`", rootfs); + return ret; + } + + ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + return 0; +} + +int +libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, char **rootfs, libcrun_error_t *err) +{ + int ret; + + if (*rootfs == NULL) + 
return 0; + + if (get_private_data (container)->unshare_flags & CLONE_NEWNS) + { + ret = setup_mount_namespace (container, no_pivot, rootfs, err); + if (UNLIKELY (ret < 0)) + return ret; + } + else + { + ret = chroot (*rootfs); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "chroot to `%s`", *rootfs); } ret = chdir ("/"); if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "chdir to `/`"); + free (*rootfs); + *rootfs = xstrdup ("/"); + return 0; } @@ -4589,26 +4930,15 @@ get_needed_devs_fd_map (libcrun_container_t *container) return fds; } -static struct libcrun_fd_map * -get_fd_map (libcrun_container_t *container) -{ - struct libcrun_fd_map *mount_fds = get_private_data (container)->mount_fds; - - if (mount_fds == NULL) - { - runtime_spec_schema_config_schema *def = container->container_def; - mount_fds = make_libcrun_fd_map (def->mounts_len); - get_private_data (container)->mount_fds = mount_fds; - } - return mount_fds; -} - bool is_bind_mount (runtime_spec_schema_defs_mount *mnt, bool *recursive, bool *src_nofollow) { bool ret = false; size_t i; + if (recursive) + *recursive = false; + if (src_nofollow != NULL) *src_nofollow = false; @@ -4616,9 +4946,6 @@ is_bind_mount (runtime_spec_schema_defs_mount *mnt, bool *recursive, bool *src_n { if (strcmp (mnt->options[i], "bind") == 0) { - if (recursive) - *recursive = false; - ret = true; /* if src_nofollow is not specified, or already found, shortcut. */ @@ -4871,8 +5198,6 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy mount_fds = make_libcrun_fd_map (def->mounts_len); - /* If the container is already running in a user namespace, apply the same logic as if a new - user namespace was created as part of the container itself. */ if (! 
has_userns) { int is_in_userns = check_running_in_user_namespace (err); @@ -4908,6 +5233,7 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy if (UNLIKELY (mount_fd < 0)) crun_error_release (err); } + if (mount_fd >= 0) how_many++; @@ -5313,10 +5639,10 @@ init_container (libcrun_container_t *container, int sync_socket_container, struc if (UNLIKELY (ret < 0)) return ret; - if (init_status->namespaces_to_unshare & ~CLONE_NEWCGROUP) + if (init_status->namespaces_to_unshare & ~(CLONE_NEWCGROUP | CLONE_NEWNS)) { /* New namespaces to create for the container. */ - ret = unshare (init_status->namespaces_to_unshare & ~CLONE_NEWCGROUP); + ret = unshare (init_status->namespaces_to_unshare & ~(CLONE_NEWCGROUP | CLONE_NEWNS)); if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "unshare"); } @@ -5462,6 +5788,9 @@ libcrun_run_linux_container (libcrun_container_t *container, container_entrypoin return ret; get_private_data (container)->unshare_flags = init_status.all_namespaces; + get_private_data (container)->joined_mount_ns + = (init_status.all_namespaces & CLONE_NEWNS) + && ! (init_status.namespaces_to_unshare & CLONE_NEWNS); /* cgroup will be unshared later. Once the process is in the correct cgroup. */ init_status.all_namespaces &= ~CLONE_NEWCGROUP; get_private_data (container)->unshare_cgroupns = init_status.namespaces_to_unshare & CLONE_NEWCGROUP; @@ -5476,7 +5805,7 @@ libcrun_run_linux_container (libcrun_container_t *container, container_entrypoin #ifdef HAVE_SYSTEMD if (def->root) { - ret = do_notify_socket (container, def->root->path, err); + ret = do_notify_socket (container, err); if (UNLIKELY (ret < 0)) return ret; } @@ -5550,7 +5879,7 @@ libcrun_run_linux_container (libcrun_container_t *container, container_entrypoin else if ((init_status.all_namespaces & CLONE_NEWUSER) == 0) { /* If it doesn't create a user namespace or need to join one, create the new requested namespaces now. 
*/ - first_clone_args = init_status.namespaces_to_unshare & ~(CLONE_NEWTIME | CLONE_NEWCGROUP); + first_clone_args = init_status.namespaces_to_unshare & ~(CLONE_NEWTIME | CLONE_NEWCGROUP | CLONE_NEWNS); } init_status.namespaces_to_unshare &= ~first_clone_args; diff --git a/src/libcrun/linux.h b/src/libcrun/linux.h index 8dc1abc4fe..4cee3037b6 100644 --- a/src/libcrun/linux.h +++ b/src/libcrun/linux.h @@ -68,7 +68,7 @@ int libcrun_set_mounts (struct container_entrypoint_s *args, libcrun_container_t int libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err); int libcrun_init_caps (libcrun_container_t *container, libcrun_error_t *err); -int libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, const char *rootfs, libcrun_error_t *err); +int libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, char **rootfs, libcrun_error_t *err); int libcrun_reopen_dev_null (libcrun_error_t *err); int libcrun_set_usernamespace (libcrun_container_t *container, pid_t pid, libcrun_error_t *err); int libcrun_set_caps (runtime_spec_schema_config_schema_process_capabilities *capabilities, uid_t uid, gid_t gid, diff --git a/src/libcrun/utils.c b/src/libcrun/utils.c index 34a9364e4e..9a3cacbeef 100644 --- a/src/libcrun/utils.c +++ b/src/libcrun/utils.c @@ -385,9 +385,17 @@ safe_openat (int dirfd, const char *rootfs, const char *path, int flags, int mod if (UNLIKELY (fd < 0)) return crun_make_error (err, errno, "open `%s`", rootfs); - ret = check_fd_is_path (rootfs, fd, path, err); - if (UNLIKELY (ret < 0)) - return ret; + /* Skip the readlink-based check when opening the root + directory itself (rootfs="/", path=""). After pivot_root, + "/" can only refer to the container root so the readlink + verification is redundant, and after setns the /proc-based + readlink may not be reachable by path yet. 
*/ + if (rootfs[0] != '/' || rootfs[1] != '\0') + { + ret = check_fd_is_path (rootfs, fd, path, err); + if (UNLIKELY (ret < 0)) + return ret; + } ret = fd; fd = -1; diff --git a/tests/test_mounts.py b/tests/test_mounts.py index 3d992b2c14..31e895431f 100755 --- a/tests/test_mounts.py +++ b/tests/test_mounts.py @@ -153,6 +153,8 @@ def test_mount_bind_to_rootfs(): return 0 def test_mount_tmpfs_to_rootfs(): + # tmpcopyup on "/" is rejected: after pivot_root is moved before mounts, + # there is no original rootfs content to copy from. conf = base_config() conf['process']['args'] = ['/init', 'true'] add_all_namespaces(conf) @@ -161,8 +163,11 @@ def test_mount_tmpfs_to_rootfs(): {"destination": "/", "type": "tmpfs", "source": "tmpfs", "options": ["tmpcopyup"]}, ] conf['mounts'] = mounts + conf['mounts'] - _, _ = run_and_get_output(conf, hide_stderr=True) - return 0 + try: + _, _ = run_and_get_output(conf, hide_stderr=True) + return -1 + except Exception as e: + return 0 def test_ro_cgroup(): for cgroupns in [True, False]: @@ -1185,6 +1190,33 @@ def test_no_proc_sysfs_cgroup(): run_and_get_output(conf, hide_stderr=True) return 0 +def test_rbind_with_bind_option(): + """Verify rbind carries submounts even when 'bind' also appears in options.""" + if is_rootless(): + return (77, "requires root privileges") + + conf = base_config() + conf['process']['args'] = ['/init', 'cat', '/proc/self/mountinfo'] + add_all_namespaces(conf) + mount_opt = { + "destination": "/mnt", + "type": "bind", + "source": "/sys", + "options": ["rbind", "rprivate", "rw", "bind"] + } + conf['mounts'].append(mount_opt) + out, _ = run_and_get_output(conf, hide_stderr=True) + found_sub = False + for line in out.splitlines(): + if ' /mnt/' in line: + found_sub = True + break + if not found_sub: + logger.info("/mnt/* submounts not found - rbind did not carry submounts") + logger.info("mountinfo output: %s", out) + return -1 + return 0 + all_tests = { "mount-ro" : test_mount_ro, "mount-rro" : test_mount_rro, 
@@ -1230,6 +1262,7 @@ def test_no_proc_sysfs_cgroup(): "mount-overlay-fs": test_mount_overlay_fs, "mount-no-proc": test_no_proc, "mount-no-proc-sysfs-cgroup": test_no_proc_sysfs_cgroup, + "mount-rbind-with-bind-option": test_rbind_with_bind_option, } if __name__ == "__main__": From 25790ca1e22893523c3e78db745271d44976104b Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Thu, 7 May 2026 05:54:03 +0000 Subject: [PATCH 21/22] linux: add OPEN_TREE_NAMESPACE support Use open_tree(OPEN_TREE_NAMESPACE) + setns(CLONE_NEWNS) to replace the traditional unshare(CLONE_NEWNS) + bind mount rootfs + pivot_root sequence. OPEN_TREE_NAMESPACE creates a new mount namespace with the rootfs as the root mount. setns() enters that namespace directly, so no bind mount or pivot_root is needed. The kernel automatically sets the process root and cwd to the new namespace's root when the old root is not reachable. On older kernels (< 7.0) or when OPEN_TREE_NAMESPACE is not supported, the code falls back to the traditional path. 
Closes: https://github.com/containers/crun/issues/2086 Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 297 ++++++++++++++++++++++++++++++++------------ 1 file changed, 216 insertions(+), 81 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 019993cb6c..3e5f38bcd7 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -88,6 +88,10 @@ # define OPEN_TREE_CLOEXEC O_CLOEXEC #endif +#ifndef OPEN_TREE_NAMESPACE +# define OPEN_TREE_NAMESPACE 2 +#endif + #ifndef MOVE_MOUNT_F_EMPTY_PATH # define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 #endif @@ -162,6 +166,8 @@ struct private_data_s bool maskdir_bind_failed; bool maskdir_warned; bool joined_mount_ns; + bool needs_pivot; + bool no_pivot; }; struct linux_namespace_s @@ -974,6 +980,7 @@ fsopen_mount (const char *type, const char *source_name, const char *labeltype, if (eq) { *eq = '\0'; + ret = syscall_fsconfig (fsfd, FSCONFIG_SET_STRING, token, eq + 1, 0); } else @@ -1566,7 +1573,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, } close_and_reset (&fd); - fd = openat (procfd, "self/root", O_PATH | O_CLOEXEC); + fd = open (get_private_data (container)->rootfs, O_PATH | O_CLOEXEC); if (UNLIKELY (fd < 0)) return crun_make_error (err, errno, "reopen rootfs after mount on /"); @@ -2619,18 +2626,22 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, if ((extra_flags & OPTION_COPY_SYMLINK) && (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_DEST_NOFOLLOW))) return crun_make_error (err, 0, "`copy-symlink` is mutually exclusive with `src-nofollow` and `dest-nofollow`"); - /* If copy-symlink is provided, ignore the pre-opened file descriptor since its source was resolved. */ - if (source_mountfd >= 0 && ! 
(extra_flags & OPTION_COPY_SYMLINK)) + bool is_cached_symlink = (extra_flags & OPTION_COPY_SYMLINK) + && get_private_data (container)->copy_symlink_targets + && mount_index < get_private_data (container)->n_copy_symlink_targets + && get_private_data (container)->copy_symlink_targets[mount_index]; + + /* When the source is a cached symlink, skip the pre-opened fd since + get_bind_mount() followed the symlink and the fd points to the target. + For regular files with copy-symlink, the pre-opened fd is still valid. */ + if (source_mountfd >= 0 && ! is_cached_symlink) { get_self_fd_path (proc_buf, source_mountfd); path = proc_buf; ret = get_file_type_at (source_mountfd, &src_mode, true, NULL); } - else if ((extra_flags & OPTION_COPY_SYMLINK) - && get_private_data (container)->copy_symlink_targets - && mount_index < get_private_data (container)->n_copy_symlink_targets - && get_private_data (container)->copy_symlink_targets[mount_index]) + else if (is_cached_symlink) { src_mode = S_IFLNK; ret = 0; @@ -2746,6 +2757,7 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, if (UNLIKELY (ret < 0)) return ret; } + mounted = true; if (is_empty_string (target)) @@ -3204,10 +3216,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont return ret; } - ret = do_finalize_notify_socket (container, err); - if (UNLIKELY (ret < 0)) - return ret; - if (def->process && def->process->cwd) { libcrun_error_t tmp_err = NULL; @@ -3224,25 +3232,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont return 0; } -int -libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err) -{ - int ret; - - ret = finalize_mounts (container, err); - if (UNLIKELY (ret < 0)) - return ret; - - // configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS - ret = libcrun_container_notify_handler (entrypoint_args, 
HANDLER_CONFIGURE_AFTER_MOUNTS, container, rootfs, err); - if (UNLIKELY (ret < 0)) - return crun_error_wrap (err, "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS"); - - close_and_reset (&(get_private_data (container)->rootfsfd)); - - return 0; -} - static int umount_or_hide (const char *target, libcrun_error_t *err) { @@ -3300,6 +3289,83 @@ move_root (const char *rootfs, libcrun_error_t *err) return 0; } +int +libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err) +{ + int ret; + + // configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS + ret = libcrun_container_notify_handler (entrypoint_args, HANDLER_CONFIGURE_AFTER_MOUNTS, container, rootfs, err); + if (UNLIKELY (ret < 0)) + return crun_error_wrap (err, "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS"); + + close_and_reset (&(get_private_data (container)->rootfsfd)); + + if (get_private_data (container)->needs_pivot) + { + get_private_data (container)->needs_pivot = false; + + if (get_private_data (container)->no_pivot) + { + ret = move_root (rootfs, err); + if (UNLIKELY (ret < 0)) + return ret; + } + else + { + ret = do_pivot (container, rootfs, err); + if (UNLIKELY (ret < 0)) + return ret; + } + + ret = do_mount (container, NULL, -1, "/", NULL, + get_private_data (container)->rootfs_propagation, + NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = chdir ("/"); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "chdir to `/`"); + } + + ret = do_finalize_notify_socket (container, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = finalize_mounts (container, err); + if (UNLIKELY (ret < 0)) + return ret; + + return 0; +} + +static int +maybe_open_tree_namespace (const char *rootfs, int *out_fd, libcrun_error_t *err) +{ + cleanup_close int rootfs_fd = -1; + int tree_fd; + + *out_fd = -1; + + rootfs_fd = 
open (rootfs, O_DIRECTORY | O_PATH | O_CLOEXEC); + if (UNLIKELY (rootfs_fd < 0)) + return crun_make_error (err, errno, "open `%s`", rootfs); + + tree_fd = syscall_open_tree (rootfs_fd, "", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC + | AT_EMPTY_PATH | AT_RECURSIVE); + if (tree_fd < 0) + { + if (errno == EINVAL || errno == ENOSYS || errno == EPERM) + return 0; + return crun_make_error (err, errno, "open_tree `%s`", rootfs); + } + + *out_fd = tree_fd; + return 0; +} + static struct libcrun_fd_map * get_fd_map (libcrun_container_t *container) { @@ -3369,12 +3435,43 @@ open_mount_of_type (libcrun_container_t *container, return mnt_fd; } +static bool +can_use_open_tree_namespace (libcrun_container_t *container) +{ + runtime_spec_schema_config_schema *def = container->container_def; + struct libcrun_fd_map *mount_fds; + bool has_hooks = def->hooks + && (def->hooks->prestart_len || def->hooks->create_runtime_len); + bool has_userns = get_private_data (container)->unshare_flags & CLONE_NEWUSER; + size_t i; + + if (has_hooks || has_userns) + return false; + + mount_fds = get_fd_map (container); + for (i = 0; i < def->mounts_len; i++) + { + if (mount_fds->fds[i] < 0) + { + bool has_cached_target = get_private_data (container)->copy_symlink_targets + && i < get_private_data (container)->n_copy_symlink_targets + && get_private_data (container)->copy_symlink_targets[i]; + if (! has_cached_target) + return false; + } + } + + return true; +} + static int setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **rootfs, libcrun_error_t *err) { runtime_spec_schema_config_schema *def = container->container_def; unsigned long rootfs_propagation = 0; + cleanup_close int tree_fd = -1; libcrun_error_t tmp_err = NULL; + bool use_open_tree = false; size_t i; int ret; @@ -3390,25 +3487,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo if (UNLIKELY (ret < 0)) return ret; - if (! 
get_private_data (container)->joined_mount_ns) - { - ret = unshare (CLONE_NEWNS); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "unshare `CLONE_NEWNS`"); - } - - ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); - if (UNLIKELY (ret < 0)) - return ret; - - ret = make_parent_mount_private (*rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; - - ret = do_mount (container, *rootfs, -1, *rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); - if (UNLIKELY (ret < 0)) - return ret; - /* Pre-create mounts and cache paths before pivot_root, while the host file system is still reachable. */ for (i = 0; i < def->mounts_len; i++) @@ -3460,7 +3538,11 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo if (is_bind_mount (def->mounts[i], &recursive_bind, &nofollow)) { - if (mount_fds && def->mounts[i]->source != NULL) + bool has_cached_target = get_private_data (container)->copy_symlink_targets + && i < get_private_data (container)->n_copy_symlink_targets + && get_private_data (container)->copy_symlink_targets[i]; + + if (mount_fds && def->mounts[i]->source != NULL && ! has_cached_target) { libcrun_error_t tmp_err = NULL; @@ -3499,6 +3581,74 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo mnt_fd = -1; } + ret = get_shared_empty_dir_cached (container, NULL, &tmp_err); + if (ret < 0) + { + crun_error_release (&tmp_err); + get_private_data (container)->maskdir_bind_failed = true; + } + + if (! 
get_private_data (container)->joined_mount_ns + && can_use_open_tree_namespace (container)) + use_open_tree = true; + + if (use_open_tree) + { + ret = maybe_open_tree_namespace (*rootfs, &tree_fd, err); + if (UNLIKELY (ret < 0)) + return ret; + } + + if (tree_fd >= 0) + { + ret = setns (tree_fd, CLONE_NEWNS); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "setns `CLONE_NEWNS`"); + + reclone_maskdir_fd (container); + + ret = mount (NULL, "/", NULL, MS_REMOUNT | MS_BIND, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "remount `/`"); + + ret = do_mount (container, NULL, -1, "/", NULL, MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + get_private_data (container)->needs_pivot = false; + get_private_data (container)->no_pivot = no_pivot; + free (*rootfs); + *rootfs = xstrdup ("/"); + } + else + { + if (! get_private_data (container)->joined_mount_ns) + { + ret = unshare (CLONE_NEWNS); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "unshare `CLONE_NEWNS`"); + } + + ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = make_parent_mount_private (*rootfs, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = do_mount (container, *rootfs, -1, *rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + get_private_data (container)->needs_pivot = true; + get_private_data (container)->no_pivot = no_pivot; + } + /* Mount everything before pivot_root while host paths are still reachable. Use the pre-created fd when available, fall back to mount(). 
*/ ret = open (*rootfs, O_PATH | O_CLOEXEC); @@ -3579,30 +3729,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo check_running_in_user_namespace (&tmp_err); crun_error_release (&tmp_err); - ret = get_shared_empty_dir_cached (container, NULL, &tmp_err); - if (ret < 0) - { - crun_error_release (&tmp_err); - get_private_data (container)->maskdir_bind_failed = true; - } - - if (no_pivot) - { - ret = move_root (*rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; - } - else - { - ret = do_pivot (container, *rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; - } - - ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); - if (UNLIKELY (ret < 0)) - return ret; - return 0; } @@ -3619,20 +3745,30 @@ libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, char **roo ret = setup_mount_namespace (container, no_pivot, rootfs, err); if (UNLIKELY (ret < 0)) return ret; + + /* If setup_mount_namespace used OPEN_TREE_NAMESPACE, rootfs is + already set to "/". Otherwise pivot_root is deferred until + after the mounts are created. */ + if (strcmp (*rootfs, "/") == 0) + { + ret = chdir ("/"); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "chdir to `/`"); + } } else { ret = chroot (*rootfs); if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "chroot to `%s`", *rootfs); - } - ret = chdir ("/"); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chdir to `/`"); + ret = chdir ("/"); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "chdir to `/`"); - free (*rootfs); - *rootfs = xstrdup ("/"); + free (*rootfs); + *rootfs = xstrdup ("/"); + } return 0; } @@ -5228,7 +5364,6 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy if (propagation == 0) propagation = MS_PRIVATE; - /* If the bind mount failed, do not fail here, but attempt to create it from within the container. 
*/ mount_fd = get_bind_mount (-1, def->mounts[i]->source, recursive, false, nofollow, propagation, err); if (UNLIKELY (mount_fd < 0)) crun_error_release (err); From 8a5e0e97a6ae8901d8eb66f6e402cd4eaa193347 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Sun, 7 Jun 2026 18:25:47 +0000 Subject: [PATCH 22/22] linux: fall back to receiver-side device creation when fsopen fails When fsopen() fails, for example because it is not available (returns ENOSYS), the host-side device pre-creation in prepare_and_send_dev_mounts() cannot proceed since it relies on the new mount API to create a detached tmpfs. Instead of failing, send empty fds to the container process so that libcrun_create_dev() falls back to its existing legacy path (bind mounts or mknod) inside the container's mount namespace. Closes: https://github.com/containers/crun/issues/2104 Co-Authored-By: Claude Opus 4.6 Signed-off-by: Giuseppe Scrivano --- src/libcrun/linux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 3e5f38bcd7..0ea9afe213 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -5451,7 +5451,7 @@ prepare_and_send_dev_mounts (libcrun_container_t *container, int sync_socket_hos devs_mountfd = fsopen_mount ("tmpfs", NULL, context_type, label, NULL); if (UNLIKELY (devs_mountfd < 0)) { - ret = crun_make_error (err, errno, "fsopen_mount `tmpfs`"); + ret = send_mounts (sync_socket_host, dev_fds, 0, def->linux->devices_len, err); goto restore_mountns; }