diff --git a/src/syscall/abi.h b/src/syscall/abi.h
index 3aa50ed..d81d2b1 100644
--- a/src/syscall/abi.h
+++ b/src/syscall/abi.h
@@ -387,6 +387,7 @@ typedef struct {
 #define LINUX_O_TRUNC 0x0200
 #define LINUX_O_APPEND 0x0400
 #define LINUX_O_NONBLOCK 0x0800
+#define LINUX_O_DSYNC 0x1000
 #define LINUX_O_ASYNC 0x2000
 /* aarch64-linux open flag values (from asm-generic/fcntl.h).
  * These differ from x86_64-linux values.
@@ -397,7 +398,10 @@ typedef struct {
 #define LINUX_O_LARGEFILE 0x20000 /* 0400000 octal, ignored on LP64 */
 #define LINUX_O_NOATIME 0x40000 /* 01000000 octal */
 #define LINUX_O_CLOEXEC 0x80000 /* 02000000 octal */
+#define LINUX_O_SYNC 0x101000 /* __O_SYNC | O_DSYNC */
 #define LINUX_O_PATH 0x200000 /* 010000000 octal */
+#define LINUX___O_TMPFILE 0x400000 /* 020000000 octal */
+#define LINUX_O_TMPFILE (LINUX___O_TMPFILE | LINUX_O_DIRECTORY)
 
 /* Linux fallocate(2) mode bits (linux/falloc.h). PUNCH_HOLE requires the
  * caller to also set KEEP_SIZE per the manpage; collapse/insert/zero/unshare
diff --git a/src/syscall/path.c b/src/syscall/path.c
index d2bf6d8..7db008f 100644
--- a/src/syscall/path.c
+++ b/src/syscall/path.c
@@ -655,35 +655,6 @@ int path_openat2_normalize_in_root(const char *path, char *out, size_t outsz)
     return 0;
 }
 
-int path_openat2_is_proc_magiclink(guest_fd_t dirfd, const char *path)
-{
-    if (!path)
-        return 0;
-
-    char normalized[LINUX_PATH_MAX];
-    const char *candidate = path;
-
-    if (path[0] == '/') {
-        if (strncmp(path, "/proc", 5) || (path[5] != '\0' && path[5] != '/'))
-            return 0;
-
-        size_t marks[PROC_PATH_COMPONENTS_MAX];
-        size_t depth;
-        if (proc_seed_absolute_path(path, normalized, sizeof(normalized), marks,
-                                    ARRAY_SIZE(marks), &depth) < 0)
-            return 0;
-        candidate = normalized;
-    } else {
-        int rc =
-            resolve_proc_at_path(dirfd, path, normalized, sizeof(normalized));
-        if (rc <= 0)
-            return 0;
-        candidate = normalized;
-    }
-
-    return !strncmp(candidate, "/proc/self/fd/", 14);
-}
-
 static int path_openat2_dirfd_host_path(guest_fd_t dirfd,
                                         char *out,
                                         size_t outsz)
@@ -909,6 +880,69 @@ static int dirfd_guest_base_path(guest_fd_t dirfd, char *out, size_t outsz)
     return 0;
 }
 
+/* Report whether a normalized path starts with this process's procfs fd
+ * directory: "proc/self/fd/" or "proc/<pid>/fd/" with <pid> equal to
+ * proc_get_pid(). The prefixes compared here carry no leading '/'.
+ */
+static bool normalized_proc_self_fd_anchor(const char *path)
+{
+    if (!strncmp(path, "proc/self/fd/", 13))
+        return true;
+    if (strncmp(path, "proc/", 5))
+        return false;
+
+    char *endp;
+    const char *pid_start = path + sizeof("proc/") - 1;
+    /* strtoll() skips leading whitespace and accepts a sign, so require a
+     * leading digit before trusting it to parse a pid directory name.
+     */
+    if (*pid_start < '0' || *pid_start > '9')
+        return false;
+    errno = 0;
+    long long pid = strtoll(pid_start, &endp, 10);
+    if (endp == pid_start || errno == ERANGE ||
+        pid != (long long) proc_get_pid())
+        return false;
+    return strncmp(endp, "/fd/", 4) == 0;
+}
+
+/* Report whether dirfd/path names something beneath an fd magic-link
+ * directory: "dev/fd/", "proc/self/fd/", or "proc/<pid>/fd/" for this
+ * process. Relative paths are joined onto dirfd_guest_base_path(dirfd)
+ * before normalization. Returns false for a NULL path and whenever the
+ * path cannot be joined or normalized.
+ */
+bool path_openat2_is_fd_magiclink_anchor(guest_fd_t dirfd, const char *path)
+{
+    if (!path)
+        return false;
+
+    char normalized[LINUX_PATH_MAX];
+
+    if (path[0] == '/') {
+        if (path_openat2_normalize_in_root(path, normalized,
+                                           sizeof(normalized)) < 0)
+            return false;
+    } else {
+        char base[LINUX_PATH_MAX];
+        char joined[LINUX_PATH_MAX];
+        if (dirfd_guest_base_path(dirfd, base, sizeof(base)) < 0)
+            return false;
+        /* snprintf() returns a negative value on error; treat that like
+         * truncation instead of letting it pass the length comparison.
+         */
+        int n = snprintf(joined, sizeof(joined), "%s/%s", base, path);
+        if (n < 0 || (size_t) n >= sizeof(joined))
+            return false;
+        if (path_openat2_normalize_in_root(joined, normalized,
+                                           sizeof(normalized)) < 0)
+            return false;
+    }
+
+    return strncmp(normalized, "dev/fd/", 7) == 0 ||
+           normalized_proc_self_fd_anchor(normalized);
+}
+
 /* Pop one trailing component from an absolute path, refusing to drop
  * below the supplied floor length. floor_len is strlen of the walk root
  * (1 == "/" for the bare-absolute case, dirfd-base length for IN_ROOT
diff --git a/src/syscall/path.h b/src/syscall/path.h
index fba5250..ff9c961 100644
--- a/src/syscall/path.h
+++ b/src/syscall/path.h
@@ -65,7 +65,7 @@ const char *path_resolve_sysroot_create_path(const char *path,
 
 int path_openat2_stays_beneath(const char *path, bool clamp_at_root);
 int path_openat2_normalize_in_root(const char *path, char *out, size_t outsz);
-int path_openat2_is_proc_magiclink(guest_fd_t dirfd, const char *path);
+bool path_openat2_is_fd_magiclink_anchor(guest_fd_t dirfd, const char *path);
 int path_openat2_resolved_within_root(guest_fd_t dirfd,
                                       const char *path,
                                       uint64_t oflags,
diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c
index 81ed5f5..e8f7d7c 100644
--- a/src/syscall/syscall.c
+++ b/src/syscall/syscall.c
@@ -1362,6 +1362,78 @@ static int64_t sc_memfd_create(guest_t *g,
     (RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
      RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_CACHED)
 
+/* Linux openat2() treats the user-supplied open_how size as an ABI
+ * version. The first published layout has three u64 fields
+ * (flags, mode, resolve), so v0 is 24 bytes. Bytes beyond that are
+ * future extension fields: all-zero tails are ignored, nonzero tails
+ * return E2BIG. Keep the same page-sized upper bound Linux applies
+ * before checking the tail.
+ */
+#define OPEN_HOW_SIZE_VER0 24
+#define OPEN_HOW_MAX_SIZE 4096
+
+/* Scan guest bytes [OPEN_HOW_SIZE_VER0, size) of the open_how at how_gva.
+ * Returns 0 when every byte is zero, -LINUX_E2BIG on the first nonzero
+ * byte, and -LINUX_EFAULT when the address wraps or the read fails.
+ */
+static int64_t openat2_check_zero_tail(guest_t *g,
+                                       uint64_t how_gva,
+                                       uint64_t size)
+{
+    if (size == OPEN_HOW_SIZE_VER0)
+        return 0;
+
+    uint64_t off = OPEN_HOW_SIZE_VER0;
+    while (off < size) {
+        uint8_t tail[64];
+        size_t chunk = (size_t) (size - off);
+        if (chunk > sizeof(tail))
+            chunk = sizeof(tail);
+        if (how_gva > UINT64_MAX - off ||
+            guest_read_small(g, how_gva + off, tail, chunk) < 0)
+            return -LINUX_EFAULT;
+        for (size_t i = 0; i < chunk; i++) {
+            if (tail[i] != 0)
+                return -LINUX_E2BIG;
+        }
+        off += chunk;
+    }
+    return 0;
+}
+
+/* Return true when open_how.flags and open_how.mode pass the checks below:
+ * no unknown flag bits, O_PATH only with CLOEXEC/DIRECTORY/NOFOLLOW, no
+ * __O_TMPFILE at all, no O_DIRECTORY|O_CREAT, mode within 07777 and nonzero
+ * only together with O_CREAT.
+ */
+static bool openat2_flags_valid(uint64_t flags, uint64_t mode)
+{
+    const uint64_t known_flags =
+        LINUX_O_ACCMODE | LINUX_O_CREAT | LINUX_O_EXCL | LINUX_O_NOCTTY |
+        LINUX_O_TRUNC | LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_DSYNC |
+        LINUX_O_ASYNC | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT |
+        LINUX_O_LARGEFILE | LINUX_O_NOATIME | LINUX_O_CLOEXEC | LINUX_O_SYNC |
+        LINUX_O_PATH | LINUX___O_TMPFILE;
+
+    if (flags & ~known_flags)
+        return false;
+    if ((flags & LINUX_O_PATH) &&
+        (flags & ~(LINUX_O_PATH | LINUX_O_CLOEXEC | LINUX_O_DIRECTORY |
+                   LINUX_O_NOFOLLOW)))
+        return false;
+    if (flags & LINUX___O_TMPFILE)
+        return false;
+    if ((flags & (LINUX_O_DIRECTORY | LINUX_O_CREAT)) ==
+        (LINUX_O_DIRECTORY | LINUX_O_CREAT))
+        return false;
+    if (mode & ~07777ULL)
+        return false;
+    if (mode != 0 && !(flags & LINUX_O_CREAT))
+        return false;
+
+    return true;
+}
+
 static int64_t sc_openat2(guest_t *g,
                           uint64_t x0,
                           uint64_t x1,
@@ -1374,17 +1446,26 @@ static int64_t sc_openat2(guest_t *g,
     (void) x4;
     (void) x5;
     (void) verbose;
-    if (x3 < 24)
+    if (x3 < OPEN_HOW_SIZE_VER0)
         return -LINUX_EINVAL;
+    if (x3 > OPEN_HOW_MAX_SIZE)
+        return -LINUX_E2BIG;
 
     uint64_t how[3];
     if (guest_read_small(g, x2, how, sizeof(how)) < 0)
         return -LINUX_EFAULT;
+    int64_t tail_rc = openat2_check_zero_tail(g, x2, x3);
+    if (tail_rc < 0)
+        return tail_rc;
 
     uint64_t oflags = how[0], mode = how[1];
     uint64_t resolve = how[2];
+    if (!openat2_flags_valid(oflags, mode))
+        return -LINUX_EINVAL;
     if (resolve & ~(uint64_t) RESOLVE_ALL)
         return -LINUX_EINVAL;
+    if ((resolve & RESOLVE_BENEATH) && (resolve & RESOLVE_IN_ROOT))
+        return -LINUX_EINVAL;
 
     /* RESOLVE_CACHED asks the kernel to satisfy lookup from cache only.
      * elfuse has no dentry cache, so report EAGAIN and let the guest retry
@@ -1403,6 +1484,10 @@ static int64_t sc_openat2(guest_t *g,
     if (guest_read_str(g, x1, path, sizeof(path)) < 0)
         return -LINUX_EFAULT;
 
+    if ((resolve & (RESOLVE_NO_SYMLINKS | RESOLVE_NO_MAGICLINKS)) &&
+        path_openat2_is_fd_magiclink_anchor((int) x0, path))
+        return -LINUX_ELOOP;
+
     if (resolve & RESOLVE_NO_SYMLINKS) {
         if (sys_path_has_symlink((int) x0, path) < 0) {
             if (errno == ELOOP)
@@ -1425,32 +1510,15 @@ static int64_t sc_openat2(guest_t *g,
         !path_openat2_stays_beneath(path, true))
         return -LINUX_EXDEV;
 
-    if ((resolve & RESOLVE_NO_MAGICLINKS) &&
-        path_openat2_is_proc_magiclink((int) x0, path))
-        return -LINUX_ELOOP;
-
     int no_xdev_start_class = -1;
     if (resolve & RESOLVE_NO_XDEV) {
-        /* A /proc/self/fd/N traversal follows a magic link out of procfs
-         * into whatever mount holds the target fd, which is a mount
-         * crossing under Linux NO_XDEV. The post-open class check cannot
-         * detect this because procemu stamps the resulting fd's proc_path
-         * with the symbolic /proc/self/fd/N path, hiding the real landing
-         * mount. Reject the traversal up front.
-         * path_openat2_is_proc_magiclink normalizes both absolute and
-         * dirfd/cwd-relative forms against /proc, so /proc/self/fd/N,
-         * "self/fd/N" / "fd/N" from a /proc or /proc/self anchor, and
-         * traversals like "task/../fd/N" all collapse onto the same
-         * /proc/self/fd/N candidate.
-         *
-         * Not yet detected: /dev/fd/N (procemu intercepts this as an
-         * alias for /proc/self/fd/N and dup()s the underlying fd) and
-         * /proc/<pid>/fd/N with an explicit pid. A NO_XDEV resolution
-         * landing on either is still a real cross, but the precheck has
-         * no normalization path for /dev or for explicit-pid procfs
-         * anchors yet, so the bypass remains.
+        /* /proc/self/fd/N, /proc/<pid>/fd/N, and /dev/fd/N all
+         * traverse fd magic links into whatever mount holds the target fd.
+         * Reject the normalized anchor up front because procemu can stamp
+         * the resulting fd with the symbolic proc/dev path, hiding the real
+         * landing mount from the post-open class check.
          */
-        if (path_openat2_is_proc_magiclink((int) x0, path))
+        if (path_openat2_is_fd_magiclink_anchor((int) x0, path))
             return -LINUX_EXDEV;
         int crossed = path_openat2_crosses_mount(
             (int) x0, path, (resolve & RESOLVE_IN_ROOT) != 0,
diff --git a/tests/test-syscall-fidelity.c b/tests/test-syscall-fidelity.c
index 091a297..f2d77fd 100644
--- a/tests/test-syscall-fidelity.c
+++ b/tests/test-syscall-fidelity.c
@@ -171,6 +171,57 @@ struct open_how {
 #define RESOLVE_BENEATH 0x08
 #define RESOLVE_IN_ROOT 0x10
 
+#ifndef O_TMPFILE
+#define O_TMPFILE (020000000 | O_DIRECTORY)
+#endif
+
+/* Run one openat2() call expected to fail and check the resulting errno. */
+static void expect_openat2_errno(const char *name,
+                                 int dirfd,
+                                 const char *path,
+                                 unsigned long long flags,
+                                 unsigned long long mode,
+                                 unsigned long long resolve,
+                                 int expected_errno)
+{
+    TEST(name);
+    struct open_how how = {.flags = flags, .mode = mode, .resolve = resolve};
+    errno = 0;
+    long fd = syscall(SYS_openat2, dirfd, path, &how, sizeof(how));
+    if (fd >= 0) {
+        close((int) fd);
+        FAIL("openat2 unexpectedly succeeded");
+        return;
+    }
+    EXPECT_TRUE(errno == expected_errno, "wrong errno");
+}
+
+/* Check that dirfd/path is refused under NO_MAGICLINKS and NO_SYMLINKS
+ * (ELOOP) and under NO_XDEV (EXDEV).
+ */
+static void expect_fd_magiclink_rejected(const char *path_label,
+                                         int dirfd,
+                                         const char *path)
+{
+    static const struct {
+        const char *resolve_name;
+        unsigned long long resolve;
+        int expected_errno;
+    } cases[] = {
+        {"NO_MAGICLINKS", RESOLVE_NO_MAGICLINKS, ELOOP},
+        {"NO_SYMLINKS", RESOLVE_NO_SYMLINKS, ELOOP},
+        {"NO_XDEV", RESOLVE_NO_XDEV, EXDEV},
+    };
+
+    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+        char name[128];
+        snprintf(name, sizeof(name), "openat2 %s rejects %s",
+                 cases[i].resolve_name, path_label);
+        expect_openat2_errno(name, dirfd, path, O_RDONLY, 0, cases[i].resolve,
+                             cases[i].expected_errno);
+    }
+}
+
 static void test_openat2_basic(void)
 {
     TEST("openat2 basic open");
@@ -184,6 +235,86 @@ static void test_openat2_basic(void)
     PASS();
 }
 
+static void test_openat2_rejects_nonzero_how_extension(void)
+{
+    TEST("openat2 rejects nonzero open_how extension");
+    struct {
+        struct open_how how;
+        unsigned long long extra;
+    } ext = {
+        .how = {.flags = O_RDONLY, .mode = 0, .resolve = 0},
+        .extra = 1,
+    };
+    long fd = syscall(SYS_openat2, AT_FDCWD, "/dev/null", &ext, sizeof(ext));
+    if (fd >= 0) {
+        close((int) fd);
+        FAIL("accepted nonzero open_how extension");
+        return;
+    }
+    EXPECT_TRUE(errno == E2BIG, "wrong errno");
+}
+
+static void test_openat2_rejects_oversized_how(void)
+{
+    TEST("openat2 rejects oversized open_how");
+    struct open_how how = {.flags = O_RDONLY, .mode = 0, .resolve = 0};
+    long fd = syscall(SYS_openat2, AT_FDCWD, "/dev/null", &how, 4097);
+    if (fd >= 0) {
+        close((int) fd);
+        FAIL("accepted oversized open_how");
+        return;
+    }
+    EXPECT_TRUE(errno == E2BIG, "wrong errno");
+}
+
+static void test_openat2_rejects_unknown_flags(void)
+{
+    TEST("openat2 rejects unknown flags");
+    struct open_how how = {.flags = 1ULL << 63, .mode = 0, .resolve = 0};
+    long fd = syscall(SYS_openat2, AT_FDCWD, "/dev/null", &how, sizeof(how));
+    if (fd >= 0) {
+        close((int) fd);
+        FAIL("accepted unknown flags");
+        return;
+    }
+    EXPECT_TRUE(errno == EINVAL, "wrong errno");
+}
+
+static void test_openat2_rejects_mode_without_create(void)
+{
+    TEST("openat2 rejects mode without create");
+    struct open_how how = {.flags = O_RDONLY, .mode = 0600, .resolve = 0};
+    long fd = syscall(SYS_openat2, AT_FDCWD, "/dev/null", &how, sizeof(how));
+    if (fd >= 0) {
+        close((int) fd);
+        FAIL("accepted mode without create");
+        return;
+    }
+    EXPECT_TRUE(errno == EINVAL, "wrong errno");
+}
+
+static void test_openat2_rejects_beneath_in_root(void)
+{
+    expect_openat2_errno("openat2 rejects BENEATH|IN_ROOT", AT_FDCWD,
+                         "/dev/null", O_RDONLY, 0,
+                         RESOLVE_BENEATH | RESOLVE_IN_ROOT, EINVAL);
+}
+
+static void test_openat2_rejects_directory_create(void)
+{
+    const char *path = "/tmp/elfuse-openat2-directory-create-probe";
+    unlink(path);
+    expect_openat2_errno("openat2 rejects O_DIRECTORY|O_CREAT", AT_FDCWD, path,
+                         O_RDONLY | O_DIRECTORY | O_CREAT, 0600, 0, EINVAL);
+    unlink(path);
+}
+
+static void test_openat2_rejects_tmpfile_readonly(void)
+{
+    expect_openat2_errno("openat2 rejects O_TMPFILE|O_RDONLY", AT_FDCWD, "/tmp",
+                         O_TMPFILE | O_RDONLY, 0600, 0, EINVAL);
+}
+
 static void test_openat2_resolve_beneath(void)
 {
     TEST("openat2 RESOLVE_BENEATH rejects ..");
@@ -816,6 +947,156 @@ static void test_openat2_resolve_no_xdev_rejects_proc_fd_magiclink(void)
     EXPECT_TRUE(errno == EXDEV, "wrong errno");
 }
 
+static void test_openat2_rejects_proc_pid_fd_magiclink(void)
+{
+    int helper = open("/tmp", O_RDONLY | O_DIRECTORY);
+    if (helper < 0) {
+        TEST("openat2 prepares /proc/<pid>/fd helper");
+        FAIL("open /tmp helper");
+        return;
+    }
+
+    char path[128];
+    if (snprintf(path, sizeof(path), "/proc/%ld/fd/%d", (long) getpid(),
+                 helper) >= (int) sizeof(path)) {
+        close(helper);
+        TEST("openat2 prepares /proc/<pid>/fd helper");
+        FAIL("path too long");
+        return;
+    }
+    expect_fd_magiclink_rejected("absolute /proc/<pid>/fd", AT_FDCWD, path);
+
+    int procfd = open("/proc", O_RDONLY | O_DIRECTORY);
+    if (procfd < 0) {
+        close(helper);
+        TEST("openat2 prepares /proc/<pid>/fd helper");
+        FAIL("open /proc");
+        return;
+    }
+    if (snprintf(path, sizeof(path), "%ld/fd/%d", (long) getpid(), helper) >=
+        (int) sizeof(path)) {
+        close(procfd);
+        close(helper);
+        TEST("openat2 prepares /proc/<pid>/fd helper");
+        FAIL("path too long");
+        return;
+    }
+    expect_fd_magiclink_rejected("dirfd /proc <pid>/fd", procfd, path);
+    close(procfd);
+    close(helper);
+}
+
+static void test_openat2_resolve_no_xdev_rejects_dev_fd_magiclink(void)
+{
+    TEST("openat2 RESOLVE_NO_XDEV rejects /dev/fd magic link");
+    int helper = open("/tmp", O_RDONLY | O_DIRECTORY);
+    if (helper < 0) {
+        FAIL("open /tmp helper");
+        return;
+    }
+    int dirfd = open("/dev", O_RDONLY | O_DIRECTORY);
+    if (dirfd < 0) {
+        close(helper);
+        FAIL("open /dev");
+        return;
+    }
+
+    char link_path[64];
+    snprintf(link_path, sizeof(link_path), "fd/%d", helper);
+    struct open_how how = {
+        .flags = O_RDONLY, .mode = 0, .resolve = RESOLVE_NO_XDEV};
+    long fd = syscall(SYS_openat2, dirfd, link_path, &how, sizeof(how));
+    close(dirfd);
+    close(helper);
+    if (fd >= 0) {
+        close((int) fd);
+        FAIL("expected EXDEV for /dev/fd magic-link traversal");
+        return;
+    }
+    EXPECT_TRUE(errno == EXDEV, "wrong errno");
+}
+
+static void test_openat2_rejects_dev_fd_magiclink_variants(void)
+{
+    int helper = open("/tmp", O_RDONLY | O_DIRECTORY);
+    if (helper < 0) {
+        TEST("openat2 prepares /dev/fd helper");
+        FAIL("open /tmp helper");
+        return;
+    }
+
+    char path[128];
+    if (snprintf(path, sizeof(path), "/dev/fd/%d", helper) >=
+        (int) sizeof(path)) {
+        close(helper);
+        TEST("openat2 prepares /dev/fd helper");
+        FAIL("path too long");
+        return;
+    }
+    expect_fd_magiclink_rejected("absolute /dev/fd", AT_FDCWD, path);
+
+    int rootfd = open("/", O_RDONLY | O_DIRECTORY);
+    if (rootfd < 0) {
+        close(helper);
+        TEST("openat2 prepares /dev/fd helper");
+        FAIL("open /");
+        return;
+    }
+    if (snprintf(path, sizeof(path), "dev/fd/%d", helper) >=
+        (int) sizeof(path)) {
+        close(rootfd);
+        close(helper);
+        TEST("openat2 prepares /dev/fd helper");
+        FAIL("path too long");
+        return;
+    }
+    expect_fd_magiclink_rejected("dirfd / dev/fd", rootfd, path);
+    close(rootfd);
+
+    int cwdfd = open(".", O_RDONLY | O_DIRECTORY);
+    if (cwdfd < 0) {
+        close(helper);
+        TEST("openat2 prepares /dev/fd helper");
+        FAIL("open cwd");
+        return;
+    }
+    if (chdir("/") < 0) {
+        close(cwdfd);
+        close(helper);
+        TEST("openat2 prepares /dev/fd helper");
+        FAIL("chdir /");
+        return;
+    }
+    expect_fd_magiclink_rejected("cwd / dev/fd", AT_FDCWD, path);
+    if (fchdir(cwdfd) < 0) {
+        close(cwdfd);
+        close(helper);
+        TEST("openat2 prepares /dev/fd helper");
+        FAIL("restore cwd");
+        return;
+    }
+    close(cwdfd);
+
+    int devfd = open("/dev", O_RDONLY | O_DIRECTORY);
+    if (devfd < 0) {
+        close(helper);
+        TEST("openat2 prepares /dev/fd helper");
+        FAIL("open /dev");
+        return;
+    }
+    if (snprintf(path, sizeof(path), "shm/../fd/%d", helper) >=
+        (int) sizeof(path)) {
+        close(devfd);
+        close(helper);
+        TEST("openat2 prepares /dev/fd helper");
+        FAIL("path too long");
+        return;
+    }
+    expect_fd_magiclink_rejected("dirfd /dev shm/../fd", devfd, path);
+    close(devfd);
+    close(helper);
+}
+
 static void test_openat2_resolve_no_xdev_rejects_normalized_proc_fd_magiclink(
     void)
 {
@@ -1024,6 +1305,13 @@ int main(void)
 
     /* openat2 RESOLVE_* */
     test_openat2_basic();
+    test_openat2_rejects_nonzero_how_extension();
+    test_openat2_rejects_oversized_how();
+    test_openat2_rejects_unknown_flags();
+    test_openat2_rejects_mode_without_create();
+    test_openat2_rejects_beneath_in_root();
+    test_openat2_rejects_directory_create();
+    test_openat2_rejects_tmpfile_readonly();
     test_openat2_resolve_beneath();
     test_openat2_resolve_beneath_allows_internal_dotdot();
     test_openat2_resolve_in_root_clamps_dotdot();
@@ -1045,6 +1333,9 @@ int main(void)
     test_openat2_resolve_no_xdev_rejects_symlink_to_proc();
     test_openat2_resolve_no_xdev_in_root_clamps_dotdot();
     test_openat2_resolve_no_xdev_rejects_proc_fd_magiclink();
+    test_openat2_rejects_proc_pid_fd_magiclink();
+    test_openat2_resolve_no_xdev_rejects_dev_fd_magiclink();
+    test_openat2_rejects_dev_fd_magiclink_variants();
     test_openat2_resolve_no_xdev_rejects_normalized_proc_fd_magiclink();
 
     /* O_PATH */