Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/syscall/abi.h
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,7 @@ typedef struct {
#define LINUX_O_TRUNC 0x0200
#define LINUX_O_APPEND 0x0400
#define LINUX_O_NONBLOCK 0x0800
#define LINUX_O_DSYNC 0x1000
#define LINUX_O_ASYNC 0x2000
/* aarch64-linux open flag values (from asm-generic/fcntl.h).
* These differ from x86_64-linux values.
Expand All @@ -397,7 +398,10 @@ typedef struct {
#define LINUX_O_LARGEFILE 0x20000 /* 0400000 octal, ignored on LP64 */
#define LINUX_O_NOATIME 0x40000 /* 01000000 octal */
#define LINUX_O_CLOEXEC 0x80000 /* 02000000 octal */
#define LINUX_O_SYNC 0x101000 /* __O_SYNC | O_DSYNC */
#define LINUX_O_PATH 0x200000 /* 010000000 octal */
#define LINUX___O_TMPFILE 0x400000 /* 020000000 octal */
/* Composite flag: the raw tmpfile bit combined with LINUX_O_DIRECTORY. */
#define LINUX_O_TMPFILE (LINUX___O_TMPFILE | LINUX_O_DIRECTORY)

/* Linux fallocate(2) mode bits (linux/falloc.h). PUNCH_HOLE requires the
* caller to also set KEEP_SIZE per the manpage; collapse/insert/zero/unshare
Expand Down
74 changes: 45 additions & 29 deletions src/syscall/path.c
Original file line number Diff line number Diff line change
Expand Up @@ -655,35 +655,6 @@ int path_openat2_normalize_in_root(const char *path, char *out, size_t outsz)
return 0;
}

/* Decide whether a guest path resolves onto a /proc/self/fd/ entry.
 *
 * dirfd: directory fd used to resolve relative paths.
 * path:  guest path; NULL yields 0.
 *
 * Absolute paths are only considered when they are "/proc" itself or
 * live below "/proc/"; they are canonicalised with
 * proc_seed_absolute_path(). Relative paths are resolved through
 * resolve_proc_at_path(), and any result <= 0 is treated as "not a
 * magic link".
 *
 * Returns 1 when the canonical form starts with "/proc/self/fd/",
 * otherwise 0.
 */
int path_openat2_is_proc_magiclink(guest_fd_t dirfd, const char *path)
{
    static const char fd_prefix[] = "/proc/self/fd/";
    char resolved[LINUX_PATH_MAX];

    if (path == NULL)
        return 0;

    if (path[0] != '/') {
        /* Relative form: let the proc resolver anchor it at dirfd. */
        if (resolve_proc_at_path(dirfd, path, resolved, sizeof(resolved)) <= 0)
            return 0;
        return strncmp(resolved, fd_prefix, sizeof(fd_prefix) - 1) == 0;
    }

    /* Absolute form: must be exactly "/proc" or start with "/proc/". */
    if (strncmp(path, "/proc", 5) != 0)
        return 0;
    if (path[5] != '\0' && path[5] != '/')
        return 0;

    size_t marks[PROC_PATH_COMPONENTS_MAX];
    size_t depth;
    if (proc_seed_absolute_path(path, resolved, sizeof(resolved), marks,
                                ARRAY_SIZE(marks), &depth) < 0)
        return 0;

    return strncmp(resolved, fd_prefix, sizeof(fd_prefix) - 1) == 0;
}

static int path_openat2_dirfd_host_path(guest_fd_t dirfd,
char *out,
size_t outsz)
Expand Down Expand Up @@ -909,6 +880,51 @@ static int dirfd_guest_base_path(guest_fd_t dirfd, char *out, size_t outsz)
return 0;
}

/* Test whether a normalized path (no leading slash in the forms matched
 * here) points below this process's procfs fd directory.
 *
 * Matches "proc/self/fd/..." directly, and "proc/<N>/fd/..." when <N>
 * parses as a base-10 integer equal to proc_get_pid().
 *
 * NOTE: the pid is parsed with strtoll(), which also accepts leading
 * whitespace and an optional sign. errno is reset to 0 before parsing
 * and may be left modified on return.
 */
static bool normalized_proc_self_fd_anchor(const char *path)
{
    static const char self_fd[] = "proc/self/fd/";
    static const char proc_dir[] = "proc/";

    if (strncmp(path, self_fd, sizeof(self_fd) - 1) == 0)
        return true;
    if (strncmp(path, proc_dir, sizeof(proc_dir) - 1) != 0)
        return false;

    const char *digits = path + (sizeof(proc_dir) - 1);
    char *rest;

    errno = 0;
    long long parsed = strtoll(digits, &rest, 10);

    /* No digits consumed, or value out of range: not a pid component. */
    if (rest == digits)
        return false;
    if (errno == ERANGE)
        return false;

    /* Only an explicit pid naming this process counts as an anchor. */
    if (parsed != (long long) proc_get_pid())
        return false;

    return !strncmp(rest, "/fd/", 4);
}

/* Report whether a guest path lands below an fd magic-link directory.
 *
 * dirfd: directory fd used as the base for relative paths.
 * path:  guest path; NULL yields false.
 *
 * Absolute paths are normalized directly with
 * path_openat2_normalize_in_root(). Relative paths are first joined onto
 * the guest base path of dirfd (dirfd_guest_base_path()) and then
 * normalized the same way. The prefix comparisons below assume the
 * normalizer emits a root-relative form without a leading slash.
 *
 * Returns true when the normalized path starts with "dev/fd/" or is
 * accepted by normalized_proc_self_fd_anchor(). Any failure while
 * building or normalizing the path (base lookup error, formatting error,
 * truncation, normalizer error) yields false.
 */
bool path_openat2_is_fd_magiclink_anchor(guest_fd_t dirfd, const char *path)
{
    if (!path)
        return false;

    char normalized[LINUX_PATH_MAX];

    if (path[0] == '/') {
        if (path_openat2_normalize_in_root(path, normalized,
                                           sizeof(normalized)) < 0)
            return false;
    } else {
        char base[LINUX_PATH_MAX];
        char joined[LINUX_PATH_MAX];

        if (dirfd_guest_base_path(dirfd, base, sizeof(base)) < 0)
            return false;

        /* snprintf() reports an output error with a negative value; in
         * that case the buffer contents are not usable, so treat it like
         * truncation instead of normalizing an indeterminate string.
         */
        int n = snprintf(joined, sizeof(joined), "%s/%s", base, path);
        if (n < 0 || (size_t) n >= sizeof(joined))
            return false;

        if (path_openat2_normalize_in_root(joined, normalized,
                                           sizeof(normalized)) < 0)
            return false;
    }

    return strncmp(normalized, "dev/fd/", 7) == 0 ||
           normalized_proc_self_fd_anchor(normalized);
}

/* Pop one trailing component from an absolute path, refusing to drop
* below the supplied floor length. floor_len is strlen of the walk root
* (1 == "/" for the bare-absolute case, dirfd-base length for IN_ROOT
Expand Down
2 changes: 1 addition & 1 deletion src/syscall/path.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ const char *path_resolve_sysroot_create_path(const char *path,

int path_openat2_stays_beneath(const char *path, bool clamp_at_root);
int path_openat2_normalize_in_root(const char *path, char *out, size_t outsz);
int path_openat2_is_proc_magiclink(guest_fd_t dirfd, const char *path);
bool path_openat2_is_fd_magiclink_anchor(guest_fd_t dirfd, const char *path);
int path_openat2_resolved_within_root(guest_fd_t dirfd,
const char *path,
uint64_t oflags,
Expand Down
107 changes: 83 additions & 24 deletions src/syscall/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -1362,6 +1362,69 @@ static int64_t sc_memfd_create(guest_t *g,
(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_CACHED)

/* Linux openat2() treats the user-supplied open_how size as an ABI
* version. The first published layout has three u64 fields
* (flags, mode, resolve), so v0 is 24 bytes. Bytes beyond that are
* future extension fields: all-zero tails are ignored, nonzero tails
* return E2BIG. Keep the same page-sized upper bound Linux applies
* before checking the tail.
*/
#define OPEN_HOW_SIZE_VER0 24
#define OPEN_HOW_MAX_SIZE 4096
Comment thread
jserv marked this conversation as resolved.

/* Verify that every byte of the guest open_how structure beyond the v0
 * layout is zero.
 *
 * g:       guest context handed to guest_read_small().
 * how_gva: guest address of the start of the open_how structure.
 * size:    caller-supplied structure size.
 *
 * Bytes in [OPEN_HOW_SIZE_VER0, size) are read in pieces of at most 64
 * bytes. Returns 0 when there is no tail or the tail is all zero,
 * -LINUX_E2BIG at the first nonzero byte, and -LINUX_EFAULT when the
 * guest address would wrap past UINT64_MAX or guest_read_small() fails.
 */
static int64_t openat2_check_zero_tail(guest_t *g,
                                       uint64_t how_gva,
                                       uint64_t size)
{
    uint8_t buf[64];

    /* A size equal to (or below) the v0 layout simply skips the loop. */
    for (uint64_t pos = OPEN_HOW_SIZE_VER0; pos < size;) {
        size_t want = (size_t) (size - pos);
        want = want > sizeof(buf) ? sizeof(buf) : want;

        /* Refuse addresses where how_gva + pos would overflow. */
        if (how_gva > UINT64_MAX - pos)
            return -LINUX_EFAULT;
        if (guest_read_small(g, how_gva + pos, buf, want) < 0)
            return -LINUX_EFAULT;

        size_t idx = 0;
        while (idx < want && buf[idx] == 0)
            idx++;
        if (idx != want)
            return -LINUX_E2BIG;

        pos += want;
    }

    return 0;
}

static bool openat2_flags_valid(uint64_t flags, uint64_t mode)
{
const uint64_t known_flags =
LINUX_O_ACCMODE | LINUX_O_CREAT | LINUX_O_EXCL | LINUX_O_NOCTTY |
LINUX_O_TRUNC | LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_DSYNC |
LINUX_O_ASYNC | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT |
LINUX_O_LARGEFILE | LINUX_O_NOATIME | LINUX_O_CLOEXEC | LINUX_O_SYNC |
LINUX_O_PATH | LINUX___O_TMPFILE;

if (flags & ~known_flags)
return false;
if ((flags & LINUX_O_PATH) &&
(flags & ~(LINUX_O_PATH | LINUX_O_CLOEXEC | LINUX_O_DIRECTORY |
LINUX_O_NOFOLLOW)))
return false;
if (flags & LINUX___O_TMPFILE)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: openat2_flags_valid() unconditionally rejects all O_TMPFILE usage instead of only invalid combinations, breaking legitimate O_TMPFILE calls

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At src/syscall/syscall.c, line 1408:

<comment>openat2_flags_valid() unconditionally rejects all O_TMPFILE usage instead of only invalid combinations, breaking legitimate O_TMPFILE calls</comment>

<file context>
@@ -1405,16 +1405,14 @@ static int openat2_flags_valid(uint64_t flags, uint64_t mode)
         return 0;
-    if ((flags & LINUX___O_TMPFILE) &&
-        (flags & LINUX_O_TMPFILE) != LINUX_O_TMPFILE)
+    if (flags & LINUX___O_TMPFILE)
         return 0;
-    if ((flags & LINUX_O_TMPFILE) == LINUX_O_TMPFILE &&
</file context>

return false;
if ((flags & (LINUX_O_DIRECTORY | LINUX_O_CREAT)) ==
(LINUX_O_DIRECTORY | LINUX_O_CREAT))
return false;
if (mode & ~07777ULL)
return false;
if (mode != 0 && !(flags & LINUX_O_CREAT))
return false;

return true;
}

static int64_t sc_openat2(guest_t *g,
uint64_t x0,
uint64_t x1,
Expand All @@ -1374,17 +1437,26 @@ static int64_t sc_openat2(guest_t *g,
(void) x4;
(void) x5;
(void) verbose;
if (x3 < 24)
if (x3 < OPEN_HOW_SIZE_VER0)
return -LINUX_EINVAL;
if (x3 > OPEN_HOW_MAX_SIZE)
return -LINUX_E2BIG;
uint64_t how[3];
if (guest_read_small(g, x2, how, sizeof(how)) < 0)
return -LINUX_EFAULT;
int64_t tail_rc = openat2_check_zero_tail(g, x2, x3);
if (tail_rc < 0)
return tail_rc;

uint64_t oflags = how[0], mode = how[1];
uint64_t resolve = how[2];

if (!openat2_flags_valid(oflags, mode))
return -LINUX_EINVAL;
if (resolve & ~(uint64_t) RESOLVE_ALL)
return -LINUX_EINVAL;
Comment thread
abnormal749 marked this conversation as resolved.
if ((resolve & RESOLVE_BENEATH) && (resolve & RESOLVE_IN_ROOT))
return -LINUX_EINVAL;

/* RESOLVE_CACHED asks the kernel to satisfy lookup from cache only.
* elfuse has no dentry cache, so report EAGAIN and let the guest retry
Expand All @@ -1403,6 +1475,10 @@ static int64_t sc_openat2(guest_t *g,
if (guest_read_str(g, x1, path, sizeof(path)) < 0)
return -LINUX_EFAULT;

if ((resolve & (RESOLVE_NO_SYMLINKS | RESOLVE_NO_MAGICLINKS)) &&
path_openat2_is_fd_magiclink_anchor((int) x0, path))
return -LINUX_ELOOP;

if (resolve & RESOLVE_NO_SYMLINKS) {
if (sys_path_has_symlink((int) x0, path) < 0) {
if (errno == ELOOP)
Expand All @@ -1425,32 +1501,15 @@ static int64_t sc_openat2(guest_t *g,
!path_openat2_stays_beneath(path, true))
return -LINUX_EXDEV;

if ((resolve & RESOLVE_NO_MAGICLINKS) &&
path_openat2_is_proc_magiclink((int) x0, path))
return -LINUX_ELOOP;

int no_xdev_start_class = -1;
if (resolve & RESOLVE_NO_XDEV) {
/* A /proc/self/fd/N traversal follows a magic link out of procfs
* into whatever mount holds the target fd, which is a mount
* crossing under Linux NO_XDEV. The post-open class check cannot
* detect this because procemu stamps the resulting fd's proc_path
* with the symbolic /proc/self/fd/N path, hiding the real landing
* mount. Reject the traversal up front.
* path_openat2_is_proc_magiclink normalizes both absolute and
* dirfd/cwd-relative forms against /proc, so /proc/self/fd/N,
* "self/fd/N" / "fd/N" from a /proc or /proc/self anchor, and
* traversals like "task/../fd/N" all collapse onto the same
* /proc/self/fd/N candidate.
*
* Not yet detected: /dev/fd/N (procemu intercepts this as an
* alias for /proc/self/fd/N and dup()s the underlying fd) and
* /proc/<pid>/fd/N with an explicit pid. A NO_XDEV resolution
* landing on either is still a real cross, but the precheck has
* no normalization path for /dev or for explicit-pid procfs
* anchors yet, so the bypass remains.
/* /proc/self/fd/N, /proc/<self_pid>/fd/N, and /dev/fd/N all
* traverse fd magic links into whatever mount holds the target fd.
* Reject the normalized anchor up front because procemu can stamp
* the resulting fd with the symbolic proc/dev path, hiding the real
* landing mount from the post-open class check.
*/
if (path_openat2_is_proc_magiclink((int) x0, path))
if (path_openat2_is_fd_magiclink_anchor((int) x0, path))
return -LINUX_EXDEV;
int crossed = path_openat2_crosses_mount(
(int) x0, path, (resolve & RESOLVE_IN_ROOT) != 0,
Expand Down
Loading
Loading