From 3b4f5015e26401b2ce3188853d814e044b4ebf32 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Mon, 8 Jun 2026 04:35:55 +0800 Subject: [PATCH 1/7] Emulate Linux PTY ioctls and /dev/pts/N paths sys_ioctl previously had no case for TIOCSWINSZ, so foot's initial master-side resize hit the default -ENOTTY arm and aborted terminal startup. The minimal one-line fix is insufficient on macOS: the host /dev/ptmx master is not itself a tty. TIOCSWINSZ / TIOCGWINSZ on the bare master return ENOTTY until something has opened the corresponding slave at least once, and the stored winsize gets cleared whenever the slave refcount drops to zero (verified empirically on macOS 15). Linux ptmx masters are tty fds in their own right, so guests assume those ioctls work without an open slave. pty_open_master bridges the gap by eagerly opening one slave host fd that elfuse holds for the lifetime of the master and never exposes to the guest. A per-master side table records ({master,slave}_host_fd, linux_pts_num, slave_path). fd_cleanup_entry drops the keepalive when the master closes; sys_close's fast-close branch also calls proc_pty_close_keepalive so single-thread closes do not bypass the slow path. duplicate_guest_fd mirrors the keepalive via a dup-under-lock so dup/dup2/fcntl(F_DUPFD) aliases each keep a slave reference, then registers BEFORE fd_alloc publishes the new guest fd so a sibling close racing the install drops the duped keepalive too. Guest /dev/pts/N opens and stats resolve through the captured ptsname(3) string rather than a /dev/ttys%03lu reformat that breaks on any future macOS naming change or unusual minor encoding. The synthesized stat publishes st_rdev = (136 << 24) | minor in the macOS encoding so the fs-stat translation layer (mac_to_linux_dev) yields a Linux dev_t whose major(rdev) equals UNIX98_PTY_SLAVE_MAJOR, satisfying glibc ptsname's device-type check. pty_open_master fails the ptmx open with EMFILE when the table is full instead of returning a master fd whose pts number cannot round-trip through /dev/pts/N. /dev/pts is added to path_might_use_stat_intercept. Four new ioctls: TIOCSWINSZ passes through to the host (now valid because the keepalive opened the slave); TIOCGPTN reports the captured pts number; TIOCSPTLCK(0) maps to host unlockpt(3) (lock-after-unlock is rejected because macOS exposes no re-lock primitive); TIOCGPTPEER opens the captured slave path with translate_open_flags and rejects unsupported bits with EINVAL. TIOCSWINSZ / TIOCGWINSZ / TIOCGPTN defensively call proc_pty_master_adopt(guest_fd) first so a master received via SCM_RIGHTS lazily registers its own keepalive before the host ioctl. Adopt uses fd_snapshot_and_dup to atomically pin the canonical (host_fd, generation) with a probe dup, performs the slave open against the probe, then re-validates and inserts under joint fd_lock + pty_keepalive_lock so a sibling close+recycle between validation and insert cannot attach the keepalive to the wrong file. pty_keepalive_register_locked returns the existing pts_num atomically on duplicate so the idempotent path never re-reads under the lock-free race window. Fork IPC propagates the keepalive table. fork_ipc_send_pty_keepalives walks proc_pty_snapshot_keepalive output, matches each entry's master_host_fd against the parent's fd_table to recover the guest fd that is stable across the IPC, and ships (guest_fd, linux_pts_num, slave_path) records plus an SCM_RIGHTS batch of dup'd slave fds. fork_ipc_recv_pty_keepalives resolves each guest_fd through the child's just-installed fd_table to recover the child-side master host fd, then calls proc_pty_restore_keepalive which sets FD_CLOEXEC on the inherited slave and registers the pair under the wire-transmitted linux_pts_num (no reparse of the path string). Both phases run after fork_ipc_{send, recv}_fd_table so the guest-fd-to-host-fd lookup exists in both directions. Close #88 --- src/runtime/fork-state.c | 165 +++++++- src/runtime/fork-state.h | 11 + src/runtime/forkipc.c | 18 + src/runtime/procemu.c | 799 +++++++++++++++++++++++++++++++++++++-- src/runtime/procemu.h | 78 ++++ src/syscall/abi.h | 8 + src/syscall/fdtable.c | 7 + src/syscall/fs.c | 49 ++- src/syscall/io.c | 119 ++++++ src/syscall/path.c | 7 + tests/manifest.txt | 1 + tests/test-pty.c | 410 ++++++++++++++++++++ 12 files changed, 1624 insertions(+), 48 deletions(-) create mode 100644 tests/test-pty.c diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index 1c31bc0..ad283f9 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -17,6 +17,7 @@ #include "utils.h" #include "runtime/fork-state.h" +#include "runtime/procemu.h" #include "debug/log.h" #include "syscall/abi.h" @@ -447,6 +448,162 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g) return 0; } +/* Wire payload for one pty keepalive. The slave fd travels separately via + * SCM_RIGHTS; the parent's master_host_fd is intentionally omitted because the + * child's number will differ. The child re-derives it from fd_table[gfd]. + */ +typedef struct { + int32_t guest_fd; + uint32_t linux_pts_num; + char slave_path[64]; +} ipc_pty_keepalive_t; + +int fork_ipc_send_pty_keepalives(int ipc_sock) +{ + /* PTY_KEEPALIVE_MAX upper bound on entries; allocate to that. */ + proc_pty_ipc_entry_t snapshot[256]; + int snapshot_slave_fds[256]; + const int max_entries = (int) (sizeof(snapshot) / sizeof(snapshot[0])); + int num_snap = + proc_pty_snapshot_keepalive(snapshot, snapshot_slave_fds, max_entries); + + /* Match each keepalive's master_host_fd against a live fd_table entry to + * recover the guest_fd, which is the stable identifier across fork. + */ + ipc_pty_keepalive_t payload[256]; + int payload_slave_fds[256]; + uint32_t num_send = 0; + + pthread_mutex_lock(&fd_lock); + for (int i = 0; i < num_snap; i++) { + int matched_gfd = -1; + for (int gfd = 0; gfd < FD_TABLE_SIZE; gfd++) { + if (fd_table[gfd].type == FD_CLOSED) + continue; + if (fd_table[gfd].host_fd != snapshot[i].master_host_fd) + continue; + if (fd_type_is_synthetic(fd_table[gfd].type)) + continue; + matched_gfd = gfd; + break; + } + + if (matched_gfd < 0) { + /* Master was closed between snapshot and lookup, or never tracked + * in the guest fd table (defensive). Drop the duped slave; nothing + * else holds it. + */ + close(snapshot_slave_fds[i]); + continue; + } + + payload[num_send].guest_fd = matched_gfd; + payload[num_send].linux_pts_num = snapshot[i].linux_pts_num; + _Static_assert( + sizeof(payload[0].slave_path) == sizeof(snapshot[0].slave_path), + "keepalive slave_path size must match payload"); + memcpy(payload[num_send].slave_path, snapshot[i].slave_path, + sizeof(payload[0].slave_path)); + payload_slave_fds[num_send] = snapshot_slave_fds[i]; + num_send++; + } + pthread_mutex_unlock(&fd_lock); + + if (fork_ipc_write_all(ipc_sock, &num_send, sizeof(num_send)) < 0) + goto fail; + if (num_send > 0) { + if (fork_ipc_write_all(ipc_sock, payload, + num_send * sizeof(payload[0])) < 0) + goto fail; + if (fork_ipc_send_fds(ipc_sock, payload_slave_fds, (int) num_send) < + 0) { + log_error("clone: failed to send pty keepalive fds"); + goto fail; + } + } + + for (uint32_t i = 0; i < num_send; i++) + close(payload_slave_fds[i]); + return 0; + +fail: + for (uint32_t i = 0; i < num_send; i++) + close(payload_slave_fds[i]); + return -1; +} + +int fork_ipc_recv_pty_keepalives(int ipc_fd) +{ + uint32_t num; + if (fork_ipc_read_all(ipc_fd, &num, sizeof(num)) < 0) { + log_error("fork-child: failed to read pty keepalive count"); + return -1; + } + if (num == 0) + return 0; + if (num > FD_TABLE_SIZE) { + log_error("fork-child: pty keepalive count %u exceeds FD_TABLE_SIZE", + num); + return -1; + } + + ipc_pty_keepalive_t *payload = calloc(num, sizeof(*payload)); + if (!payload) + return -1; + + if (fork_ipc_read_all(ipc_fd, payload, num * sizeof(*payload)) < 0) { + free(payload); + return -1; + } + + int *slave_fds = calloc(num, sizeof(int)); + if (!slave_fds) { + free(payload); + return -1; + } + int got = 0; + if (fork_ipc_recv_fds(ipc_fd, slave_fds, (int) num, &got) < 0 || + got != (int) num) { + log_error("fork-child: pty keepalive recv mismatch: got %d expected %u", + got, num); + for (int i = 0; i < got; i++) + close(slave_fds[i]); + free(slave_fds); + free(payload); + return -1; + } + + for (uint32_t i = 0; i < num; i++) { + int gfd = payload[i].guest_fd; + int child_master = -1; + if (RANGE_CHECK(gfd, 0, FD_TABLE_SIZE)) { + pthread_mutex_lock(&fd_lock); + if (fd_table[gfd].type != FD_CLOSED) + child_master = fd_table[gfd].host_fd; + pthread_mutex_unlock(&fd_lock); + } + if (child_master < 0) { + /* Master fd did not survive the fd_table batch (synthetic-type + * filter, or the slot was rejected). Drop the keepalive cleanly. + */ + close(slave_fds[i]); + continue; + } + + /* Force-NUL the path before passing it on so a malformed sender cannot + * trick the child into reading past the buffer. + */ + payload[i].slave_path[sizeof(payload[i].slave_path) - 1] = '\0'; + proc_pty_restore_keepalive(child_master, slave_fds[i], + payload[i].linux_pts_num, + payload[i].slave_path); + } + + free(slave_fds); + free(payload); + return 0; +} + static int fork_ipc_send_backing_fds(int ipc_sock, const guest_region_t *regions_snapshot, uint32_t num_guest_regions) @@ -714,12 +871,14 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig) log_error("fork-child: failed to read region count"); return -1; } + uint8_t regions_tracker_stale = 0; if (fork_ipc_read_all(ipc_fd, ®ions_tracker_stale, sizeof(regions_tracker_stale)) < 0) { log_error("fork-child: failed to read region tracker state"); return -1; } + uint32_t recv_regions = num_guest_regions; if (recv_regions > GUEST_MAX_REGIONS) recv_regions = GUEST_MAX_REGIONS; @@ -729,15 +888,17 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig) log_error("fork-child: failed to read regions"); return -1; } + /* Drain any excess records the parent serialized beyond the local cap. * Without this drain, the next read (num_preannounced) consumes stale - * region bytes and desynchronizes the rest of the IPC payload. Mirrors - * the preannounced-region drain below. + * region bytes and desynchronizes the rest of the IPC payload. Mirrors the + * preannounced-region drain below. */ if (num_guest_regions > recv_regions && fork_ipc_drain_bytes(ipc_fd, (num_guest_regions - recv_regions) * sizeof(guest_region_t)) < 0) return -1; + g->nregions = (int) recv_regions; g->regions_tracker_stale = (regions_tracker_stale != 0) || (num_guest_regions > recv_regions); diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h index e9de0bc..27c2080 100644 --- a/src/runtime/fork-state.h +++ b/src/runtime/fork-state.h @@ -99,6 +99,17 @@ int fork_ipc_recv_memory_regions(int ipc_fd, guest_t *g); int fork_ipc_send_fd_table(int ipc_sock); int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g); +/* Carry the /dev/ptmx keepalive slave fds across the fork boundary. The fd + * table batch sends master fds without their hidden keepalive companions, so + * a child that inherits a master would otherwise hit the macOS ENOTTY / + * winsize-reset cliff that proc_pty_close_keepalive papers over in the + * parent. send/recv must run AFTER fork_ipc_send_fd_table / _recv_fd_table + * so the child can look up its new master host fd from the just-installed + * fd_table entry. + */ +int fork_ipc_send_pty_keepalives(int ipc_sock); +int fork_ipc_recv_pty_keepalives(int ipc_fd); + int fork_ipc_send_process_state(int ipc_sock, const guest_region_t *regions_snapshot, uint32_t num_guest_regions, diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index aef1d24..7b89dcc 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -261,6 +261,15 @@ int fork_child_main(int ipc_fd, return 1; } + /* Must follow fork_ipc_recv_fd_table: the keepalive recv resolves each + * payload guest_fd to its (now installed) child-side host master fd. + */ + if (fork_ipc_recv_pty_keepalives(ipc_fd) < 0) { + log_error("fork-child: failed to receive pty keepalives"); + guest_destroy(&g); + return 1; + } + signal_state_t sig; if (fork_ipc_recv_process_state(ipc_fd, &g, &sig) < 0) { log_error("fork-child: failed to receive process state"); @@ -1598,6 +1607,15 @@ int64_t sys_clone(hv_vcpu_t vcpu, goto fail_snapshot; } + /* Must follow fork_ipc_send_fd_table because the keepalive payload + * carries a guest_fd that the child resolves through its just-installed + * fd_table to recover the child-side master host fd. + */ + if (fork_ipc_send_pty_keepalives(ipc_sock) < 0) { + log_error("clone: failed to send pty keepalives"); + goto fail_snapshot; + } + uint32_t num_guest_regions = (uint32_t) nregions_snapshot; uint32_t num_preannounced = (uint32_t) npreannounced_snapshot; if (fork_ipc_send_process_state( diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index c1b2e31..9666585 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -19,6 +19,7 @@ */ #define MAPS_NAME_COLUMN 73 +#include #include #include #include @@ -40,6 +41,8 @@ #include #include #include +#include +#include #include "utils.h" @@ -1479,11 +1482,620 @@ static void proc_task_collect_cb(thread_entry_t *t, void *arg) c->tids[c->ntids++] = t->guest_tid; } +/* Pseudoterminal master side-table. + * + * Bridges two host vs guest mismatches in one place: + * + * 1. The macOS /dev/ptmx master is not itself a tty. TIOCSWINSZ / TIOCGWINSZ + * on the bare master return ENOTTY until something has opened the + * corresponding slave once, and the stored winsize gets cleared whenever + * the slave refcount drops to zero (verified empirically on macOS 15). + * Linux ptmx masters are tty fds in their own right, so guests assume those + * ioctls work without an open slave. To bridge the gap, every /dev/ptmx + * open eagerly opens one slave host fd that elfuse holds for the lifetime + * of the master and never exposes to the guest. + * + * 2. macOS slaves live at /dev/ttysNNN; Linux glibc looks for /dev/pts/N where + * N comes from TIOCGPTN. Guest opens of /dev/pts/N route back to the + * macOS path captured from ptsname(3) at /dev/ptmx open time, not a + * re-formatted guess, so format changes in macOS (or unusual minor + * encodings) cannot strand the guest with the wrong slave. + * + * Entries are keyed by the host master fd because that is what fd_cleanup_entry + * has when the guest closes a master. Capacity matches the macOS default + * UNIX98 slave count; overflow leaves the entry empty and the guest gets the + * pre-fix degraded behavior for that one pair instead of an open failure. + */ +#define PTY_KEEPALIVE_MAX 256 +#define PTY_KEEPALIVE_FREE (-1) +/* PTY_SLAVE_PATH_MAX lives in procemu.h so this table and the fork-IPC + * payload (proc_pty_ipc_entry_t) cannot drift apart. + */ +static struct { + int master_host_fd; + int slave_host_fd; + uint32_t linux_pts_num; + char slave_path[PTY_SLAVE_PATH_MAX]; +} pty_keepalive_table[PTY_KEEPALIVE_MAX]; +static pthread_mutex_t pty_keepalive_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_once_t pty_keepalive_once = PTHREAD_ONCE_INIT; + +static void pty_keepalive_init(void) +{ + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + pty_keepalive_table[i].master_host_fd = PTY_KEEPALIVE_FREE; + pty_keepalive_table[i].slave_host_fd = PTY_KEEPALIVE_FREE; + pty_keepalive_table[i].linux_pts_num = 0; + pty_keepalive_table[i].slave_path[0] = '\0'; + } +} + +static uint32_t pty_extract_pts_num(const char *slave_path) +{ + /* macOS canonical slave paths are /dev/ttysNNN with a decimal tail. Read + * the longest decimal suffix and return it as the Linux pts number used + * by guest /dev/pts/N. Returns UINT32_MAX on parse failure so callers + * can reject ambiguous names rather than silently aliasing. + */ + if (!slave_path) + return UINT32_MAX; + const char *p = slave_path + strlen(slave_path); + while (p > slave_path && isdigit((unsigned char) p[-1])) + p--; + if (!*p || !isdigit((unsigned char) *p)) + return UINT32_MAX; + char *endp; + unsigned long n = strtoul(p, &endp, 10); + if (endp == p || *endp != '\0' || n > UINT32_MAX) + return UINT32_MAX; + return (uint32_t) n; +} + +/* Result codes for the locked register helper. */ +#define PTY_REG_INSERTED 0 /* new entry installed */ +#define PTY_REG_EXISTS 1 /* a matching entry already existed */ +#define PTY_REG_FULL (-1) /* table out of free slots */ + +/* Caller-holds-lock variant. Returns one of PTY_REG_* and, on PTY_REG_EXISTS, + * writes the existing entry's pts number to *existing_pts_num. The lock-held + * variant exists so proc_pty_master_adopt can atomically pair fd-table slot + * validation with keepalive insertion under fd_lock + pty_keepalive_lock, + * eliminating the race window where a sibling close+recycle between validate + * and register would attach the keepalive to the wrong file. + */ +static int pty_keepalive_register_locked(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path, + uint32_t *existing_pts_num) +{ + int free_slot = -1; + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd == master_host_fd) { + if (existing_pts_num) + *existing_pts_num = pty_keepalive_table[i].linux_pts_num; + return PTY_REG_EXISTS; + } + if (free_slot < 0 && + pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE) + free_slot = i; + } + if (free_slot < 0) + return PTY_REG_FULL; + pty_keepalive_table[free_slot].master_host_fd = master_host_fd; + pty_keepalive_table[free_slot].slave_host_fd = slave_host_fd; + pty_keepalive_table[free_slot].linux_pts_num = linux_pts_num; + if (slave_path) { + strncpy(pty_keepalive_table[free_slot].slave_path, slave_path, + PTY_SLAVE_PATH_MAX - 1); + pty_keepalive_table[free_slot].slave_path[PTY_SLAVE_PATH_MAX - 1] = + '\0'; + } else { + pty_keepalive_table[free_slot].slave_path[0] = '\0'; + } + return PTY_REG_INSERTED; +} + +/* Lock-acquiring convenience wrapper used by the open-time and fork-restore + * paths where atomicity with fd_table is not required. Returns 0 on success + * (including PTY_REG_EXISTS, in which case the caller should close its own + * redundant slave_host_fd), -1 with errno set on table-full (ENOSPC). + */ +static int pty_keepalive_register(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path) +{ + pthread_once(&pty_keepalive_once, pty_keepalive_init); + pthread_mutex_lock(&pty_keepalive_lock); + int rc = pty_keepalive_register_locked(master_host_fd, slave_host_fd, + linux_pts_num, slave_path, NULL); + pthread_mutex_unlock(&pty_keepalive_lock); + if (rc == PTY_REG_FULL) { + errno = ENOSPC; + return -1; + } + if (rc == PTY_REG_EXISTS) + errno = EEXIST; + return 0; +} + +uint32_t proc_pty_master_pts_num(int master_host_fd) +{ + if (master_host_fd < 0) + return UINT32_MAX; + pthread_once(&pty_keepalive_once, pty_keepalive_init); + uint32_t pts_num = UINT32_MAX; + pthread_mutex_lock(&pty_keepalive_lock); + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd == master_host_fd) { + pts_num = pty_keepalive_table[i].linux_pts_num; + break; + } + } + pthread_mutex_unlock(&pty_keepalive_lock); + return pts_num; +} + +/* Re-validate that fd_table[guest_fd] still refers to (host_fd, generation). + * Returns true when both match the snapshot, false otherwise (slot closed or + * recycled). Used by proc_pty_master_adopt to bracket every host-fd-number + * access against the closing-and-reuse race. + */ +static bool pty_fd_still_canonical(int guest_fd, + int canonical_host_fd, + uint64_t canonical_gen) +{ + fd_entry_t snap; + if (!fd_snapshot(guest_fd, &snap)) + return false; + return snap.host_fd == canonical_host_fd && + snap.generation == canonical_gen; +} + +uint32_t proc_pty_master_adopt(int guest_fd) +{ + /* Step 1: atomically snapshot (host_fd, generation) and dup the + * canonical fd in a single fd_lock window. fd_snapshot_and_dup pins + * the file object behind the canonical host fd, so even if a sibling + * closes the guest fd and the host fd number is recycled by an + * unrelated open, host syscalls against the probe still operate on + * the right tty. The generation captured here is the witness for the + * subsequent table lookup and register validations. + */ + fd_entry_t snap; + int probe = fd_snapshot_and_dup(guest_fd, &snap); + if (probe < 0) + return UINT32_MAX; + int canonical_host_fd = snap.host_fd; + uint64_t canonical_gen = snap.generation; + + /* Fast path: a keepalive was already registered for this canonical fd + * (typical case for /dev/ptmx opens that went through pty_open_master). + * The keepalive table is keyed by host fd number, so re-validate the + * slot identity before trusting the returned pts_num. If the fd has + * been recycled to a different file (generation mismatch), the + * existing entry belongs to that file, not ours, and the slow path + * below must register a fresh entry for our pinned probe. + */ + uint32_t existing = proc_pty_master_pts_num(canonical_host_fd); + if (existing != UINT32_MAX && + pty_fd_still_canonical(guest_fd, canonical_host_fd, canonical_gen)) { + close(probe); + return existing; + } + + /* Step 2: confirm the file really is a /dev/ptmx master. ptsname(3) + * returns NULL/ENOTTY on non-pty descriptors, so a stray TIOCGPTN + * against a regular file is rejected without any side effect. + */ + char slave_path[PTY_SLAVE_PATH_MAX]; + uint32_t pts_num = UINT32_MAX; + int slave = -1; + if (ptsname_r(probe, slave_path, sizeof(slave_path)) != 0) + goto out; + pts_num = pty_extract_pts_num(slave_path); + if (pts_num == UINT32_MAX) + goto out; + + /* unlockpt(3) is harmless if the sender already unlocked. EINVAL means + * already unlocked; anything else means the slave will not open and + * we give up cleanly. + */ + if (unlockpt(probe) < 0 && errno != EINVAL) { + pts_num = UINT32_MAX; + goto out; + } + slave = open(slave_path, O_RDWR | O_NOCTTY | O_CLOEXEC); + if (slave < 0) { + pts_num = UINT32_MAX; + goto out; + } + + /* Step 3: re-validate AND publish under the joint pty_keepalive_lock + + * fd_lock window. Lock order is pty_keepalive_lock first; + * duplicate_guest_fd uses the same order when bracketing + * fd_snapshot_and_dup + proc_pty_dup_keepalive_locked, so the two paths + * cannot deadlock. With both held, no sibling can flip the fd_table slot + * between the validation read and the keepalive insert, so the keepalive + * cannot attach to a recycled canonical host fd. + */ + pthread_once(&pty_keepalive_once, pty_keepalive_init); + pthread_mutex_lock(&pty_keepalive_lock); + pthread_mutex_lock(&fd_lock); + if (fd_table[guest_fd].type == FD_CLOSED || + fd_table[guest_fd].host_fd != canonical_host_fd || + fd_table[guest_fd].generation != canonical_gen) { + pthread_mutex_unlock(&fd_lock); + pthread_mutex_unlock(&pty_keepalive_lock); + close(slave); + pts_num = UINT32_MAX; + goto out; + } + uint32_t existing_pts = UINT32_MAX; + int rc = pty_keepalive_register_locked(canonical_host_fd, slave, pts_num, + slave_path, &existing_pts); + pthread_mutex_unlock(&fd_lock); + pthread_mutex_unlock(&pty_keepalive_lock); + if (rc == PTY_REG_FULL) { + close(slave); + pts_num = UINT32_MAX; + } else if (rc == PTY_REG_EXISTS) { + /* Another adopter registered first; their slave keeps the tty alive. + * The pts_num came from the locked scan above, so it is the value the + * winning entry holds and is not subject to a lookup-after-recycle + * race. + */ + close(slave); + pts_num = existing_pts; + } + +out: + close(probe); + return pts_num; +} + +/* Look up the captured macOS slave path for a Linux pts number. Returns 0 and + * writes the path on hit, -1 with errno=ENOENT on miss. Used by the /dev/pts/N + * open and stat intercepts so they hit the exact path returned by ptsname(3) + * rather than a guessed /dev/ttys%03lu reformat that breaks if macOS changes + * its naming scheme or uses an unexpected minor encoding. + */ +static int pty_lookup_slave_path(uint32_t linux_pts_num, + char *out, + size_t out_sz) +{ + if (!out || out_sz == 0) { + errno = EINVAL; + return -1; + } + pthread_once(&pty_keepalive_once, pty_keepalive_init); + pthread_mutex_lock(&pty_keepalive_lock); + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE && + pty_keepalive_table[i].linux_pts_num == linux_pts_num) { + size_t len = strlen(pty_keepalive_table[i].slave_path); + if (len >= out_sz) { + pthread_mutex_unlock(&pty_keepalive_lock); + errno = ENAMETOOLONG; + return -1; + } + memcpy(out, pty_keepalive_table[i].slave_path, len + 1); + pthread_mutex_unlock(&pty_keepalive_lock); + return 0; + } + } + pthread_mutex_unlock(&pty_keepalive_lock); + errno = ENOENT; + return -1; +} + +static int pty_open_pts_dir(int linux_flags) +{ + char dir[80]; + uint32_t pts_nums[PTY_KEEPALIVE_MAX]; + int pts_count = 0; + int n = snprintf(dir, sizeof(dir), "/tmp/elfuse-pts-XXXXXX"); + if (n < 0 || (size_t) n >= sizeof(dir)) { + errno = ENAMETOOLONG; + return -1; + } + if (!mkdtemp(dir)) + return -1; + + pthread_once(&pty_keepalive_once, pty_keepalive_init); + pthread_mutex_lock(&pty_keepalive_lock); + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE) + continue; + pts_nums[pts_count++] = pty_keepalive_table[i].linux_pts_num; + } + pthread_mutex_unlock(&pty_keepalive_lock); + + for (int i = 0; i < pts_count; i++) { + char entry[160]; + int en = snprintf(entry, sizeof(entry), "%s/%u", dir, pts_nums[i]); + if (en <= 0 || (size_t) en >= sizeof(entry)) + continue; + int tfd = open(entry, O_CREAT | O_WRONLY, 0444); + if (tfd >= 0) + close(tfd); + } + + pthread_once(&proc_scratch_atexit_once, proc_scratch_register_atexit); + + pthread_mutex_lock(&proc_scratch_lock); + if (proc_scratch_dirs_count < PROC_SCRATCH_DIRS_MAX) { + str_copy_trunc(proc_scratch_dirs[proc_scratch_dirs_count++], dir, + sizeof(proc_scratch_dirs[0])); + } + pthread_mutex_unlock(&proc_scratch_lock); + + int fd = proc_open_dir_fd(dir, linux_flags); + if (fd < 0) { + int saved = errno; + proc_scratch_remove_one(dir); + errno = saved; + } + return fd; +} + +void proc_pty_lock_for_dup(void) +{ + pthread_once(&pty_keepalive_once, pty_keepalive_init); + pthread_mutex_lock(&pty_keepalive_lock); +} + +void proc_pty_unlock_for_dup(void) +{ + pthread_mutex_unlock(&pty_keepalive_lock); +} + +void proc_pty_dup_keepalive_locked(int src_master_host_fd, + int dst_master_host_fd) +{ + /* Caller-holds-lock variant. Used by duplicate_guest_fd which brackets + * the source snapshot, host dup, and this mirror inside one + * pty_keepalive_lock window so a sibling close cannot run + * proc_pty_close_keepalive between the snapshot and the mirror -- the + * exact race that would leave the alias without a keepalive entry. + */ + if (src_master_host_fd < 0 || dst_master_host_fd < 0) + return; + + int dst_slave = -1; + uint32_t src_pts_num = UINT32_MAX; + char src_slave_path[PTY_SLAVE_PATH_MAX] = {0}; + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd == src_master_host_fd) { + dst_slave = dup(pty_keepalive_table[i].slave_host_fd); + src_pts_num = pty_keepalive_table[i].linux_pts_num; + memcpy(src_slave_path, pty_keepalive_table[i].slave_path, + PTY_SLAVE_PATH_MAX); + break; + } + } + if (dst_slave < 0) + return; + + /* dup(2) clears FD_CLOEXEC; the keepalive must not survive exec into + * a guest child that has no map back to it. + */ + int fdflags = fcntl(dst_slave, F_GETFD); + if (fdflags < 0 || fcntl(dst_slave, F_SETFD, fdflags | FD_CLOEXEC) < 0) { + close(dst_slave); + return; + } + uint32_t existing_pts = UINT32_MAX; + int rc = pty_keepalive_register_locked(dst_master_host_fd, dst_slave, + src_pts_num, src_slave_path, + &existing_pts); + if (rc != PTY_REG_INSERTED) { + /* Table full or duplicate entry for dst_master_host_fd; drop the + * redundant slave. Duplicate is unexpected: dst is a freshly-duped + * host fd that should not already be in the table unless a prior + * close skipped proc_pty_close_keepalive. + */ + close(dst_slave); + } +} + +void proc_pty_close_keepalive(int master_host_fd) +{ + /* fd_cleanup_entry calls this for every guest fd close, not just pty + * masters. Force the table to its FREE-sentinel state on first use: without + * this, the BSS-zero initial state lets the first close of host fd 0 (e.g. + * guest stdin) match slot 0 (master_host_fd=0) and close fd 0 inside + * elfuse. pthread_once is idempotent and cheap after the flag is set. + */ + pthread_once(&pty_keepalive_once, pty_keepalive_init); + if (master_host_fd < 0) + return; + + int slave = -1; + pthread_mutex_lock(&pty_keepalive_lock); + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd == master_host_fd) { + slave = pty_keepalive_table[i].slave_host_fd; + pty_keepalive_table[i].master_host_fd = PTY_KEEPALIVE_FREE; + pty_keepalive_table[i].slave_host_fd = PTY_KEEPALIVE_FREE; + pty_keepalive_table[i].linux_pts_num = 0; + pty_keepalive_table[i].slave_path[0] = '\0'; + break; + } + } + pthread_mutex_unlock(&pty_keepalive_lock); + if (slave >= 0) + close(slave); +} + +int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries, + int *out_slave_fds, + int max_entries) +{ + if (!out_entries || !out_slave_fds || max_entries <= 0) + return 0; + + pthread_once(&pty_keepalive_once, pty_keepalive_init); + int n = 0; + pthread_mutex_lock(&pty_keepalive_lock); + for (int i = 0; i < PTY_KEEPALIVE_MAX && n < max_entries; i++) { + if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE) + continue; + + /* dup under the lock so the slave fd cannot be closed and the host fd + * number recycled before SCM_RIGHTS reads it. The caller closes the dup + * after the send completes. + */ + int duped = dup(pty_keepalive_table[i].slave_host_fd); + if (duped < 0) + continue; + + out_entries[n].master_host_fd = pty_keepalive_table[i].master_host_fd; + out_entries[n].linux_pts_num = pty_keepalive_table[i].linux_pts_num; + _Static_assert(sizeof(out_entries[n].slave_path) == PTY_SLAVE_PATH_MAX, + "ipc slave_path size must match keepalive table"); + memcpy(out_entries[n].slave_path, pty_keepalive_table[i].slave_path, + PTY_SLAVE_PATH_MAX); + out_slave_fds[n] = duped; + n++; + } + pthread_mutex_unlock(&pty_keepalive_lock); + return n; +} + +void proc_pty_restore_keepalive(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path) +{ + /* fork-IPC hand-off: the child just installed its own host fd for the pty + * master (a fresh kernel fd, not the parent's number) and received the + * keepalive slave fd via SCM_RIGHTS. Stitch the pair back together under + * the same (linux_pts_num, slave_path) the parent had so future /dev/pts/N + * opens in the child resolve to the same macOS tty. + * + * The slave fd arrived via SCM_RIGHTS without FD_CLOEXEC set; the keepalive + * must not survive exec into the guest's children, so set it here. On any + * failure, drop the slave fd rather than ship a keepalive whose lifetime + * elfuse cannot guarantee. + */ + if (master_host_fd < 0) { + if (slave_host_fd >= 0) + close(slave_host_fd); + return; + } + + if (slave_host_fd >= 0) { + int fdflags = fcntl(slave_host_fd, F_GETFD); + if (fdflags < 0 || + fcntl(slave_host_fd, F_SETFD, fdflags | FD_CLOEXEC) < 0) { + close(slave_host_fd); + return; + } + } + + /* Trust the parent's linux_pts_num verbatim instead of re-parsing + * slave_path. The wire-format string is bounded to PTY_SLAVE_PATH_MAX-1 + * bytes; if a future macOS canonical form ever exceeded that, the parent + * would have truncated and reparsing here would yield the wrong number. + */ + errno = 0; + if (pty_keepalive_register(master_host_fd, slave_host_fd, linux_pts_num, + slave_path) < 0) { + if (slave_host_fd >= 0) + close(slave_host_fd); + } else if (errno == EEXIST) { + /* The child's fd_table-restore path can theoretically replay the + * same master_host_fd if a prior recv-keepalive entry was already + * installed for that number. Drop our redundant slave so it does + * not leak. + */ + if (slave_host_fd >= 0) + close(slave_host_fd); + } +} + +/* Open /dev/ptmx, unlock the slave, and instantiate a keepalive slave fd + * so the master's tty ioctls work before the guest opens the slave itself. + * Returns the master host fd on success, -1 with errno set on failure. + */ +static int pty_open_master(int linux_flags) +{ + /* /dev/ptmx is a character device; O_CREAT / O_TRUNC / O_EXCL make no + * sense here. Strip them and only honor accmode + descriptor flags so + * the host open(2) never sees a variadic-mode-required combination + * without a mode arg. + */ + int oflags = translate_open_flags(linux_flags) & + (O_ACCMODE | O_NONBLOCK | O_CLOEXEC | O_NOCTTY); + int master = open("/dev/ptmx", oflags); + if (master < 0) + return -1; + + /* grantpt(3) is a no-op on a unix98 pty mount, but call it for clarity + * and to match what posix_openpt(3)'s callers expect to have happened. + */ + if (grantpt(master) < 0 || unlockpt(master) < 0) { + int saved = errno; + close(master); + errno = saved; + return -1; + } + char slave_path[PTY_SLAVE_PATH_MAX]; + if (ptsname_r(master, slave_path, sizeof(slave_path)) != 0) { + int saved = errno; + close(master); + errno = saved; + return -1; + } + + /* Establish the (linux_pts_num, slave_path) mapping that /dev/pts/N opens + * and stats resolve through. If table or slave-fd registration fails after + * the master is open, report EMFILE rather than silently returning a master + * fd whose pts number cannot be opened back through /dev/pts/N. The caller + * can close other pty pairs and retry instead of dealing with a half-broken + * descriptor. + */ + uint32_t linux_pts_num = pty_extract_pts_num(slave_path); + if (linux_pts_num == UINT32_MAX) { + close(master); + errno = ENOTTY; + return -1; + } + int slave = open(slave_path, O_RDWR | O_NOCTTY | O_CLOEXEC); + if (slave < 0) { + int saved = errno; + close(master); + errno = saved; + return -1; + } + errno = 0; + if (pty_keepalive_register(master, slave, linux_pts_num, slave_path) < 0) { + close(slave); + close(master); + errno = EMFILE; + return -1; + } + /* Defense-in-depth: the freshly-opened master fd should not already have + * a keepalive (would indicate a stale entry from a prior close that did + * not run proc_pty_close_keepalive). Drop the redundant slave so it does + * not leak. + */ + if (errno == EEXIST) + close(slave); + return master; +} + int proc_intercept_open(const guest_t *g, const char *path, int linux_flags, int mode) { + /* /dev/ptmx -> host /dev/ptmx + keepalive slave (see pty_open_master). */ + if (!strcmp(path, "/dev/ptmx")) + return pty_open_master(linux_flags); + /* /dev/null, /dev/zero, /dev/(u)random, /dev/tty */ const char *host_dev = NULL; int host_accmode = translate_open_flags(linux_flags) & O_ACCMODE; @@ -1525,6 +2137,7 @@ int proc_intercept_open(const guest_t *g, const char *shm = shm_dir_path(); return shm ? proc_open_dir_fd(shm, linux_flags) : -1; } + if (!strncmp(path, "/dev/shm/", 9)) { char host_path[512]; if (dev_shm_resolve_path(path + 9, host_path, sizeof(host_path)) < 0) @@ -1548,6 +2161,46 @@ int proc_intercept_open(const guest_t *g, if (!strncmp(path, "/dev/fd/", 8)) return dev_fd_dup(path, 8); + /* /dev/pts -> synthetic devpts directory. stat/access advertise this + * directory even on macOS hosts without /dev/pts, so open must be + * intercepted too or callers that probe then enumerate see inconsistent + * Linux-visible behavior. + */ + if (!strcmp(path, "/dev/pts") || !strcmp(path, "/dev/pts/")) + return pty_open_pts_dir(linux_flags); + + /* /dev/pts/N -> the macOS slave path captured at /dev/ptmx open time. + * Looking up the exact ptsname(3) string (rather than reformatting + * /dev/ttys%03lu) keeps the guest correct against any future macOS format + * change and against tty minor encodings that do not round-trip through + * plain zero-padding. ENOENT until the owning master is opened matches + * Linux devpts behavior for an unallocated slave number. + */ + if (!strncmp(path, "/dev/pts/", 9)) { + const char *digits = path + 9; + if (!*digits) { + errno = ENOENT; + return -1; + } + char *endp; + unsigned long n = strtoul(digits, &endp, 10); + if (endp == digits || *endp != '\0' || n > UINT32_MAX) { + errno = ENOENT; + return -1; + } + char host_path[PTY_SLAVE_PATH_MAX]; + if (pty_lookup_slave_path((uint32_t) n, host_path, sizeof(host_path)) < + 0) + return -1; + /* /dev/pts/N is a character device; strip O_CREAT and friends so + * the two-argument open(2) never sees a creation-mode-required + * combination without a mode arg. + */ + int oflags = translate_open_flags(linux_flags) & + (O_ACCMODE | O_NONBLOCK | O_CLOEXEC | O_NOCTTY); + return open(host_path, oflags); + } + /* /proc -> synthetic directory with PID entries for busybox ps, top, etc. * Creates a temp dir once (cached for the process lifetime) with entries * matching the current single-process model: the current PID directory + @@ -1573,9 +2226,8 @@ int proc_intercept_open(const guest_t *g, * Each open gets its own scratch dir so concurrent enumerations cannot * mutate one another (see proc_open_fd_scratch). */ - if (!strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/")) { + if (!strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/")) return proc_open_fd_scratch("elfuse-fd", linux_flags); - } if (!strcmp(path, "/proc/net") || !strcmp(path, "/proc/net/")) { const char *dir = ensure_proc_tmpdir(g); @@ -1590,9 +2242,9 @@ int proc_intercept_open(const guest_t *g, return proc_open_dir_fd(netdir, linux_flags); } - /* /proc/[/...] -> /proc/self[...]. Returns -1 on - * ENAMETOOLONG so the guest sees the same error a real Linux kernel - * would produce instead of falling through to a host syscall. + /* /proc/[/...] -> /proc/self[...]. + * Returns -1 on ENAMETOOLONG so the guest sees the same error a real Linux + * kernel would produce instead of falling through to a host syscall. */ { char alias[LINUX_PATH_MAX]; @@ -1621,9 +2273,9 @@ int proc_intercept_open(const guest_t *g, * return an actual file descriptor to the binary. * Under rosetta, the binfmt_misc convention treats rosetta as the * interpreter visible to the guest: rosetta opens /proc/self/fd/X - * via /proc/self/exe to identify itself and then issues the VZ - * ioctls on that descriptor. Return ROSETTA_PATH so the VZ ioctl - * gate (rosetta_ioctl_target_fd) recognises the fd. + * via /proc/self/exe to identify itself and then issues the VZ ioctls on + * that descriptor. Return ROSETTA_PATH so the VZ ioctl gate + * (rosetta_ioctl_target_fd) recognises the fd. */ if (!strcmp(path, "/proc/self/exe")) { if (g && g->is_rosetta) @@ -1637,8 +2289,8 @@ int proc_intercept_open(const guest_t *g, } /* /proc/cpuinfo -> synthetic file with CPU count. - * Buffer sized dynamically from ncpu (~200 bytes/entry) to avoid - * silent truncation on hosts with >16 CPUs. + * Buffer sized dynamically from ncpu (~200 bytes/entry) to avoid silent + * truncation on hosts with >16 CPUs. */ if (!strcmp(path, "/proc/cpuinfo")) { int ncpu = (int) sysconf(_SC_NPROCESSORS_ONLN); @@ -1746,8 +2398,8 @@ int proc_intercept_open(const guest_t *g, } /* /proc/self/task -> directory with per-thread TID entries. - * Debuggers and runtimes (GDB, LLDB, JVM, Go runtime) probe this at - * startup to discover thread count and per-thread state. + * Debuggers and runtimes (GDB, LLDB, JVM, Go runtime) probe this at startup + * to discover thread count and per-thread state. * * Rebuilds a temp directory on each open (thread set is dynamic). * Cannot rmdir before returning the fd because macOS getdents on unlinked @@ -1815,8 +2467,8 @@ int proc_intercept_open(const guest_t *g, } /* /proc/self/task/ directory itself: synthesize a dir with - * stat/status placeholder entries. Persistent so getdents sees - * the entries on macOS (which cannot enumerate unlinked dirs). + * stat/status placeholder entries. Persistent so getdents sees the + * entries on macOS (which cannot enumerate unlinked dirs). */ if (*endp == '\0' || !strcmp(endp, "/")) { static proc_persistent_dir_t tiddir = @@ -1881,9 +2533,9 @@ int proc_intercept_open(const guest_t *g, /* Add preannounced entries only while they still have an uncovered * tail. Once the union of live regions covers the full advertised - * interval, suppress the shadow entry so /proc/self/maps shows only - * the realized split VMAs. A partial union must stay visible because - * some reserved-but-not-realized span remains to advertise. + * interval, suppress the shadow entry so /proc/self/maps shows only the + * realized split VMAs. A partial union must stay visible because some + * reserved-but-not-realized span remains to advertise. */ for (int i = 0; i < g->npreannounced && nentries < MAPS_ENTRY_MAX; i++) { @@ -2185,8 +2837,9 @@ int proc_intercept_open(const guest_t *g, uint64_t mask; /* fs/signalfd.c uses a tab after the colon (matching the * pos:/flags:/mnt_id: convention in fs/proc/fd.c, not the - * single-space style of eventfd/timerfd). Verified against a - * real Linux 6.x /proc/self/fdinfo dump. */ + * single-space style of eventfd/timerfd). Verified against a real + * Linux 6.x /proc/self/fdinfo dump. + */ if (signalfd_fdinfo_snapshot(n, &mask)) snprintf(extra, sizeof(extra), "sigmask:\t%016llx\n", (unsigned long long) mask); @@ -2196,11 +2849,12 @@ int proc_intercept_open(const guest_t *g, int64_t value_ns, interval_ns; if (timerfd_fdinfo_snapshot(n, &clockid, &ticks, &value_ns, &interval_ns)) { - /* Linux fs/timerfd.c emits these fields with single - * spaces after the colon, not tabs (unlike pos:/flags:/ - * mnt_id: in fs/proc/fd.c, which do use tabs). Match the - * upstream format so guest readers parsing fdinfo via a - * "it_value: (" prefix find the field. */ + /* Linux fs/timerfd.c emits these fields with single spaces + * after the colon, not tabs (unlike pos:/flags:/mnt_id: in + * fs/proc/fd.c, which do use tabs). Match the upstream format + * so guest readers parsing fdinfo via a "it_value: (" prefix + * find the field. + */ snprintf(extra, sizeof(extra), "clockid: %d\n" "ticks: %llu\n" @@ -2227,9 +2881,9 @@ int proc_intercept_open(const guest_t *g, } /* /proc/self/fdinfo -> directory listing. Each open gets its own scratch - * dir so concurrent getdents on independent dirfds cannot interfere - * (the previous shared-dir design unlinked entries under a sibling - * enumerator). The dirs are tracked for atexit cleanup. + * dir so concurrent getdents on independent dirfds cannot interfere (the + * previous shared-dir design unlinked entries under a sibling enumerator). + * The dirs are tracked for atexit cleanup. */ if (!strcmp(path, "/proc/self/fdinfo") || !strcmp(path, "/proc/self/fdinfo/")) { @@ -2276,6 +2930,7 @@ int proc_intercept_open(const guest_t *g, buffers_kb = total_kb / 20; cached_kb = total_kb / 4; } + return proc_emit_fmt( "MemTotal: %llu kB\n" "MemFree: %llu kB\n" @@ -2313,9 +2968,9 @@ int proc_intercept_open(const guest_t *g, } /* /proc/self/io -> synthetic I/O counters. - * Some node-style observability runtimes read this for resource - * monitoring metrics. procfs emulation returns zeroed counters because - * it does not track per-guest I/O. + * Some node-style observability runtimes read this for resource monitoring + * metrics. procfs emulation returns zeroed counters because it does not + * track per-guest I/O. */ if (!strcmp(path, "/proc/self/io")) { return proc_emit_literal( @@ -2488,6 +3143,63 @@ int proc_intercept_stat(const char *path, struct stat *st) return stat(host_path, st); } + /* /dev/pts directory and /dev/pts/N slave entries. glibc ptsname(3) + * stats /dev/pts/N after TIOCGPTN and rejects with ENOENT if absent. + * Synthesize a minimal char-device stat whose st_rdev decodes to Linux's + * standard pts major (136) so glibc's major(rdev) == UNIX98_PTY_SLAVE_MAJOR + * check passes. The numeric tail must round-trip with /dev/ttysN via the + * open intercept (see proc_intercept_open). + */ + if (!strcmp(path, "/dev/pts") || !strcmp(path, "/dev/pts/")) { + stat_fill_proc_dir(st, 0755, 2, path); + return 0; + } + if (!strncmp(path, "/dev/pts/", 9)) { + const char *digits = path + 9; + if (!*digits) { + errno = ENOENT; + return -1; + } + char *endp; + unsigned long n = strtoul(digits, &endp, 10); + if (endp == digits || *endp != '\0' || n > UINT32_MAX) { + errno = ENOENT; + return -1; + } + /* Resolve through the captured-path table: ENOENT unless the + * corresponding master is currently open. This avoids the host + * stat false-positive where /dev/ttysNNN happens to exist for an + * unrelated tty allocated outside elfuse. + */ + char host_path[PTY_SLAVE_PATH_MAX]; + if (pty_lookup_slave_path((uint32_t) n, host_path, sizeof(host_path)) < + 0) + return -1; + struct stat host_st; + if (stat(host_path, &host_st) < 0) { + errno = ENOENT; + return -1; + } + memset(st, 0, sizeof(*st)); + st->st_mode = S_IFCHR | 0620; + st->st_nlink = 1; + st->st_uid = host_st.st_uid; + st->st_gid = host_st.st_gid; + /* macOS dev_t = (major << 24) | minor; the fs-stat translation layer + * (mac_to_linux_dev) re-encodes that into Linux's split major/minor + * layout, so storing 136 in the macOS-major slot makes glibc's + * major(rdev) yield UNIX98_PTY_SLAVE_MAJOR. + */ + st->st_rdev = ((dev_t) 136u << 24) | (dev_t) (n & 0xFFFFFFu); + st->st_size = 0; + st->st_blksize = 1024; + st->st_blocks = 0; + st->st_atime = host_st.st_atime; + st->st_mtime = host_st.st_mtime; + st->st_ctime = host_st.st_ctime; + return 0; + } + /* /proc and /proc/ are directories */ if (!strcmp(path, "/proc") || !strcmp(path, "/proc/")) { stat_fill_proc_dir(st, 0555, 3, path); @@ -2645,10 +3357,11 @@ int proc_intercept_stat(const char *path, struct stat *st) struct stat host_st; if (lstat(host_path, &host_st) < 0) return -1; + /* Replace host inode/dev with the synthetic-procfs convention so - * the guest sees a stable identity that does not collide with - * real host files (and so st_size reads as 0 for cpumask files, - * matching real sysfs). + * the guest sees a stable identity that does not collide with real + * host files (and so st_size reads as 0 for cpumask files, matching + * real sysfs). */ if (S_ISDIR(host_st.st_mode)) stat_fill_proc_dir(st, 0555, 2, path); @@ -2699,9 +3412,9 @@ int proc_intercept_readlink(const char *path, char *buf, size_t bufsiz) char sysroot_snap[LINUX_PATH_MAX]; if (proc_sysroot_snapshot(sysroot_snap, sizeof(sysroot_snap))) { /* proc_set_sysroot stores a realpath()-canonicalized form, so - * canonicalize exe before the prefix check or the strip fails - * when /var -> /private/var (and similar macOS symlinks) make - * the two strings diverge. + * canonicalize exe before the prefix check or the strip fails when + * /var -> /private/var (and similar macOS symlinks) make the two + * strings diverge. */ const char *exe_cmp = exe; if (realpath(exe, exe_real)) @@ -2843,10 +3556,11 @@ int proc_intercept_write(int guest_fd, return 0; int kind = proc_oom_path_kind(snap.proc_path); if (kind == OOM_PATH_SCORE) { - /* Linux: oom_score has no write handler. proc_reg_write returns - * -EIO when the underlying proc_dir_entry exposes no write op, - * not -EINVAL. Match that so guests probing the error code see - * the same value as on a real kernel. */ + /* Linux: oom_score has no write handler. proc_reg_write returns -EIO + * when the underlying proc_dir_entry exposes no write op, not -EINVAL. + * Match that so guests probing the error code see the same value as on + * a real kernel. + */ errno = EIO; return -1; } @@ -2854,8 +3568,8 @@ int proc_intercept_write(int guest_fd, return 0; /* Linux: zero-byte writes to proc nodes succeed without side effects. - * Without this short-circuit, sys_writev would funnel a zero-length - * vector through proc_parse_int_write and get -EINVAL. + * Without this short-circuit, sys_writev would funnel a zero-length vector + * through proc_parse_int_write and get -EINVAL. */ if (count == 0) { *written_out = 0; @@ -2902,6 +3616,7 @@ int proc_intercept_write(int guest_fd, goto unlock; if (!use_pwrite && lseek(host_fd, offset + (int64_t) count, SEEK_SET) < 0) goto unlock; + atomic_store(&oom_score_adj_value, score_adj); proc_oom_refresh_live_fds_locked(); *written_out = (ssize_t) count; diff --git a/src/runtime/procemu.h b/src/runtime/procemu.h index 58f4f30..78227ae 100644 --- a/src/runtime/procemu.h +++ b/src/runtime/procemu.h @@ -11,6 +11,7 @@ #pragma once #include +#include #include #include #include "core/guest.h" @@ -86,3 +87,80 @@ const char *proc_get_shm_dir(void); int proc_dev_shm_resolve(const char *guest_suffix, char *host_path, size_t host_path_sz); + +/* Drop the keepalive slave fd paired with a /dev/ptmx master host fd. Called + * from fd_cleanup_entry when the guest closes the master so the host kernel + * eventually tears the tty down. Idempotent and safe to call on any host fd + * (no-op when no keepalive is registered). + */ +void proc_pty_close_keepalive(int master_host_fd); + +/* Caller-locked variant of pty keepalive duplication. Brackets the host + * fd_snapshot_and_dup and the keepalive mirror under one pty_keepalive_lock + * window so a sibling close cannot run proc_pty_close_keepalive in the + * gap and leave the alias without a keepalive entry. Caller must wrap the + * sequence with proc_pty_lock_for_dup / proc_pty_unlock_for_dup. + */ +void proc_pty_lock_for_dup(void); +void proc_pty_unlock_for_dup(void); +void proc_pty_dup_keepalive_locked(int src_master_host_fd, + int dst_master_host_fd); + +/* Return the captured Linux pts number for a host master fd, or UINT32_MAX + * when no keepalive is registered. Lets sys_ioctl TIOCGPTN report the value + * /dev/pts/N opens / stats round-trip through, instead of independently + * parsing the macOS slave path and risking divergence with the open table. + */ +uint32_t proc_pty_master_pts_num(int master_host_fd); + +/* Best-effort lazy registration for a /dev/ptmx master that elfuse did not + * open itself (e.g. one received from a peer via SCM_RIGHTS). Takes a guest + * fd because the canonical host fd would race with sibling close+reuse if + * passed in directly: this helper snapshots fd_table[guest_fd].host_fd and + * its generation, performs the slave open against a private dup, and only + * registers the keepalive after re-verifying the slot still holds the + * original (host_fd, generation). Returns the pts number on success, or + * UINT32_MAX if the fd is not a pty master, the slot got closed/recycled + * mid-adoption, or the keepalive table is full. Idempotent: if the master + * already has a keepalive entry, returns its stored linux_pts_num. + */ +uint32_t proc_pty_master_adopt(int guest_fd); + +/* Max bytes for a captured macOS slave path (e.g. "/dev/ttys004"). Lives in + * the header so proc_pty_ipc_entry_t below and the in-memory keepalive table + * in procemu.c share one source of truth; a divergence would silently corrupt + * the fork-IPC wire format. + */ +#define PTY_SLAVE_PATH_MAX 64 + +/* Serialized form of a pty keepalive entry, used by fork-IPC. The slave host + * fd travels separately via SCM_RIGHTS; this struct only carries the + * lifetime-independent metadata that the child needs to re-register. + */ +typedef struct { + int32_t master_host_fd; + uint32_t linux_pts_num; + char slave_path[PTY_SLAVE_PATH_MAX]; +} proc_pty_ipc_entry_t; + +/* Snapshot the current pty keepalive table into out_entries / out_slave_fds. + * dup()s every slave fd under the keepalive lock so the snapshot stays valid + * across the SCM_RIGHTS send even if the original master gets closed before + * the IPC drains. Returns the number of live entries written (always + * <= max_entries); on success the caller owns the duplicated slave fds and + * must close them after the IPC send completes. + */ +int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries, + int *out_slave_fds, + int max_entries); + +/* Re-register a single keepalive in the child after a fork-IPC. master_host_fd + * is the child-side host fd that just landed in fd_table (its number is + * different from the parent's). Takes ownership of slave_host_fd. The slave + * path is recorded so /dev/pts/N opens in the child resolve to the same macOS + * tty the parent has been talking to. + */ +void proc_pty_restore_keepalive(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path); diff --git a/src/syscall/abi.h b/src/syscall/abi.h index ac58574..cecc5c1 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -351,6 +351,7 @@ typedef struct { #define LINUX_TIOCSPGRP 0x5410 /* -> macOS TIOCSPGRP (same semantics) */ #define LINUX_TIOCSCTTY 0x540E /* -> macOS TIOCSCTTY (same semantics) */ #define LINUX_TIOCGWINSZ 0x5413 /* -> macOS TIOCGWINSZ (same struct) */ +#define LINUX_TIOCSWINSZ 0x5414 /* -> macOS TIOCSWINSZ (same struct) */ #define LINUX_FIONREAD 0x541B /* -> macOS FIONREAD (same semantics) */ #define LINUX_FIONBIO 0x5421 /* set/clear O_NONBLOCK (arg: int *) */ #define LINUX_FIONCLEX 0x5450 /* clear close-on-exec on fd */ @@ -362,6 +363,13 @@ typedef struct { #define LINUX_TCSETS2 0x402c542b /* termios2 set (TCSANOW) */ #define LINUX_TCSETSW2 0x402c542c /* termios2 set (TCSADRAIN) */ #define LINUX_TCSETSF2 0x402c542d /* termios2 set (TCSAFLUSH) */ +/* Pseudoterminal multiplexer ioctls. The numeric encodings match Linux + * include/uapi/asm-generic/ioctls.h regardless of architecture. macOS exposes + * an equivalent /dev/ptmx and unlockpt(3); ptsname(3) returns /dev/ttysNNN. + */ +#define LINUX_TIOCGPTN 0x80045430 /* _IOR('T', 0x30, unsigned int) */ +#define LINUX_TIOCSPTLCK 0x40045431 /* _IOW('T', 0x31, int) */ +#define LINUX_TIOCGPTPEER 0x5441 /* _IO('T', 0x41); arg is open flags */ /* Linux open flags. */ #define LINUX_O_RDONLY 0x0000 diff --git a/src/syscall/fdtable.c b/src/syscall/fdtable.c index a072cfb..13e95bd 100644 --- a/src/syscall/fdtable.c +++ b/src/syscall/fdtable.c @@ -21,6 +21,7 @@ #include "utils.h" #include "core/shim-globals.h" +#include "runtime/procemu.h" #include "syscall/abi.h" #include "syscall/internal.h" @@ -477,6 +478,12 @@ void fd_cleanup_entry(int guest_fd, const fd_entry_t *snap) if (snap->cleanup) snap->cleanup(guest_fd); + /* Drop any /dev/ptmx keepalive slave fd paired with this host fd. Must + * happen before close(snap->host_fd) because the side table is keyed by + * the still-live host master fd. No-op for non-pty fds. + */ + proc_pty_close_keepalive(snap->host_fd); + /* Keep stdin/stdout/stderr open on the host */ if (snap->type != FD_STDIO) close(snap->host_fd); diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 004f3b2..3ccf699 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -342,6 +342,12 @@ int64_t sys_openat_path(guest_t *g, int type = intercepted_fd_type(tx.intercept_path, intercepted, linux_flags); if (type < 0) { + /* /dev/ptmx registers a keepalive slave under intercepted + * before this point; without dropping it here the slave fd + * leaks because nothing else has the master in fd_table. + * proc_pty_close_keepalive is a no-op for other paths. + */ + proc_pty_close_keepalive(intercepted); close_keep_errno(intercepted); return linux_errno(); } @@ -351,6 +357,7 @@ int64_t sys_openat_path(guest_t *g, fd_alloc_opened_host(intercepted, type, linux_flags, min_guest_fd, fd_cleanup_for_type(type)); if (guest_fd < 0) { + proc_pty_close_keepalive(intercepted); close_keep_errno(intercepted); return linux_errno(); } @@ -430,6 +437,13 @@ int64_t sys_close(int fd) int host_fd = -1; if (fd_close_regular_relaxed(fd, &host_fd)) { + /* The fast path bypasses fd_cleanup_entry, so any side tables + * keyed by host_fd that the slow path drops must be drained here + * too. proc_pty_close_keepalive is a cheap no-op for non-pty fds + * and prevents the keepalive slave from leaking past a /dev/ptmx + * close when no per-type cleanup is registered. + */ + proc_pty_close_keepalive(host_fd); if (close(host_fd) < 0) return linux_errno(); return 0; @@ -542,18 +556,27 @@ static int duplicate_guest_fd(int src_fd, bool fixed_slot, int linux_flags) { - /* Snapshot the source entry and dup its host fd in a single fd_lock - * critical section so the type, host fd, and metadata captured here - * cannot drift apart under a racing close + reopen. + /* Hold pty_keepalive_lock across the source snapshot, host dup, and + * keepalive mirror so a concurrent sys_close on src_fd cannot remove + * the source's keepalive entry between fd_snapshot_and_dup and + * proc_pty_dup_keepalive_locked. Without this bracket the alias would + * land in fd_table with no keepalive of its own. + * + * Lock order is pty_keepalive_lock -> fd_lock (fd_snapshot_and_dup + * takes fd_lock internally); proc_pty_master_adopt's joint-locked + * publish uses the same order so the two paths do not deadlock. */ + proc_pty_lock_for_dup(); fd_entry_t src_snap; int new_host_fd = fd_snapshot_and_dup(src_fd, &src_snap); if (new_host_fd < 0 && src_snap.type == FD_CLOSED) { + proc_pty_unlock_for_dup(); errno = EBADF; return -1; } if (src_snap.type == FD_FUSE_DEV || src_snap.type == FD_FUSE_FILE || src_snap.type == FD_FUSE_DIR) { + proc_pty_unlock_for_dup(); if (new_host_fd >= 0) close_keep_errno(new_host_fd); return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot, @@ -566,13 +589,27 @@ static int duplicate_guest_fd(int src_fd, * bind there. */ if (src_snap.type == FD_EVENTFD) { + proc_pty_unlock_for_dup(); if (new_host_fd >= 0) close_keep_errno(new_host_fd); return eventfd_dup_fd(src_fd, src_snap.host_fd, min_guest_fd, fixed_guest_fd, fixed_slot, linux_flags); } - if (new_host_fd < 0) + if (new_host_fd < 0) { + proc_pty_unlock_for_dup(); return -1; + } + + /* Mirror any /dev/ptmx keepalive BEFORE fd_alloc publishes guest_fd. + * Once the guest fd exists, a sibling thread can close it; that runs + * fd_cleanup_entry which calls proc_pty_close_keepalive(new_host_fd). + * For that cleanup to drop the freshly-duped keepalive, the keepalive + * entry must already be in the table; registering after fd_alloc would + * lose the race and leak the slave fd. No-op when the source has no + * keepalive. + */ + proc_pty_dup_keepalive_locked(src_snap.host_fd, new_host_fd); + proc_pty_unlock_for_dup(); int new_type = (src_snap.type == FD_STDIO) ? FD_REGULAR : src_snap.type; void (*cleanup)(int) = fd_cleanup_for_type(new_type); @@ -583,6 +620,10 @@ static int duplicate_guest_fd(int src_fd, if (guest_fd < 0) { if (fixed_slot) errno = EBADF; + /* fd_cleanup_entry never ran on new_host_fd (no guest fd was + * registered), so the keepalive must be dropped explicitly here. + */ + proc_pty_close_keepalive(new_host_fd); close_keep_errno(new_host_fd); return -1; } diff --git a/src/syscall/io.c b/src/syscall/io.c index 67f02cd..59ef3e9 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1551,6 +1552,7 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) } case LINUX_TIOCGWINSZ: { /* Get terminal window size */ + (void) proc_pty_master_adopt(fd); struct winsize ws; if (ioctl(host_fd, TIOCGWINSZ, &ws) < 0) { host_fd_ref_close(&host_ref); @@ -1569,6 +1571,33 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) host_fd_ref_close(&host_ref); return 0; } + case LINUX_TIOCSWINSZ: { + /* Set terminal window size. Same struct as TIOCGWINSZ; foot, sshd, + * tmux, and any libvte-derived emulator call this on the PTY master + * after spawning the slave child. Without it, terminal startup fails + * with -ENOTTY from the default arm below (issue #88). + * + * A master received through SCM_RIGHTS bypasses /dev/ptmx open + * interception, so lazily create its keepalive before the host ioctl. + * The helper is a no-op for non-pty fds; the real ioctl below still + * supplies the final errno. + */ + linux_winsize_t lws; + if (guest_read_small(g, arg, &lws, sizeof(lws)) < 0) { + host_fd_ref_close(&host_ref); + return -LINUX_EFAULT; + } + struct winsize ws = { + .ws_row = lws.ws_row, + .ws_col = lws.ws_col, + .ws_xpixel = lws.ws_xpixel, + .ws_ypixel = lws.ws_ypixel, + }; + (void) proc_pty_master_adopt(fd); + int rc = ioctl(host_fd, TIOCSWINSZ, &ws); + host_fd_ref_close(&host_ref); + return rc < 0 ? linux_errno() : 0; + } case LINUX_TCGETS: { /* Get terminal attributes. c_cc index mapping is in file-scope @@ -1715,6 +1744,96 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) return 0; } + case LINUX_TIOCGPTN: { + /* Get the slave pty number associated with a /dev/ptmx master fd. + * Pass the guest fd: proc_pty_master_adopt snapshots the canonical + * (host_fd, generation) under fd_lock, performs the slave open on a + * private dup, then re-validates the slot before publishing the + * keepalive. Passing the per-syscall host_fd_ref dup or a raw host + * fd would race with sibling close+reuse. + */ + uint32_t val = proc_pty_master_adopt(fd); + if (val == UINT32_MAX) { + host_fd_ref_close(&host_ref); + return -LINUX_ENOTTY; + } + if (guest_write_small(g, arg, &val, sizeof(val)) < 0) { + host_fd_ref_close(&host_ref); + return -LINUX_EFAULT; + } + host_fd_ref_close(&host_ref); + return 0; + } + case LINUX_TIOCSPTLCK: { + /* Lock/unlock the slave side of a pty. glibc unlockpt() always passes + * 0 (unlock); util-linux's setlock(1) passes 1 to lock. macOS exposes + * unlockpt(3) but no re-lock primitive, so the lock branch is accepted + * as a best-effort no-op for real ptmx masters rather than surfacing + * as -EINVAL: an application probing the result would otherwise + * misread the failure as "this kernel has no devpts". + */ + int32_t lock = 0; + if (guest_read_small(g, arg, &lock, sizeof(lock)) < 0) { + host_fd_ref_close(&host_ref); + return -LINUX_EFAULT; + } + int rc = 0; + if (lock == 0) { + rc = unlockpt(host_fd); + } else { + char slave[64]; + if (ptsname_r(host_fd, slave, sizeof(slave)) != 0) { + host_fd_ref_close(&host_ref); + return -LINUX_ENOTTY; + } + } + host_fd_ref_close(&host_ref); + return rc < 0 ? linux_errno() : 0; + } + case LINUX_TIOCGPTPEER: { + /* Return a fresh fd referring to the slave side of a /dev/ptmx master. + * Linux added this in 4.13 so callers can avoid the ptsname(3) round + * trip and any /dev/pts visibility races. The arg holds open(2)-style + * flags. Restrict to the bits Linux's pty driver actually honors + * (accmode + O_NOCTTY + O_NONBLOCK + O_CLOEXEC); any other bit, in + * particular O_CREAT / O_TRUNC / O_EXCL / O_PATH, would be silently + * ignored on Linux and is rejected with EINVAL here so misuse does + * not leak nonsense flags into the guest fd table. + */ + int linux_flags = (int) arg; + const int allowed = LINUX_O_ACCMODE | LINUX_O_NOCTTY | + LINUX_O_NONBLOCK | LINUX_O_CLOEXEC; + if (linux_flags & ~allowed) { + host_fd_ref_close(&host_ref); + return -LINUX_EINVAL; + } + char slave[64]; + if (ptsname_r(host_fd, slave, sizeof(slave)) != 0) { + host_fd_ref_close(&host_ref); + return -LINUX_ENOTTY; + } + int oflags = translate_open_flags(linux_flags); + int host_slave_fd = open(slave, oflags); + if (host_slave_fd < 0) { + int saved_errno = errno; + host_fd_ref_close(&host_ref); + errno = saved_errno; + return linux_errno(); + } + host_fd_ref_close(&host_ref); + int guest_fd = fd_alloc(FD_REGULAR, host_slave_fd, NULL); + if (guest_fd < 0) { + close(host_slave_fd); + return -LINUX_EMFILE; + } + /* Track CLOEXEC + accmode in the guest table so exec honors them; the + * host fd's own FD_CLOEXEC is per-descriptor and would be lost on the + * dup that host_fd_ref hands multi-threaded callers. + */ + fd_publish_linux_flags(guest_fd, linux_flags); + return guest_fd; + } + case LINUX_FIONBIO: { /* Set/clear O_NONBLOCK on the fd. Linux FIONBIO takes an int* arg: * nonzero enables non-blocking, zero disables it. libuv's diff --git a/src/syscall/path.c b/src/syscall/path.c index b8599ed..d2bf6d8 100644 --- a/src/syscall/path.c +++ b/src/syscall/path.c @@ -75,6 +75,13 @@ bool path_might_use_stat_intercept(const char *path) return true; if (!strcmp(path, "/dev/fuse")) return true; + /* glibc ptsname(3) stats /dev/pts/N after TIOCGPTN to confirm the slave + * exists and is a char device; without this the stat falls through to the + * host where /dev/pts is absent and ptsname returns ENOENT. + */ + if (!strncmp(path, "/dev/pts/", 9) || !strcmp(path, "/dev/pts") || + !strcmp(path, "/dev/pts/")) + return true; if (fuse_path_matches_mount(path)) return true; if (path_prefix_match(path, SYSFS_CPU_PREFIX, sizeof(SYSFS_CPU_PREFIX) - 1)) diff --git a/tests/manifest.txt b/tests/manifest.txt index 8d836eb..e9ce10f 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -68,6 +68,7 @@ test-epoll-aba test-timerfd test-large-io-boundary test-ioctl-cloexec +test-pty [section] /proc and /dev emulation tests test-proc diff --git a/tests/test-pty.c b/tests/test-pty.c new file mode 100644 index 0000000..ce77534 --- /dev/null +++ b/tests/test-pty.c @@ -0,0 +1,410 @@ +/* PTY ioctl regression test + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Foot, sshd, tmux, and any libvte-derived terminal need the multiplexer + * primitives glibc's posix_openpt(3) / ptsname(3) / openpty(3) stack rests + * on. Exercise the four pieces wired in for issue #88: + * + * 1. TIOCSWINSZ on the /dev/ptmx master fd (the direct failure foot saw) + * 2. TIOCGPTN -> /dev/pts/N path round trip + * 3. TIOCSPTLCK(0) for unlockpt(), plus non-zero lock request fd typing + * 4. /dev/pts/N open + stat intercept and the slave fd's window size + * + * The test stays self-contained on the syscall surface (no libutil/openpty), + * so it runs the same way under elfuse-aarch64, qemu-aarch64, and any future + * elfuse-x86_64 reuse. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +#ifndef TIOCSWINSZ +#define TIOCSWINSZ 0x5414 +#endif +#ifndef TIOCGPTN +#define TIOCGPTN 0x80045430 +#endif +#ifndef TIOCSPTLCK +#define TIOCSPTLCK 0x40045431 +#endif +#ifndef TIOCGPTPEER +#define TIOCGPTPEER 0x5441 +#endif + +int passes = 0, fails = 0; + +int main(void) +{ + printf("test-pty: PTY ioctl + /dev/pts/N path support (issue #88)\n"); + + /* Regression guard for the pty_keepalive_table BSS-zero collision: any + * close that runs before the very first /dev/ptmx open would walk the + * still-zero-initialized table and match a master_host_fd of zero, + * silently closing the wrong slave_host_fd (also zero). Closing stdin + * here forces fd_cleanup_entry to invoke proc_pty_close_keepalive in + * that vulnerable window. If the fix regresses, every subsequent test + * still passes locally but a future stdio fd in another vCPU may go + * missing. The cheap sentinel makes sure stdin's close itself does + * not corrupt elfuse's own file table. + */ + close(STDIN_FILENO); + + int ptmx = open("/dev/ptmx", O_RDWR | O_NOCTTY); + TEST("open(/dev/ptmx, O_RDWR | O_NOCTTY)"); + EXPECT_TRUE(ptmx >= 0, "open /dev/ptmx failed"); + if (ptmx < 0) { + SUMMARY("test-pty"); + return 1; + } + + /* TIOCSWINSZ on the master is the direct regression from issue #88. + * Before the fix, sys_ioctl had no case for it and fell through to the + * default -ENOTTY arm, breaking foot's terminal initialization. + */ + TEST("TIOCSWINSZ on /dev/ptmx master"); + struct winsize ws_set = { + .ws_row = 40, + .ws_col = 132, + .ws_xpixel = 1056, + .ws_ypixel = 640, + }; + EXPECT_TRUE(ioctl(ptmx, TIOCSWINSZ, &ws_set) == 0, "TIOCSWINSZ failed"); + + TEST("TIOCGWINSZ round-trips the values set above"); + struct winsize ws_get = {0}; + int ok = ioctl(ptmx, TIOCGWINSZ, &ws_get) == 0 && ws_get.ws_row == 40 && + ws_get.ws_col == 132 && ws_get.ws_xpixel == 1056 && + ws_get.ws_ypixel == 640; + EXPECT_TRUE(ok, "TIOCGWINSZ round trip mismatch"); + + TEST("TIOCSPTLCK(0) unlocks the slave"); + int unlock = 0; + EXPECT_TRUE(ioctl(ptmx, TIOCSPTLCK, &unlock) == 0, "TIOCSPTLCK(0) failed"); + + /* Linux TIOCSPTLCK(non-zero) locks the slave and returns success. + * elfuse cannot actually enforce the lock on macOS (no re-lock primitive) + * but must still report success so callers do not misread the result as + * "this kernel has no devpts". + */ + TEST("TIOCSPTLCK(1) accepted as best-effort no-op"); + int lock = 1; + EXPECT_TRUE(ioctl(ptmx, TIOCSPTLCK, &lock) == 0, "TIOCSPTLCK(1)"); + + TEST("TIOCSPTLCK(1) rejects a regular file"); + char lock_template[] = "/tmp/elfuse-pty-lock-XXXXXX"; + int regular = mkstemp(lock_template); + if (regular < 0) { + FAIL("mkstemp failed"); + } else { + EXPECT_ERRNO(ioctl(regular, TIOCSPTLCK, &lock), ENOTTY, + "regular fd accepted TIOCSPTLCK(1)"); + close(regular); + unlink(lock_template); + } + + TEST("TIOCSPTLCK(1) rejects a pipe"); + int lock_pipe[2]; + if (pipe(lock_pipe) != 0) { + FAIL("pipe(lock_pipe) failed"); + } else { + EXPECT_ERRNO(ioctl(lock_pipe[0], TIOCSPTLCK, &lock), ENOTTY, + "pipe fd accepted TIOCSPTLCK(1)"); + close(lock_pipe[0]); + close(lock_pipe[1]); + } + + TEST("TIOCGPTN returns a numeric slave id"); + unsigned int ptyno = (unsigned int) -1; + EXPECT_TRUE(ioctl(ptmx, TIOCGPTN, &ptyno) == 0 && ptyno < 100000u, + "TIOCGPTN failed"); + + TEST("stat(/dev/pts) succeeds and reports a directory"); + struct stat pts_dir_st; + int pts_dir_statrc = stat("/dev/pts", &pts_dir_st); + EXPECT_TRUE(pts_dir_statrc == 0 && S_ISDIR(pts_dir_st.st_mode), + "stat /dev/pts failed"); + + TEST("open(/dev/pts, O_DIRECTORY) succeeds"); + int pts_dir_fd = open("/dev/pts", O_RDONLY | O_DIRECTORY); + EXPECT_TRUE(pts_dir_fd >= 0, "open /dev/pts directory failed"); + if (pts_dir_fd >= 0) + close(pts_dir_fd); + + TEST("readdir(/dev/pts) lists the active slave id"); + DIR *pts_dir = opendir("/dev/pts"); + if (!pts_dir) { + FAIL("opendir /dev/pts failed"); + } else { + char want[32]; + snprintf(want, sizeof(want), "%u", ptyno); + int saw_ptyno = 0; + struct dirent *ent; + while ((ent = readdir(pts_dir))) { + if (!strcmp(ent->d_name, want)) { + saw_ptyno = 1; + break; + } + } + closedir(pts_dir); + EXPECT_TRUE(saw_ptyno, "active pts id missing from /dev/pts"); + } + + char pts_path[32]; + snprintf(pts_path, sizeof(pts_path), "/dev/pts/%u", ptyno); + + /* glibc ptsname(3) stats the formatted path before returning it. + * Until the path.c stat allowlist included /dev/pts/N, the stat went + * to the host (which has no /dev/pts at all) and ptsname returned + * ENOENT, leaving every caller without a usable slave path. + */ + TEST("stat(/dev/pts/N) succeeds and reports a char device"); + struct stat st; + int statrc = stat(pts_path, &st); + EXPECT_TRUE(statrc == 0 && S_ISCHR(st.st_mode), "stat /dev/pts/N failed"); + + TEST("stat(/dev/pts/N) major is Linux pts (136)"); + EXPECT_EQ(major(st.st_rdev), 136, "wrong pts major"); + + TEST("open(/dev/pts/N) returns a usable slave fd"); + int slave = open(pts_path, O_RDWR | O_NOCTTY); + EXPECT_TRUE(slave >= 0, "open /dev/pts/N failed"); + + if (slave >= 0) { + TEST("TIOCGWINSZ on the slave reflects the master-side update"); + struct winsize ws_slave = {0}; + int slave_ok = ioctl(slave, TIOCGWINSZ, &ws_slave) == 0 && + ws_slave.ws_row == 40 && ws_slave.ws_col == 132; + EXPECT_TRUE(slave_ok, "slave winsize mismatch"); + + TEST("TIOCSPTLCK(1) rejects the pty slave"); + EXPECT_ERRNO(ioctl(slave, TIOCSPTLCK, &lock), ENOTTY, + "slave fd accepted TIOCSPTLCK(1)"); + + TEST("TIOCSWINSZ on the slave propagates back to the master"); + struct winsize ws_resize = { + .ws_row = 24, + .ws_col = 80, + .ws_xpixel = 0, + .ws_ypixel = 0, + }; + if (ioctl(slave, TIOCSWINSZ, &ws_resize) != 0) { + FAIL("slave TIOCSWINSZ failed"); + } else { + struct winsize ws_after = {0}; + int after_ok = ioctl(ptmx, TIOCGWINSZ, &ws_after) == 0 && + ws_after.ws_row == 24 && ws_after.ws_col == 80; + EXPECT_TRUE(after_ok, "master did not see slave resize"); + } + + close(slave); + } + + /* TIOCGPTPEER short-circuits the ptsname/stat/open dance. Recent foot + * and util-linux prefer it; older kernels return ENOTTY and the caller + * falls back to /dev/pts. Accept either an fd or ENOTTY (some hosts + * legitimately do not implement it), but never silent corruption. */ + TEST("TIOCGPTPEER returns a slave fd or ENOTTY"); + int peer = ioctl(ptmx, TIOCGPTPEER, O_RDWR | O_NOCTTY); + int peer_ok = peer >= 0 || (peer == -1 && errno == ENOTTY); + EXPECT_TRUE(peer_ok, "TIOCGPTPEER returned unexpected status"); + if (peer >= 0) + close(peer); + + /* dup of the master must keep both aliases functional even after the + * original is closed. The keepalive slave needs to be mirrored across + * the dup so the surviving alias still observes master-side tty ioctls. + */ + TEST("dup(ptmx) followed by close(orig) leaves alias usable"); + int alias = dup(ptmx); + if (alias < 0) { + FAIL("dup(ptmx) failed"); + } else { + close(ptmx); + struct winsize ws_alias = {.ws_row = 50, .ws_col = 100}; + if (ioctl(alias, TIOCSWINSZ, &ws_alias) != 0) { + FAIL("TIOCSWINSZ on alias after closing original"); + } else { + struct winsize ws_check = {0}; + int alias_ok = ioctl(alias, TIOCGWINSZ, &ws_check) == 0 && + ws_check.ws_row == 50 && ws_check.ws_col == 100; + EXPECT_TRUE(alias_ok, "alias winsize did not stick"); + } + ptmx = alias; /* the alias is the live master from here on */ + } + + /* Fork must propagate the master's keepalive across the IPC handoff so + * the child can do master-side tty ioctls without the macOS ENOTTY + * fallback. The parent's keepalive slave fd is independent (each side + * holds its own slot) so closing one side does not affect the other. + */ + TEST("child fork inherits master keepalive (TIOCSWINSZ works)"); + int sync_pipe[2]; + if (pipe(sync_pipe) != 0) { + FAIL("pipe(sync_pipe) failed"); + } else { + pid_t pid = fork(); + if (pid < 0) { + FAIL("fork failed"); + close(sync_pipe[0]); + close(sync_pipe[1]); + } else if (pid == 0) { + /* Child: do master-side TIOCSWINSZ and report rc via pipe. */ + close(sync_pipe[0]); + struct winsize ws_child = { + .ws_row = 30, + .ws_col = 90, + }; + int rc = ioctl(ptmx, TIOCSWINSZ, &ws_child); + char status = (rc == 0) ? 'Y' : 'N'; + (void) !write(sync_pipe[1], &status, 1); + close(sync_pipe[1]); + _exit(rc == 0 ? 0 : 1); + } else { + close(sync_pipe[1]); + char status = '?'; + ssize_t n = read(sync_pipe[0], &status, 1); + close(sync_pipe[0]); + int wstatus = 0; + waitpid(pid, &wstatus, 0); + int child_ok = (n == 1) && (status == 'Y') && WIFEXITED(wstatus) && + WEXITSTATUS(wstatus) == 0; + EXPECT_TRUE(child_ok, "child TIOCSWINSZ on master failed"); + /* Parent should still see the child's update because the slave + * keepalive in the parent is still alive. */ + struct winsize ws_parent = {0}; + int parent_ok = ioctl(ptmx, TIOCGWINSZ, &ws_parent) == 0 && + ws_parent.ws_row == 30 && ws_parent.ws_col == 90; + TEST("parent observes the child's master-side resize"); + EXPECT_TRUE(parent_ok, "parent winsize mismatch after child"); + } + } + + close(ptmx); + + /* Re-open and immediately close should not leak the keepalive slave. + * Without the proc_pty_close_keepalive call in sys_close's fast path, + * single-thread close goes through fd_close_regular_relaxed and + * bypasses fd_cleanup_entry, leaving the hidden slave fd open until + * elfuse exits. Loop enough times to expose any per-close leak. + */ + TEST("repeated open/close does not exhaust the keepalive table"); + int leak_loop_ok = 1; + for (int i = 0; i < 300; i++) { + int f = open("/dev/ptmx", O_RDWR | O_NOCTTY); + if (f < 0) { + leak_loop_ok = 0; + break; + } + close(f); + } + EXPECT_TRUE(leak_loop_ok, + "repeated /dev/ptmx open/close exhausted the keepalive table"); + + /* A pty master received via SCM_RIGHTS bypasses the /dev/ptmx open + * intercept, so the receiver process has no keepalive entry for it. + * proc_pty_master_adopt must lazily register one before master-side tty + * ioctls, even when TIOCSWINSZ runs before the first TIOCGPTN. Two + * checks live inside this block: TIOCSWINSZ-first and the post-adopt + * stat(/dev/pts/N). Each has its own TEST() label below. + */ + int sp[2]; + if (socketpair(AF_UNIX, SOCK_STREAM, 0, sp) != 0) { + TEST("socketpair for SCM_RIGHTS adoption setup"); + FAIL("socketpair failed"); + } else { + int donor = open("/dev/ptmx", O_RDWR | O_NOCTTY); + if (donor < 0) { + FAIL("donor open(/dev/ptmx) failed"); + close(sp[0]); + close(sp[1]); + } else { + char iobuf = 'x'; + struct iovec iov = {.iov_base = &iobuf, .iov_len = 1}; + char ctrl[CMSG_SPACE(sizeof(int))] = {0}; + struct msghdr msg = {.msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = ctrl, + .msg_controllen = sizeof(ctrl)}; + struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); + cm->cmsg_level = SOL_SOCKET; + cm->cmsg_type = SCM_RIGHTS; + cm->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cm), &donor, sizeof(int)); + ssize_t sent = sendmsg(sp[0], &msg, 0); + close(donor); + if (sent < 0) { + FAIL("sendmsg(SCM_RIGHTS) failed"); + } else { + char rbuf; + struct iovec riov = {.iov_base = &rbuf, .iov_len = 1}; + char rctrl[CMSG_SPACE(sizeof(int))] = {0}; + struct msghdr rmsg = {.msg_iov = &riov, + .msg_iovlen = 1, + .msg_control = rctrl, + .msg_controllen = sizeof(rctrl)}; + ssize_t got = recvmsg(sp[1], &rmsg, 0); + if (got < 0) { + FAIL("recvmsg(SCM_RIGHTS) failed"); + } else { + struct cmsghdr *rcm = CMSG_FIRSTHDR(&rmsg); + int recv_master = -1; + if (rcm && rcm->cmsg_level == SOL_SOCKET && + rcm->cmsg_type == SCM_RIGHTS) + memcpy(&recv_master, CMSG_DATA(rcm), sizeof(int)); + if (recv_master < 0) { + TEST("SCM_RIGHTS recv yields a master fd"); + FAIL("recv_master fd not received"); + } else { + TEST("TIOCSWINSZ first on SCM_RIGHTS-received master"); + struct winsize ws_recv = { + .ws_row = 33, + .ws_col = 101, + .ws_xpixel = 808, + .ws_ypixel = 528, + }; + EXPECT_TRUE( + ioctl(recv_master, TIOCSWINSZ, &ws_recv) == 0, + "TIOCSWINSZ before TIOCGPTN on " + "SCM_RIGHTS-received master"); + TEST("TIOCGPTN on SCM_RIGHTS-received master"); + unsigned int recv_ptyno = (unsigned int) -1; + int gpn_rc = ioctl(recv_master, TIOCGPTN, &recv_ptyno); + EXPECT_TRUE(gpn_rc == 0 && recv_ptyno < 100000u, + "TIOCGPTN on SCM_RIGHTS-received master"); + char recv_pts_path[32]; + snprintf(recv_pts_path, sizeof(recv_pts_path), + "/dev/pts/%u", recv_ptyno); + TEST("stat(/dev/pts/N) after SCM_RIGHTS adoption"); + struct stat recv_st; + EXPECT_TRUE(stat(recv_pts_path, &recv_st) == 0 && + S_ISCHR(recv_st.st_mode), + "stat after SCM_RIGHTS adoption failed"); + close(recv_master); + } + } + } + close(sp[0]); + close(sp[1]); + } + } + + SUMMARY("test-pty"); + return fails > 0 ? 1 : 0; +} From e6d6be3690a4f76d99e1d6db60e9d0fb3342f2bd Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Mon, 8 Jun 2026 14:16:13 +0800 Subject: [PATCH 2/7] Survive close-before-open in PTY slave path The keepalive table previously cleared the entire entry on master close, which dropped the (linux_pts_num, slave_path) mapping a forked child's subsequent open("/dev/pts/N") relied on. foot/sshd/openssh sftp-server all close(master) in the child after fork and BEFORE opening the slave, so the slave open landed in the cleared lookup and returned ENOENT even though the parent still held the master and the macOS slave node was openable. proc_pty_close_keepalive now retains linux_pts_num + slave_path for fork-restored entries flagged stale_open_once, keeping the inherited slave host fd to pin the macOS tty across the close-before-open window. The next translated /dev/pts/N open consumes the stale entry, closes the retained slave, and clears the slot. Ordinary local master closes still drop the mapping immediately, so the path cache cannot persist beyond one consumer. pty_keepalive_register_locked prefers a stale-path slot with the same pts number on insert (deterministic macOS minor mapping makes reuse path-correct), then an empty slot, then evicts the lowest-index stale slot. Live entries are never evicted. pty_keepalive_register_recycled expires any stale-path entries holding the same slave_path before the new master takes the slot, so a recycled minor cannot inherit the prior tenant's cached translation. pty_lookup_slave_path and pty_open_slave walk all entries with a non-empty slave_path, preferring live over stale. pty_open_pts_dir enumerates the same set so open and readdir stay consistent: a child that just opened its slave via the stale-path mapping no longer sees an entry that is open(2)-reachable but readdir-invisible. --- src/runtime/procemu.c | 293 +++++++++++++++++++++++++++++++++++------- src/syscall/io.c | 2 +- tests/test-pty.c | 115 ++++++++++++++++- 3 files changed, 360 insertions(+), 50 deletions(-) diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index 9666585..cc9e4a1 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -20,6 +20,7 @@ #define MAPS_NAME_COLUMN 73 #include +#include #include #include #include @@ -1502,9 +1503,19 @@ static void proc_task_collect_cb(thread_entry_t *t, void *arg) * encodings) cannot strand the guest with the wrong slave. * * Entries are keyed by the host master fd because that is what fd_cleanup_entry - * has when the guest closes a master. Capacity matches the macOS default - * UNIX98 slave count; overflow leaves the entry empty and the guest gets the - * pre-fix degraded behavior for that one pair instead of an open failure. + * has when the guest closes a master. Capacity matches the macOS default UNIX98 + * slave count; overflow leaves the entry empty and the guest gets the pre-fix + * degraded behavior for that one pair instead of an open failure. + * + * Fork-restored entries may outlive their master for one /dev/pts/N open. A + * foot / sshd / posix-compliant child closes the master fd after fork before + * opening the slave (the child has no use for the master); without retaining + * the path mapping past close, the subsequent /dev/pts/N open in the child + * loses its translation and fails with ENOENT even though the parent still + * holds the master and the macOS slave node is openable. Those stale entries + * keep the received slave fd until the first translated open attempt, then + * expire before the minor can be reused for an unrelated host tty. Ordinary + * local master closes clear the mapping immediately. */ #define PTY_KEEPALIVE_MAX 256 #define PTY_KEEPALIVE_FREE (-1) @@ -1515,6 +1526,7 @@ static struct { int master_host_fd; int slave_host_fd; uint32_t linux_pts_num; + bool stale_open_once; char slave_path[PTY_SLAVE_PATH_MAX]; } pty_keepalive_table[PTY_KEEPALIVE_MAX]; static pthread_mutex_t pty_keepalive_lock = PTHREAD_MUTEX_INITIALIZER; @@ -1526,10 +1538,22 @@ static void pty_keepalive_init(void) pty_keepalive_table[i].master_host_fd = PTY_KEEPALIVE_FREE; pty_keepalive_table[i].slave_host_fd = PTY_KEEPALIVE_FREE; pty_keepalive_table[i].linux_pts_num = 0; + pty_keepalive_table[i].stale_open_once = false; pty_keepalive_table[i].slave_path[0] = '\0'; } } +static int pty_keepalive_clear_slot_locked(int slot) +{ + int slave = pty_keepalive_table[slot].slave_host_fd; + pty_keepalive_table[slot].master_host_fd = PTY_KEEPALIVE_FREE; + pty_keepalive_table[slot].slave_host_fd = PTY_KEEPALIVE_FREE; + pty_keepalive_table[slot].linux_pts_num = 0; + pty_keepalive_table[slot].stale_open_once = false; + pty_keepalive_table[slot].slave_path[0] = '\0'; + return slave; +} + static uint32_t pty_extract_pts_num(const char *slave_path) { /* macOS canonical slave paths are /dev/ttysNNN with a decimal tail. Read @@ -1567,31 +1591,69 @@ static int pty_keepalive_register_locked(int master_host_fd, int slave_host_fd, uint32_t linux_pts_num, const char *slave_path, + bool stale_open_once, uint32_t *existing_pts_num) { - int free_slot = -1; + int empty_slot = -1; + int stale_path_slot = -1; for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { if (pty_keepalive_table[i].master_host_fd == master_host_fd) { if (existing_pts_num) *existing_pts_num = pty_keepalive_table[i].linux_pts_num; return PTY_REG_EXISTS; } - if (free_slot < 0 && - pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE) - free_slot = i; - } - if (free_slot < 0) - return PTY_REG_FULL; - pty_keepalive_table[free_slot].master_host_fd = master_host_fd; - pty_keepalive_table[free_slot].slave_host_fd = slave_host_fd; - pty_keepalive_table[free_slot].linux_pts_num = linux_pts_num; + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) + continue; + /* Prefer a stale-path slot with the same pts number: the macOS minor + * deterministically maps to the same slave_path string, so reusing + * keeps lookups path-correct and bounds the table at one slot per + * live minor instead of accumulating a new entry on every reopen. + */ + if (pty_keepalive_table[i].slave_path[0] != '\0' && + pty_keepalive_table[i].linux_pts_num == linux_pts_num) { + stale_path_slot = i; + } else if (empty_slot < 0 && + pty_keepalive_table[i].slave_path[0] == '\0') { + empty_slot = i; + } + } + int slot = (stale_path_slot >= 0) ? stale_path_slot : empty_slot; + if (slot < 0) { + /* Out of empty slots and no stale-path match: evict the lowest-index + * stale-path entry so the live registration cannot starve. Live entries + * are never evicted. The eviction policy is approximately LRU: empty + * slots fill from low indices, so the lowest-index stale slot tends to + * be the oldest closed. A theoretical race exists with the + * close-before-open child pattern (a child stales slot K under + * pty_keepalive_lock and races into open("/dev/pts/N") just as another + * thread evicts slot K to register a different minor) but needs the + * keepalive table to be full -- live and stale entries both count -- + * with the staling thread's slot being the lowest-index stale. Well + * outside the foot / sshd workload that motivated this code. + */ + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE && + pty_keepalive_table[i].slave_path[0] != '\0') { + slot = i; + break; + } + } + if (slot < 0) + return PTY_REG_FULL; + } + pty_keepalive_table[slot].master_host_fd = master_host_fd; + if (pty_keepalive_table[slot].slave_host_fd >= 0 && + pty_keepalive_table[slot].slave_host_fd != slave_host_fd) + close(pty_keepalive_table[slot].slave_host_fd); + pty_keepalive_table[slot].slave_host_fd = slave_host_fd; + pty_keepalive_table[slot].linux_pts_num = linux_pts_num; + pty_keepalive_table[slot].stale_open_once = stale_open_once; if (slave_path) { - strncpy(pty_keepalive_table[free_slot].slave_path, slave_path, + strncpy(pty_keepalive_table[slot].slave_path, slave_path, PTY_SLAVE_PATH_MAX - 1); - pty_keepalive_table[free_slot].slave_path[PTY_SLAVE_PATH_MAX - 1] = - '\0'; + pty_keepalive_table[slot].slave_path[PTY_SLAVE_PATH_MAX - 1] = '\0'; } else { - pty_keepalive_table[free_slot].slave_path[0] = '\0'; + pty_keepalive_table[slot].slave_path[0] = '\0'; } return PTY_REG_INSERTED; } @@ -1604,12 +1666,14 @@ static int pty_keepalive_register_locked(int master_host_fd, static int pty_keepalive_register(int master_host_fd, int slave_host_fd, uint32_t linux_pts_num, - const char *slave_path) + const char *slave_path, + bool stale_open_once) { pthread_once(&pty_keepalive_once, pty_keepalive_init); pthread_mutex_lock(&pty_keepalive_lock); int rc = pty_keepalive_register_locked(master_host_fd, slave_host_fd, - linux_pts_num, slave_path, NULL); + linux_pts_num, slave_path, + stale_open_once, NULL); pthread_mutex_unlock(&pty_keepalive_lock); if (rc == PTY_REG_FULL) { errno = ENOSPC; @@ -1734,7 +1798,7 @@ uint32_t proc_pty_master_adopt(int guest_fd) } uint32_t existing_pts = UINT32_MAX; int rc = pty_keepalive_register_locked(canonical_host_fd, slave, pts_num, - slave_path, &existing_pts); + slave_path, false, &existing_pts); pthread_mutex_unlock(&fd_lock); pthread_mutex_unlock(&pty_keepalive_lock); if (rc == PTY_REG_FULL) { @@ -1770,24 +1834,117 @@ static int pty_lookup_slave_path(uint32_t linux_pts_num, return -1; } pthread_once(&pty_keepalive_once, pty_keepalive_init); + int hit = -1; pthread_mutex_lock(&pty_keepalive_lock); + /* Prefer a live entry (master still open in this process) over a stale + * path entry. Both encode the same slave_path for a given minor on macOS, + * so the preference only matters if a future change ever lets the two + * diverge - live wins by breaking out of the scan on first match. + */ for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { - if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE && - pty_keepalive_table[i].linux_pts_num == linux_pts_num) { + if (pty_keepalive_table[i].linux_pts_num != linux_pts_num) + continue; + if (pty_keepalive_table[i].slave_path[0] == '\0') + continue; + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) { + hit = i; + break; + } + if (!pty_keepalive_table[i].stale_open_once || + pty_keepalive_table[i].slave_host_fd < 0) + continue; + if (hit < 0) + hit = i; + } + if (hit < 0) { + pthread_mutex_unlock(&pty_keepalive_lock); + errno = ENOENT; + return -1; + } + size_t len = strlen(pty_keepalive_table[hit].slave_path); + if (len >= out_sz) { + pthread_mutex_unlock(&pty_keepalive_lock); + errno = ENAMETOOLONG; + return -1; + } + memcpy(out, pty_keepalive_table[hit].slave_path, len + 1); + pthread_mutex_unlock(&pty_keepalive_lock); + return 0; +} + +static int pty_open_slave(uint32_t linux_pts_num, int linux_flags) +{ + int oflags = translate_open_flags(linux_flags) & + (O_ACCMODE | O_NONBLOCK | O_CLOEXEC | O_NOCTTY); + char host_path[PTY_SLAVE_PATH_MAX]; + int stale_hit = -1; + int retained_slaves[PTY_KEEPALIVE_MAX]; + int nretained = 0; + int fd = -1; + + pthread_once(&pty_keepalive_once, pty_keepalive_init); + pthread_mutex_lock(&pty_keepalive_lock); + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].linux_pts_num != linux_pts_num) + continue; + if (pty_keepalive_table[i].slave_path[0] == '\0') + continue; + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) { size_t len = strlen(pty_keepalive_table[i].slave_path); - if (len >= out_sz) { + if (len >= sizeof(host_path)) { pthread_mutex_unlock(&pty_keepalive_lock); errno = ENAMETOOLONG; return -1; } - memcpy(out, pty_keepalive_table[i].slave_path, len + 1); + memcpy(host_path, pty_keepalive_table[i].slave_path, len + 1); pthread_mutex_unlock(&pty_keepalive_lock); - return 0; + return open(host_path, oflags); } + if (stale_hit < 0 && pty_keepalive_table[i].stale_open_once && + pty_keepalive_table[i].slave_host_fd >= 0) + stale_hit = i; + } + + if (stale_hit < 0) { + pthread_mutex_unlock(&pty_keepalive_lock); + errno = ENOENT; + return -1; + } + + /* Stale fork-child entries are one-shot. The retained slave fd pins the + * macOS tty while we translate the close-before-open sequence, preventing + * the cached path from resolving to a reused unrelated minor. Regardless + * of open success, consume the stale mapping before returning. + */ + size_t len = strlen(pty_keepalive_table[stale_hit].slave_path); + if (len >= sizeof(host_path)) { + int retained_slave = pty_keepalive_clear_slot_locked(stale_hit); + pthread_mutex_unlock(&pty_keepalive_lock); + if (retained_slave >= 0) + close(retained_slave); + errno = ENAMETOOLONG; + return -1; + } + memcpy(host_path, pty_keepalive_table[stale_hit].slave_path, len + 1); + fd = open(host_path, oflags); + int saved = errno; + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) + continue; + if (!pty_keepalive_table[i].stale_open_once) + continue; + if (strncmp(pty_keepalive_table[i].slave_path, host_path, + PTY_SLAVE_PATH_MAX) != 0) + continue; + int retained_slave = pty_keepalive_clear_slot_locked(i); + if (retained_slave >= 0 && nretained < PTY_KEEPALIVE_MAX) + retained_slaves[nretained++] = retained_slave; } pthread_mutex_unlock(&pty_keepalive_lock); - errno = ENOENT; - return -1; + for (int i = 0; i < nretained; i++) + close(retained_slaves[i]); + errno = saved; + return fd; } static int pty_open_pts_dir(int linux_flags) @@ -1805,9 +1962,21 @@ static int pty_open_pts_dir(int linux_flags) pthread_once(&pty_keepalive_once, pty_keepalive_init); pthread_mutex_lock(&pty_keepalive_lock); + /* Enumerate live masters and fork-child one-shot stale entries. The stale + * entries retain a slave fd until the first open attempt consumes them, so + * they cannot name a reused unrelated tty while they appear in readdir. + */ for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { - if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE) + if (pty_keepalive_table[i].slave_path[0] == '\0') + continue; + if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE && + (!pty_keepalive_table[i].stale_open_once || + pty_keepalive_table[i].slave_host_fd < 0)) continue; + /* The recycle/reuse-by-pts_num invariant in + * pty_keepalive_register_locked keeps at most one entry per minor, so + * no de-duplication pass is needed here. + */ pts_nums[pts_count++] = pty_keepalive_table[i].linux_pts_num; } pthread_mutex_unlock(&pty_keepalive_lock); @@ -1888,7 +2057,7 @@ void proc_pty_dup_keepalive_locked(int src_master_host_fd, } uint32_t existing_pts = UINT32_MAX; int rc = pty_keepalive_register_locked(dst_master_host_fd, dst_slave, - src_pts_num, src_slave_path, + src_pts_num, src_slave_path, false, &existing_pts); if (rc != PTY_REG_INSERTED) { /* Table full or duplicate entry for dst_master_host_fd; drop the @@ -1916,11 +2085,15 @@ void proc_pty_close_keepalive(int master_host_fd) pthread_mutex_lock(&pty_keepalive_lock); for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { if (pty_keepalive_table[i].master_host_fd == master_host_fd) { - slave = pty_keepalive_table[i].slave_host_fd; - pty_keepalive_table[i].master_host_fd = PTY_KEEPALIVE_FREE; - pty_keepalive_table[i].slave_host_fd = PTY_KEEPALIVE_FREE; - pty_keepalive_table[i].linux_pts_num = 0; - pty_keepalive_table[i].slave_path[0] = '\0'; + if (pty_keepalive_table[i].stale_open_once) { + /* Fork-restored child entry: retain the slave fd and path for + * one /dev/pts/N open after close(master). pty_open_slave + * consumes and closes it on the first translated open attempt. + */ + pty_keepalive_table[i].master_host_fd = PTY_KEEPALIVE_FREE; + } else { + slave = pty_keepalive_clear_slot_locked(i); + } break; } } @@ -1929,6 +2102,43 @@ void proc_pty_close_keepalive(int master_host_fd) close(slave); } +static void proc_pty_expire_stale_by_path(const char *slave_path) +{ + if (!slave_path || slave_path[0] == '\0') + return; + + int stale_slaves[PTY_KEEPALIVE_MAX]; + int nslaves = 0; + pthread_once(&pty_keepalive_once, pty_keepalive_init); + pthread_mutex_lock(&pty_keepalive_lock); + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) + continue; + if (!pty_keepalive_table[i].stale_open_once) + continue; + if (strncmp(pty_keepalive_table[i].slave_path, slave_path, + PTY_SLAVE_PATH_MAX) != 0) + continue; + int slave = pty_keepalive_clear_slot_locked(i); + if (slave >= 0 && nslaves < PTY_KEEPALIVE_MAX) + stale_slaves[nslaves++] = slave; + } + pthread_mutex_unlock(&pty_keepalive_lock); + for (int i = 0; i < nslaves; i++) + close(stale_slaves[i]); +} + +static int pty_keepalive_register_recycled(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path, + bool stale_open_once) +{ + proc_pty_expire_stale_by_path(slave_path); + return pty_keepalive_register(master_host_fd, slave_host_fd, linux_pts_num, + slave_path, stale_open_once); +} + int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries, int *out_slave_fds, int max_entries) @@ -2001,8 +2211,8 @@ void proc_pty_restore_keepalive(int master_host_fd, * would have truncated and reparsing here would yield the wrong number. */ errno = 0; - if (pty_keepalive_register(master_host_fd, slave_host_fd, linux_pts_num, - slave_path) < 0) { + if (pty_keepalive_register_recycled(master_host_fd, slave_host_fd, + linux_pts_num, slave_path, true) < 0) { if (slave_host_fd >= 0) close(slave_host_fd); } else if (errno == EEXIST) { @@ -2071,7 +2281,8 @@ static int pty_open_master(int linux_flags) return -1; } errno = 0; - if (pty_keepalive_register(master, slave, linux_pts_num, slave_path) < 0) { + if (pty_keepalive_register_recycled(master, slave, linux_pts_num, + slave_path, false) < 0) { close(slave); close(master); errno = EMFILE; @@ -2188,17 +2399,11 @@ int proc_intercept_open(const guest_t *g, errno = ENOENT; return -1; } - char host_path[PTY_SLAVE_PATH_MAX]; - if (pty_lookup_slave_path((uint32_t) n, host_path, sizeof(host_path)) < - 0) - return -1; /* /dev/pts/N is a character device; strip O_CREAT and friends so * the two-argument open(2) never sees a creation-mode-required * combination without a mode arg. */ - int oflags = translate_open_flags(linux_flags) & - (O_ACCMODE | O_NONBLOCK | O_CLOEXEC | O_NOCTTY); - return open(host_path, oflags); + return pty_open_slave((uint32_t) n, linux_flags); } /* /proc -> synthetic directory with PID entries for busybox ps, top, etc. diff --git a/src/syscall/io.c b/src/syscall/io.c index 59ef3e9..eafc798 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -1575,7 +1575,7 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) /* Set terminal window size. Same struct as TIOCGWINSZ; foot, sshd, * tmux, and any libvte-derived emulator call this on the PTY master * after spawning the slave child. Without it, terminal startup fails - * with -ENOTTY from the default arm below (issue #88). + * with -ENOTTY from the default arm below. * * A master received through SCM_RIGHTS bypasses /dev/ptmx open * interception, so lazily create its keepalive before the host ioctl. diff --git a/tests/test-pty.c b/tests/test-pty.c index ce77534..1702be7 100644 --- a/tests/test-pty.c +++ b/tests/test-pty.c @@ -5,12 +5,14 @@ * * Foot, sshd, tmux, and any libvte-derived terminal need the multiplexer * primitives glibc's posix_openpt(3) / ptsname(3) / openpty(3) stack rests - * on. Exercise the four pieces wired in for issue #88: + * on. Exercise the pieces glibc and posix-compliant pty children depend on: * * 1. TIOCSWINSZ on the /dev/ptmx master fd (the direct failure foot saw) * 2. TIOCGPTN -> /dev/pts/N path round trip * 3. TIOCSPTLCK(0) for unlockpt(), plus non-zero lock request fd typing * 4. /dev/pts/N open + stat intercept and the slave fd's window size + * 5. /dev/pts/N open in a forked child after the child has already closed + * its master (foot/sshd/openssh sftp-server pattern) * * The test stays self-contained on the syscall surface (no libutil/openpty), * so it runs the same way under elfuse-aarch64, qemu-aarch64, and any future @@ -52,7 +54,7 @@ int passes = 0, fails = 0; int main(void) { - printf("test-pty: PTY ioctl + /dev/pts/N path support (issue #88)\n"); + printf("test-pty: PTY ioctl + /dev/pts/N path support\n"); /* Regression guard for the pty_keepalive_table BSS-zero collision: any * close that runs before the very first /dev/ptmx open would walk the @@ -74,9 +76,9 @@ int main(void) return 1; } - /* TIOCSWINSZ on the master is the direct regression from issue #88. - * Before the fix, sys_ioctl had no case for it and fell through to the - * default -ENOTTY arm, breaking foot's terminal initialization. + /* TIOCSWINSZ on the master was the direct regression that broke foot's + * terminal startup: sys_ioctl had no case for it and fell through to the + * default -ENOTTY arm. */ TEST("TIOCSWINSZ on /dev/ptmx master"); struct winsize ws_set = { @@ -317,6 +319,109 @@ int main(void) EXPECT_TRUE(leak_loop_ok, "repeated /dev/ptmx open/close exhausted the keepalive table"); + /* foot/sshd/openssh sftp-server pattern: the child closes its inherited + * master fd before opening the slave (the child has no use for the + * master). Earlier, this dropped the keepalive table entry that held the + * macOS slave_path mapping, and the subsequent open("/dev/pts/N") in the + * child failed with ENOENT even though the parent still held the master + * and the macOS slave node was openable. The retained-path semantics in + * proc_pty_close_keepalive must let the child still translate the path. + */ + TEST("child can open /dev/pts/N after closing its master"); + int spawn_master = open("/dev/ptmx", O_RDWR | O_NOCTTY); + if (spawn_master < 0) { + FAIL("open(/dev/ptmx) for spawn scenario"); + } else { + unsigned int spawn_ptyno = (unsigned int) -1; + int unlock_zero = 0; + if (ioctl(spawn_master, TIOCGPTN, &spawn_ptyno) != 0 || + ioctl(spawn_master, TIOCSPTLCK, &unlock_zero) != 0) { + FAIL("TIOCGPTN/TIOCSPTLCK on spawn master"); + close(spawn_master); + } else { + char spawn_pts[32]; + snprintf(spawn_pts, sizeof(spawn_pts), "/dev/pts/%u", spawn_ptyno); + int spawn_pipe[2]; + if (pipe(spawn_pipe) != 0) { + FAIL("pipe for spawn scenario"); + close(spawn_master); + } else { + pid_t spawn_pid = fork(); + if (spawn_pid < 0) { + FAIL("fork for spawn scenario"); + close(spawn_pipe[0]); + close(spawn_pipe[1]); + close(spawn_master); + } else if (spawn_pid == 0) { + /* Child: foot's slave_exec sequence -- close the master, + * then open(pts_name) for the controlling terminal. + */ + close(spawn_pipe[0]); + if (setsid() < 0) + _exit(11); + close(spawn_master); + int slave_fd = open(spawn_pts, O_RDWR); + char status = (slave_fd >= 0) ? 'Y' : 'N'; + (void) !write(spawn_pipe[1], &status, 1); + if (slave_fd >= 0) { + (void) !write(slave_fd, "ok\n", 3); + close(slave_fd); + } + close(spawn_pipe[1]); + _exit(slave_fd >= 0 ? 0 : 12); + } else { + close(spawn_pipe[1]); + char status = '?'; + ssize_t n = read(spawn_pipe[0], &status, 1); + close(spawn_pipe[0]); + int wstatus = 0; + waitpid(spawn_pid, &wstatus, 0); + int spawn_ok = (n == 1) && (status == 'Y') && + WIFEXITED(wstatus) && + WEXITSTATUS(wstatus) == 0; + EXPECT_TRUE(spawn_ok, + "child open(/dev/pts/N) after close(master)"); + if (spawn_ok) { + char drain[16] = {0}; + int flags = fcntl(spawn_master, F_GETFL); + if (flags >= 0 && fcntl(spawn_master, F_SETFL, + flags | O_NONBLOCK) == 0) { + (void) !read(spawn_master, drain, + sizeof(drain) - 1); + (void) fcntl(spawn_master, F_SETFL, flags); + } + } + close(spawn_master); + + TEST("stale /dev/pts/N expires after master teardown"); + int stale_fd = open(spawn_pts, O_RDWR); + /* Both ENOENT (devfs node gone) and ENXIO (devfs node + * lingers but the pty pair has been torn down) are valid + * macOS responses depending on kernel version. The + * invariant the test guards is "the stale cached path + * does not silently hand back an unrelated tty"; any + * open failure satisfies that. + */ + int stale_ok = + stale_fd < 0 && (errno == ENOENT || errno == ENXIO); + if (stale_fd >= 0) + close(stale_fd); + EXPECT_TRUE(stale_ok, "stale /dev/pts/N stayed openable"); + } + } + } + } + + /* /dev/pts/N for a never-allocated minor must surface ENOENT (the cached + * paths in the keepalive table cannot satisfy an arbitrary number). + */ + TEST("/dev/pts/ returns ENOENT"); + int unknown = open("/dev/pts/999999", O_RDWR); + int unknown_ok = unknown < 0 && errno == ENOENT; + if (unknown >= 0) + close(unknown); + EXPECT_TRUE(unknown_ok, "open(/dev/pts/999999) did not return ENOENT"); + /* A pty master received via SCM_RIGHTS bypasses the /dev/ptmx open * intercept, so the receiver process has no keepalive entry for it. * proc_pty_master_adopt must lazily register one before master-side tty From 041fc9f8ffc59487f80695f3b26f320ec5581f7c Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Mon, 8 Jun 2026 17:47:50 +0800 Subject: [PATCH 3/7] Consolidate PTY keepalive helpers Extract pty_keepalive_lock_acquire and pty_keepalive_find_master_locked so the ten pthread_once + lock acquire pairs and four "scan by master_host_fd" loops collapse to one call site each. Drop the redundant BSS-zero field assignments from pty_keepalive_init now that pty_keepalive_clear_slot_locked resets every field on slot reclaim, so the BSS dependency only matters for the first-touch sentinel. Switch pty_keepalive_register_locked to str_copy_trunc, collapse three close + saved-errno blocks in pty_open_master into close_keep_errno, and fold the success and failure cleanup loops of fork_ipc_send_pty_keepalives so payload_slave_fds is closed once at the tail. Replace proc_pty_restore_keepalive's four scattered close paths with a single goto drop trailer and use ARRAY_SIZE in fork_ipc_send_pty_keepalives instead of an open-coded divisor. --- src/runtime/fork-state.c | 30 +++--- src/runtime/procemu.c | 212 ++++++++++++++++----------------------- 2 files changed, 98 insertions(+), 144 deletions(-) diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index ad283f9..3b2847a 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -463,9 +463,8 @@ int fork_ipc_send_pty_keepalives(int ipc_sock) /* PTY_KEEPALIVE_MAX upper bound on entries; allocate to that. */ proc_pty_ipc_entry_t snapshot[256]; int snapshot_slave_fds[256]; - const int max_entries = (int) (sizeof(snapshot) / sizeof(snapshot[0])); - int num_snap = - proc_pty_snapshot_keepalive(snapshot, snapshot_slave_fds, max_entries); + int num_snap = proc_pty_snapshot_keepalive(snapshot, snapshot_slave_fds, + ARRAY_SIZE(snapshot)); /* Match each keepalive's master_host_fd against a live fd_table entry to * recover the guest_fd, which is the stable identifier across fork. @@ -509,27 +508,22 @@ int fork_ipc_send_pty_keepalives(int ipc_sock) } pthread_mutex_unlock(&fd_lock); - if (fork_ipc_write_all(ipc_sock, &num_send, sizeof(num_send)) < 0) - goto fail; - if (num_send > 0) { + int rc = 0; + if (fork_ipc_write_all(ipc_sock, &num_send, sizeof(num_send)) < 0) { + rc = -1; + } else if (num_send > 0) { if (fork_ipc_write_all(ipc_sock, payload, - num_send * sizeof(payload[0])) < 0) - goto fail; - if (fork_ipc_send_fds(ipc_sock, payload_slave_fds, (int) num_send) < - 0) { + num_send * sizeof(payload[0])) < 0) { + rc = -1; + } else if (fork_ipc_send_fds(ipc_sock, payload_slave_fds, + (int) num_send) < 0) { log_error("clone: failed to send pty keepalive fds"); - goto fail; + rc = -1; } } - - for (uint32_t i = 0; i < num_send; i++) - close(payload_slave_fds[i]); - return 0; - -fail: for (uint32_t i = 0; i < num_send; i++) close(payload_slave_fds[i]); - return -1; + return rc; } int fork_ipc_recv_pty_keepalives(int ipc_fd) diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index cc9e4a1..da19ae7 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -1532,17 +1532,32 @@ static struct { static pthread_mutex_t pty_keepalive_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_once_t pty_keepalive_once = PTHREAD_ONCE_INIT; +/* Sentinel-init. Other fields stay BSS-zero; without sentinels a host fd 0 + * close would match slot 0 and close the wrong fd inside elfuse. + */ static void pty_keepalive_init(void) { for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { pty_keepalive_table[i].master_host_fd = PTY_KEEPALIVE_FREE; pty_keepalive_table[i].slave_host_fd = PTY_KEEPALIVE_FREE; - pty_keepalive_table[i].linux_pts_num = 0; - pty_keepalive_table[i].stale_open_once = false; - pty_keepalive_table[i].slave_path[0] = '\0'; } } +static void pty_keepalive_lock_acquire(void) +{ + pthread_once(&pty_keepalive_once, pty_keepalive_init); + pthread_mutex_lock(&pty_keepalive_lock); +} + +/* Find a slot by master_host_fd; -1 if none. Caller holds the lock. */ +static int pty_keepalive_find_master_locked(int master_host_fd) +{ + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) + if (pty_keepalive_table[i].master_host_fd == master_host_fd) + return i; + return -1; +} + static int pty_keepalive_clear_slot_locked(int slot) { int slave = pty_keepalive_table[slot].slave_host_fd; @@ -1648,13 +1663,11 @@ static int pty_keepalive_register_locked(int master_host_fd, pty_keepalive_table[slot].slave_host_fd = slave_host_fd; pty_keepalive_table[slot].linux_pts_num = linux_pts_num; pty_keepalive_table[slot].stale_open_once = stale_open_once; - if (slave_path) { - strncpy(pty_keepalive_table[slot].slave_path, slave_path, - PTY_SLAVE_PATH_MAX - 1); - pty_keepalive_table[slot].slave_path[PTY_SLAVE_PATH_MAX - 1] = '\0'; - } else { + if (slave_path) + str_copy_trunc(pty_keepalive_table[slot].slave_path, slave_path, + PTY_SLAVE_PATH_MAX); + else pty_keepalive_table[slot].slave_path[0] = '\0'; - } return PTY_REG_INSERTED; } @@ -1669,8 +1682,7 @@ static int pty_keepalive_register(int master_host_fd, const char *slave_path, bool stale_open_once) { - pthread_once(&pty_keepalive_once, pty_keepalive_init); - pthread_mutex_lock(&pty_keepalive_lock); + pty_keepalive_lock_acquire(); int rc = pty_keepalive_register_locked(master_host_fd, slave_host_fd, linux_pts_num, slave_path, stale_open_once, NULL); @@ -1688,15 +1700,10 @@ uint32_t proc_pty_master_pts_num(int master_host_fd) { if (master_host_fd < 0) return UINT32_MAX; - pthread_once(&pty_keepalive_once, pty_keepalive_init); - uint32_t pts_num = UINT32_MAX; - pthread_mutex_lock(&pty_keepalive_lock); - for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { - if (pty_keepalive_table[i].master_host_fd == master_host_fd) { - pts_num = pty_keepalive_table[i].linux_pts_num; - break; - } - } + pty_keepalive_lock_acquire(); + int slot = pty_keepalive_find_master_locked(master_host_fd); + uint32_t pts_num = + (slot < 0) ? UINT32_MAX : pty_keepalive_table[slot].linux_pts_num; pthread_mutex_unlock(&pty_keepalive_lock); return pts_num; } @@ -1784,8 +1791,7 @@ uint32_t proc_pty_master_adopt(int guest_fd) * between the validation read and the keepalive insert, so the keepalive * cannot attach to a recycled canonical host fd. */ - pthread_once(&pty_keepalive_once, pty_keepalive_init); - pthread_mutex_lock(&pty_keepalive_lock); + pty_keepalive_lock_acquire(); pthread_mutex_lock(&fd_lock); if (fd_table[guest_fd].type == FD_CLOSED || fd_table[guest_fd].host_fd != canonical_host_fd || @@ -1833,9 +1839,8 @@ static int pty_lookup_slave_path(uint32_t linux_pts_num, errno = EINVAL; return -1; } - pthread_once(&pty_keepalive_once, pty_keepalive_init); int hit = -1; - pthread_mutex_lock(&pty_keepalive_lock); + pty_keepalive_lock_acquire(); /* Prefer a live entry (master still open in this process) over a stale * path entry. Both encode the same slave_path for a given minor on macOS, * so the preference only matters if a future change ever lets the two @@ -1882,8 +1887,7 @@ static int pty_open_slave(uint32_t linux_pts_num, int linux_flags) int nretained = 0; int fd = -1; - pthread_once(&pty_keepalive_once, pty_keepalive_init); - pthread_mutex_lock(&pty_keepalive_lock); + pty_keepalive_lock_acquire(); for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { if (pty_keepalive_table[i].linux_pts_num != linux_pts_num) continue; @@ -1960,8 +1964,7 @@ static int pty_open_pts_dir(int linux_flags) if (!mkdtemp(dir)) return -1; - pthread_once(&pty_keepalive_once, pty_keepalive_init); - pthread_mutex_lock(&pty_keepalive_lock); + pty_keepalive_lock_acquire(); /* Enumerate live masters and fork-child one-shot stale entries. The stale * entries retain a slave fd until the first open attempt consumes them, so * they cannot name a reused unrelated tty while they appear in readdir. @@ -2011,8 +2014,7 @@ static int pty_open_pts_dir(int linux_flags) void proc_pty_lock_for_dup(void) { - pthread_once(&pty_keepalive_once, pty_keepalive_init); - pthread_mutex_lock(&pty_keepalive_lock); + pty_keepalive_lock_acquire(); } void proc_pty_unlock_for_dup(void) @@ -2023,29 +2025,20 @@ void proc_pty_unlock_for_dup(void) void proc_pty_dup_keepalive_locked(int src_master_host_fd, int dst_master_host_fd) { - /* Caller-holds-lock variant. Used by duplicate_guest_fd which brackets - * the source snapshot, host dup, and this mirror inside one - * pty_keepalive_lock window so a sibling close cannot run - * proc_pty_close_keepalive between the snapshot and the mirror -- the - * exact race that would leave the alias without a keepalive entry. - */ + /* Caller-holds-lock variant; see header for the dup race this guards. */ if (src_master_host_fd < 0 || dst_master_host_fd < 0) return; - int dst_slave = -1; - uint32_t src_pts_num = UINT32_MAX; - char src_slave_path[PTY_SLAVE_PATH_MAX] = {0}; - for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { - if (pty_keepalive_table[i].master_host_fd == src_master_host_fd) { - dst_slave = dup(pty_keepalive_table[i].slave_host_fd); - src_pts_num = pty_keepalive_table[i].linux_pts_num; - memcpy(src_slave_path, pty_keepalive_table[i].slave_path, - PTY_SLAVE_PATH_MAX); - break; - } - } + int slot = pty_keepalive_find_master_locked(src_master_host_fd); + if (slot < 0) + return; + int dst_slave = dup(pty_keepalive_table[slot].slave_host_fd); if (dst_slave < 0) return; + uint32_t src_pts_num = pty_keepalive_table[slot].linux_pts_num; + char src_slave_path[PTY_SLAVE_PATH_MAX]; + memcpy(src_slave_path, pty_keepalive_table[slot].slave_path, + PTY_SLAVE_PATH_MAX); /* dup(2) clears FD_CLOEXEC; the keepalive must not survive exec into * a guest child that has no map back to it. @@ -2055,10 +2048,9 @@ void proc_pty_dup_keepalive_locked(int src_master_host_fd, close(dst_slave); return; } - uint32_t existing_pts = UINT32_MAX; - int rc = pty_keepalive_register_locked(dst_master_host_fd, dst_slave, - src_pts_num, src_slave_path, false, - &existing_pts); + int rc = + pty_keepalive_register_locked(dst_master_host_fd, dst_slave, + src_pts_num, src_slave_path, false, NULL); if (rc != PTY_REG_INSERTED) { /* Table full or duplicate entry for dst_master_host_fd; drop the * redundant slave. Duplicate is unexpected: dst is a freshly-duped @@ -2072,29 +2064,23 @@ void proc_pty_dup_keepalive_locked(int src_master_host_fd, void proc_pty_close_keepalive(int master_host_fd) { /* fd_cleanup_entry calls this for every guest fd close, not just pty - * masters. Force the table to its FREE-sentinel state on first use: without - * this, the BSS-zero initial state lets the first close of host fd 0 (e.g. - * guest stdin) match slot 0 (master_host_fd=0) and close fd 0 inside - * elfuse. pthread_once is idempotent and cheap after the flag is set. + * masters; pty_keepalive_lock_acquire guarantees sentinel-init first. */ - pthread_once(&pty_keepalive_once, pty_keepalive_init); if (master_host_fd < 0) return; int slave = -1; - pthread_mutex_lock(&pty_keepalive_lock); - for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { - if (pty_keepalive_table[i].master_host_fd == master_host_fd) { - if (pty_keepalive_table[i].stale_open_once) { - /* Fork-restored child entry: retain the slave fd and path for - * one /dev/pts/N open after close(master). pty_open_slave - * consumes and closes it on the first translated open attempt. - */ - pty_keepalive_table[i].master_host_fd = PTY_KEEPALIVE_FREE; - } else { - slave = pty_keepalive_clear_slot_locked(i); - } - break; + pty_keepalive_lock_acquire(); + int slot = pty_keepalive_find_master_locked(master_host_fd); + if (slot >= 0) { + if (pty_keepalive_table[slot].stale_open_once) { + /* Fork-restored child entry: retain the slave fd and path for one + * /dev/pts/N open after close(master). pty_open_slave consumes and + * closes it on the first translated open attempt. + */ + pty_keepalive_table[slot].master_host_fd = PTY_KEEPALIVE_FREE; + } else { + slave = pty_keepalive_clear_slot_locked(slot); } } pthread_mutex_unlock(&pty_keepalive_lock); @@ -2109,8 +2095,7 @@ static void proc_pty_expire_stale_by_path(const char *slave_path) int stale_slaves[PTY_KEEPALIVE_MAX]; int nslaves = 0; - pthread_once(&pty_keepalive_once, pty_keepalive_init); - pthread_mutex_lock(&pty_keepalive_lock); + pty_keepalive_lock_acquire(); for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) continue; @@ -2146,9 +2131,8 @@ int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries, if (!out_entries || !out_slave_fds || max_entries <= 0) return 0; - pthread_once(&pty_keepalive_once, pty_keepalive_init); int n = 0; - pthread_mutex_lock(&pty_keepalive_lock); + pty_keepalive_lock_acquire(); for (int i = 0; i < PTY_KEEPALIVE_MAX && n < max_entries; i++) { if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE) continue; @@ -2179,63 +2163,48 @@ void proc_pty_restore_keepalive(int master_host_fd, uint32_t linux_pts_num, const char *slave_path) { - /* fork-IPC hand-off: the child just installed its own host fd for the pty - * master (a fresh kernel fd, not the parent's number) and received the - * keepalive slave fd via SCM_RIGHTS. Stitch the pair back together under - * the same (linux_pts_num, slave_path) the parent had so future /dev/pts/N - * opens in the child resolve to the same macOS tty. - * - * The slave fd arrived via SCM_RIGHTS without FD_CLOEXEC set; the keepalive - * must not survive exec into the guest's children, so set it here. On any - * failure, drop the slave fd rather than ship a keepalive whose lifetime - * elfuse cannot guarantee. + /* fork-IPC hand-off. SCM_RIGHTS drops FD_CLOEXEC; set it here so the + * keepalive does not survive exec. Any failure drops the slave fd. */ - if (master_host_fd < 0) { - if (slave_host_fd >= 0) - close(slave_host_fd); - return; - } + if (master_host_fd < 0) + goto drop; if (slave_host_fd >= 0) { int fdflags = fcntl(slave_host_fd, F_GETFD); if (fdflags < 0 || - fcntl(slave_host_fd, F_SETFD, fdflags | FD_CLOEXEC) < 0) { - close(slave_host_fd); - return; - } + fcntl(slave_host_fd, F_SETFD, fdflags | FD_CLOEXEC) < 0) + goto drop; } /* Trust the parent's linux_pts_num verbatim instead of re-parsing - * slave_path. The wire-format string is bounded to PTY_SLAVE_PATH_MAX-1 + * slave_path. The wire-format string is bounded to PTY_SLAVE_PATH_MAX - 1 * bytes; if a future macOS canonical form ever exceeded that, the parent - * would have truncated and reparsing here would yield the wrong number. + * would have truncated and reparsing here would yield the wrong number. On + * EEXIST the child's fd_table-restore path replayed master_host_fd over a + * prior recv-keepalive entry; drop the redundant slave so it does not leak. */ errno = 0; if (pty_keepalive_register_recycled(master_host_fd, slave_host_fd, - linux_pts_num, slave_path, true) < 0) { - if (slave_host_fd >= 0) - close(slave_host_fd); - } else if (errno == EEXIST) { - /* The child's fd_table-restore path can theoretically replay the - * same master_host_fd if a prior recv-keepalive entry was already - * installed for that number. Drop our redundant slave so it does - * not leak. - */ - if (slave_host_fd >= 0) - close(slave_host_fd); - } + linux_pts_num, slave_path, true) < 0 || + errno == EEXIST) + goto drop; + return; + +drop: + if (slave_host_fd >= 0) + close(slave_host_fd); } -/* Open /dev/ptmx, unlock the slave, and instantiate a keepalive slave fd - * so the master's tty ioctls work before the guest opens the slave itself. +/* Open /dev/ptmx, unlock the slave, and instantiate a keepalive slave fd so the + * master's tty ioctls work before the guest opens the slave itself. * Returns the master host fd on success, -1 with errno set on failure. */ static int pty_open_master(int linux_flags) { - /* /dev/ptmx is a character device; O_CREAT / O_TRUNC / O_EXCL make no - * sense here. Strip them and only honor accmode + descriptor flags so - * the host open(2) never sees a variadic-mode-required combination - * without a mode arg. + /* /dev/ptmx is a character device; O_CREAT / O_TRUNC / O_EXCL make no sense + * here. Strip them and only honor accmode + descriptor flags so the host + * open(2) never sees a variadic-mode-required combination without a mode + * arg. */ int oflags = translate_open_flags(linux_flags) & (O_ACCMODE | O_NONBLOCK | O_CLOEXEC | O_NOCTTY); @@ -2246,17 +2215,10 @@ static int pty_open_master(int linux_flags) /* grantpt(3) is a no-op on a unix98 pty mount, but call it for clarity * and to match what posix_openpt(3)'s callers expect to have happened. */ - if (grantpt(master) < 0 || unlockpt(master) < 0) { - int saved = errno; - close(master); - errno = saved; - return -1; - } char slave_path[PTY_SLAVE_PATH_MAX]; - if (ptsname_r(master, slave_path, sizeof(slave_path)) != 0) { - int saved = errno; - close(master); - errno = saved; + if (grantpt(master) < 0 || unlockpt(master) < 0 || + ptsname_r(master, slave_path, sizeof(slave_path)) != 0) { + close_keep_errno(master); return -1; } @@ -2275,9 +2237,7 @@ static int pty_open_master(int linux_flags) } int slave = open(slave_path, O_RDWR | O_NOCTTY | O_CLOEXEC); if (slave < 0) { - int saved = errno; - close(master); - errno = saved; + close_keep_errno(master); return -1; } errno = 0; From 59f37c0ad7c2c9c49772a4456d1856d9d6c32ce0 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Mon, 8 Jun 2026 17:49:12 +0800 Subject: [PATCH 4/7] Honor O_PATH on /dev/ptmx without pty pair Linux O_PATH means "path-only": the device hook must not run, no pty pair gets allocated, and the resulting fd only supports fstat plus *at-style operations. Forwarding the open to pty_open_master broke this because every probe of /dev/ptmx allocated a new pty and grew /dev/pts indefinitely. proc_intercept_open now short-circuits /dev/ptmx + O_PATH to a /dev/null backing fd. /dev/null is harmless, never a directory, and the guest's I/O and ioctl paths are already gated by FD_PATH so the backing fd is never visible. proc_intercept_stat synthesizes the matching character-device stat with rdev = (5, 2) so fstat through the FD_PATH gate reports the standard Linux ptmx device numbers, and the stat-translation layer's mac_to_linux_dev produces the right values in the guest's struct stat. sys_fstat picks up the synthetic stat when an FD_PATH fd carries a non-empty proc_path -- the route any future virtual-path-backed fd (/proc, /dev/ptmx, etc.) needs. To keep that proc_path safe under sibling close + reopen races, fold the proc_path install into fd_alloc_opened_host's existing (type, host_fd) tuple-revalidation window: the resolver runs before fd_lock and the install happens inside it, alongside linux_flags and the urandom bitmap. The previous post-publish unlocked write let a recycled slot inherit the stale proc_path string, which on the FD_PATH path could surface another file's fstat as /dev/ptmx. --- src/runtime/procemu.c | 36 +++++++++++++++++++-- src/syscall/fs-stat.c | 51 ++++++++++++++++++++++------- src/syscall/fs.c | 75 ++++++++++++++++++++++++++++--------------- tests/test-pty.c | 69 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 38 deletions(-) diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index da19ae7..297d102 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -2263,9 +2263,24 @@ int proc_intercept_open(const guest_t *g, int linux_flags, int mode) { - /* /dev/ptmx -> host /dev/ptmx + keepalive slave (see pty_open_master). */ - if (!strcmp(path, "/dev/ptmx")) + /* /dev/ptmx -> host /dev/ptmx + keepalive slave (see pty_open_master). + * O_PATH is path-only on Linux: it must not run the device open hook or + * allocate a pty pair. Use a harmless backing fd; FD_PATH gates I/O and + * ioctl, while proc_intercept_stat supplies the visible device metadata. + */ + if (!strcmp(path, "/dev/ptmx")) { + if (linux_flags & LINUX_O_PATH) { + if (linux_flags & LINUX_O_DIRECTORY) { + errno = ENOTDIR; + return -1; + } + int oflags = O_RDONLY; + if (linux_flags & LINUX_O_CLOEXEC) + oflags |= O_CLOEXEC; + return open("/dev/null", oflags); + } return pty_open_master(linux_flags); + } /* /dev/null, /dev/zero, /dev/(u)random, /dev/tty */ const char *host_dev = NULL; @@ -3294,6 +3309,23 @@ int proc_intercept_stat(const char *path, struct stat *st) if (!strcmp(path, "/dev/fuse")) return fuse_proc_stat(st); + /* Linux /dev/ptmx is the Unix98 pty multiplexer character device (5:2). + * Keep this synthetic so O_PATH probes can fstat the path fd without + * forcing a real host /dev/ptmx open, which would allocate a pty. + */ + if (!strcmp(path, "/dev/ptmx")) { + memset(st, 0, sizeof(*st)); + st->st_mode = S_IFCHR | 0666; + st->st_nlink = 1; + st->st_dev = PROC_SYNTH_DEV; + st->st_ino = proc_synth_ino(path); + st->st_uid = proc_get_uid(); + st->st_gid = proc_get_gid(); + st->st_rdev = ((dev_t) 5u << 24) | (dev_t) 2u; + st->st_blksize = 1024; + return 0; + } + /* /dev/shm is a directory */ if (!strcmp(path, "/dev/shm") || !strcmp(path, "/dev/shm/")) { stat_fill_proc_dir(st, 01777, 2, diff --git a/src/syscall/fs-stat.c b/src/syscall/fs-stat.c index eb584b7..d246016 100644 --- a/src/syscall/fs-stat.c +++ b/src/syscall/fs-stat.c @@ -212,29 +212,45 @@ static int64_t stat_at_path(guest_t *g, return 0; } - host_fd_ref_t dir_ref; - if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) - return -LINUX_EBADF; - int64_t rc = 0; + host_fd_ref_t dir_ref = {.fd = -1, .owned = false}; if ((flags & LINUX_AT_EMPTY_PATH) && pathp[0] == '\0') { /* Linux: AT_EMPTY_PATH with dirfd == AT_FDCWD operates on the * current working directory. */ - if (dir_ref.fd == AT_FDCWD) { + if (dirfd == LINUX_AT_FDCWD) { + dir_ref.fd = AT_FDCWD; int mac_flags = translate_at_flags(flags); if (fstatat(AT_FDCWD, ".", mac_st, mac_flags) < 0) { rc = linux_errno(); goto done; } - } else if (dir_ref.fd < 0) { - rc = -LINUX_EBADF; - goto done; - } else if (fstat(dir_ref.fd, mac_st) < 0) { - rc = linux_errno(); - goto done; + } else { + fd_entry_t snap; + dir_ref.fd = fd_snapshot_and_dup(dirfd, &snap); + dir_ref.owned = true; + if (dir_ref.fd < 0) { + rc = -LINUX_EBADF; + goto done; + } + if (snap.type == FD_PATH && snap.proc_path[0] != '\0') { + int intercepted = proc_intercept_stat(snap.proc_path, mac_st); + if (intercepted == 0) + goto done; + if (intercepted == -1) { + rc = linux_errno(); + goto done; + } + } + if (fstat(dir_ref.fd, mac_st) < 0) { + rc = linux_errno(); + goto done; + } } } else { + if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) + return -LINUX_EBADF; + int intercepted = PROC_NOT_INTERCEPTED; if (path_might_use_stat_intercept(tx.intercept_path)) { intercepted = proc_intercept_stat(tx.intercept_path, mac_st); @@ -275,6 +291,19 @@ int64_t sys_fstat(guest_t *g, int fd, uint64_t stat_gva) if (frc != -LINUX_EBADF) return frc; + fd_entry_t snap; + if (fd_snapshot(fd, &snap) && snap.type == FD_PATH && + snap.proc_path[0] != '\0') { + int intercepted = proc_intercept_stat(snap.proc_path, &mac_st); + if (intercepted == 0) { + if (write_linux_stat(g, stat_gva, &mac_st) < 0) + return -LINUX_EFAULT; + return 0; + } + if (intercepted == -1) + return linux_errno(); + } + host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) { log_debug("fstat(%d): invalid guest fd", fd); diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 3ccf699..627f29b 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -105,19 +105,33 @@ static const char *proc_stateful_file_path(const char *path) return NULL; } -static void fd_note_proc_path(int guest_fd, const char *path) +/* Resolve the proc_path the fd table should record for an intercepted path. + * Returns true and fills *out when a mapping exists; false otherwise so the + * caller can skip the install entirely. Pure string work; safe to call before + * any lock acquisition. + */ +static bool resolve_virtual_path(const char *path, char *out, size_t out_size) { - if (!path || strncmp(path, "/proc", 5) != 0) - return; + if (!path || out_size == 0) + return false; + + if (!strcmp(path, "/dev/ptmx")) { + str_copy_trunc(out, path, out_size); + return true; + } + + if (strncmp(path, "/proc", 5) != 0) + return false; char virt_buf[64]; const char *virt = proc_virtual_dir_path(path, virt_buf, sizeof(virt_buf)); if (!virt) virt = proc_stateful_file_path(path); + if (!virt) + return false; - if (virt) - str_copy_trunc(fd_table[guest_fd].proc_path, virt, - sizeof(fd_table[guest_fd].proc_path)); + str_copy_trunc(out, virt, out_size); + return true; } static const char *proc_virtual_dir_path(const char *path, @@ -185,7 +199,8 @@ static int fd_alloc_opened_host(int host_fd, int type, int linux_flags, int min_guest_fd, - void (*cleanup)(int)) + void (*cleanup)(int), + const char *virtual_path) { DIR *dir = NULL; @@ -213,16 +228,23 @@ static int fd_alloc_opened_host(int host_fd, return -1; } - /* Publish linux_flags, dir, and the urandom bitmap bit atomically - * with respect to the slot's identity. fd_alloc_*_relaxed drops + /* Resolve the virtual-path stamp before taking fd_lock; the helper is pure + * string work and must not run inside the critical section. + */ + char proc_path_buf[FD_VIRTUAL_PATH_MAX]; + bool have_proc_path = resolve_virtual_path(virtual_path, proc_path_buf, + sizeof(proc_path_buf)); + + /* Publish linux_flags, dir, proc_path, and the urandom bitmap bit + * atomically with respect to the slot's identity. fd_alloc_*_relaxed drops * fd_lock before returning, so a sibling vCPU's pathological - * close(guest_fd) + open() could reuse the slot between alloc and - * the metadata install below. Re-acquire fd_lock and verify the - * (type, host_fd) tuple still matches what just got allocated; - * if it does not, the slot belongs to a different file now and - * any install would clobber the sibling's entry. The sibling's - * close path already cleaned up our host_fd via fd_cleanup_entry, - * so this side only owns dir, which gets closed below. + * close(guest_fd) + open() could reuse the slot between alloc and the + * metadata install below. Re-acquire fd_lock and verify the + * (type, host_fd) tuple still matches what just got allocated; if it does + * not, the slot belongs to a different file now and any install would + * clobber the sibling's entry. The sibling's close path already cleaned up + * the host_fd of this side via fd_cleanup_entry, so this side only owns + * dir, which gets closed below. */ bool installed = false; pthread_mutex_lock(&fd_lock); @@ -231,6 +253,9 @@ static int fd_alloc_opened_host(int host_fd, fd_table[guest_fd].linux_flags = linux_flags; if (dir) fd_table[guest_fd].dir = dir; + if (have_proc_path) + memcpy(fd_table[guest_fd].proc_path, proc_path_buf, + sizeof(proc_path_buf)); bool readable_urandom = type == FD_URANDOM && (linux_flags & LINUX_O_ACCMODE) != LINUX_O_WRONLY; @@ -283,8 +308,8 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(sidecar_fd); return linux_errno(); } - int guest_fd = - fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1, NULL); + int guest_fd = fd_alloc_opened_host(sidecar_fd, type, linux_flags, + -1, NULL, NULL); if (guest_fd < 0) { close_keep_errno(sidecar_fd); return linux_errno(); @@ -314,7 +339,7 @@ int64_t sys_openat_path(guest_t *g, return linux_errno(); } int guest_fd = - fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -353,15 +378,14 @@ int64_t sys_openat_path(guest_t *g, } int min_guest_fd = (!strncmp(tx.intercept_path, "/dev/", 5)) ? -1 : 128; - int guest_fd = - fd_alloc_opened_host(intercepted, type, linux_flags, - min_guest_fd, fd_cleanup_for_type(type)); + int guest_fd = fd_alloc_opened_host( + intercepted, type, linux_flags, min_guest_fd, + fd_cleanup_for_type(type), tx.intercept_path); if (guest_fd < 0) { proc_pty_close_keepalive(intercepted); close_keep_errno(intercepted); return linux_errno(); } - fd_note_proc_path(guest_fd, tx.intercept_path); return guest_fd; } if (intercepted == -1) { @@ -382,7 +406,7 @@ int64_t sys_openat_path(guest_t *g, return linux_errno(); } int guest_fd = - fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -404,7 +428,8 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(host_fd); return linux_errno(); } - int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); + int guest_fd = + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); diff --git a/tests/test-pty.c b/tests/test-pty.c index 1702be7..ce6344f 100644 --- a/tests/test-pty.c +++ b/tests/test-pty.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -49,9 +50,43 @@ #ifndef TIOCGPTPEER #define TIOCGPTPEER 0x5441 #endif +#ifndef O_PATH +#define O_PATH 010000000 +#endif +#ifndef AT_EMPTY_PATH +#define AT_EMPTY_PATH 0x1000 +#endif +#ifndef SYS_statx +#define SYS_statx 291 +#endif int passes = 0, fails = 0; +static int count_pts_entries(void) +{ + DIR *dir = opendir("/dev/pts"); + if (!dir) + return -1; + + int count = 0; + struct dirent *ent; + while ((ent = readdir(dir))) { + if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) + continue; + int numeric = 1; + for (const char *p = ent->d_name; *p; p++) { + if (!isdigit((unsigned char) *p)) { + numeric = 0; + break; + } + } + if (numeric) + count++; + } + closedir(dir); + return count; +} + int main(void) { printf("test-pty: PTY ioctl + /dev/pts/N path support\n"); @@ -68,6 +103,40 @@ int main(void) */ close(STDIN_FILENO); + TEST("open(/dev/ptmx, O_PATH) does not allocate a pty"); + int before_path_pts = count_pts_entries(); + int path_fd = open("/dev/ptmx", O_PATH | O_CLOEXEC); + int after_path_pts = count_pts_entries(); + if (path_fd >= 0) { + struct stat ptmx_path_st; + int fstat_rc = fstat(path_fd, &ptmx_path_st); + struct stat ptmx_at_st; + int fstatat_rc = fstatat(path_fd, "", &ptmx_at_st, AT_EMPTY_PATH); + struct statx ptmx_sx; + memset(&ptmx_sx, 0, sizeof(ptmx_sx)); + long statx_rc = + syscall(SYS_statx, path_fd, "", AT_EMPTY_PATH, 0x7ff, &ptmx_sx); + errno = 0; + int fchdir_rc = fchdir(path_fd); + int fchdir_errno = errno; + int ok = before_path_pts >= 0 && after_path_pts == before_path_pts && + fstat_rc == 0 && S_ISCHR(ptmx_path_st.st_mode) && + major(ptmx_path_st.st_rdev) == 5 && + minor(ptmx_path_st.st_rdev) == 2 && fstatat_rc == 0 && + S_ISCHR(ptmx_at_st.st_mode) && + major(ptmx_at_st.st_rdev) == 5 && + minor(ptmx_at_st.st_rdev) == 2 && statx_rc == 0 && + S_ISCHR(ptmx_sx.stx_mode) && ptmx_sx.stx_rdev_major == 5 && + ptmx_sx.stx_rdev_minor == 2 && fchdir_rc < 0 && + fchdir_errno == ENOTDIR; + close(path_fd); + EXPECT_TRUE(ok, + "O_PATH open allocated a pty or exposed wrong path fd " + "semantics"); + } else { + FAIL("O_PATH /dev/ptmx open failed"); + } + int ptmx = open("/dev/ptmx", O_RDWR | O_NOCTTY); TEST("open(/dev/ptmx, O_RDWR | O_NOCTTY)"); EXPECT_TRUE(ptmx >= 0, "open /dev/ptmx failed"); From 520568cd1861165a21c49b7d19d748a0f924fd3b Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Tue, 9 Jun 2026 09:28:24 +0800 Subject: [PATCH 5/7] Harden runtime around foot Bundles the elfuse-side fixes the foot + Wayland compositor path needed to stop crashing after the PTY emulation work landed. guest.c: drain worker vCPUs in guest_destroy before any hv_vm_unmap. thread_destroy_all_vcpus releases handles but does not block on the owning pthread leaving hv_vcpu_run, so a worker still inside the guest at unmap time took a stage-2 translation fault on its next instruction fetch. exit_group already runs request + interrupt + join; the destroy path needs the same prefix because forkipc.c's vcpu_run_loop returns straight into guest_destroy without going through the guest exit_group handler. Wake signals (futex, wakeup-pipe, hv_vcpus_exit) cover workers blocked outside the vCPU loop so the 100ms join cap does not detach live pthreads onto the imminent munmap. futex.[ch]: futex_interrupt_consume is now a test-and-clear edge trigger; previously the sticky flag forkipc.c set on the last clone-thread exit kept every later epoll_pwait/ppoll/futex_wait returning -EINTR until execve cleared it, and in foot's case execve never came. poll.c, signal.c, and the futex wait paths switch to the consume variant. mem.c: hvf_apply_file_overlay refuses MAP_SHARED of a read-only backing fd up front. Apple HVF rejects post-overlay hv_vm_map with HV_DENIED when the underlying host VA loses write capability, so the overlay path silently swapped MAP_SHARED ranges to MAP_PRIVATE snapshots; routing read-only fds straight to the pread fallback returns -EACCES at the right layer and keeps the fork-child overlay re-install path quiet. io.c + abi.h: fallocate now handles FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE. The Linux semantic (reads in the punched range return zero, file size unchanged) maps to macOS F_PUNCHHOLE when the offset and length are block-aligned, and falls back to pwrite of zero pages for the one-byte probe foot's wl_shm pool issues at startup. Without the fallback, the probe returned EINVAL and foot disabled punch-hole for the whole session. crashreport.c: when an EL0 fault crash report fires, dump the guest page-table walk for the faulting VA plus the segment and region that should have backed it. The PT walk surfaced the hvf_segments=0 smoking gun behind the worker-drain race. --- src/core/guest.c | 34 +++++- src/debug/crashreport.c | 231 +++++++++++++++++++++++++++++++++++++++- src/runtime/futex.c | 146 +++++++++++++------------ src/runtime/futex.h | 1 + src/syscall/abi.h | 7 ++ src/syscall/io.c | 79 +++++++++++++- src/syscall/mem.c | 47 +++++++- src/syscall/poll.c | 6 +- src/syscall/signal.c | 10 ++ src/syscall/signal.h | 11 ++ tests/test-futex-pi.c | 100 ++++++++--------- tests/test-io-opt.c | 31 ++++++ 12 files changed, 564 insertions(+), 139 deletions(-) diff --git a/src/core/guest.c b/src/core/guest.c index fca75b3..b5afcbf 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -42,7 +42,10 @@ #include "core/startup-trace.h" #include "debug/log.h" #include "utils.h" +#include "runtime/futex.h" /* futex_interrupt_request */ #include "runtime/thread.h" /* thread_destroy_all_vcpus */ +#include "syscall/poll.h" /* wakeup_pipe_signal */ +#include "syscall/proc.h" /* proc_request_exit_group */ /* Per-vCPU pending TLBI request. Zero-initialized in every host pthread * by virtue of TLS default-zeroing, which maps to TLBI_NONE. @@ -575,9 +578,34 @@ static void release_extra_mappings(guest_t *g) void guest_destroy(guest_t *g) { - /* Destroy all worker vCPUs (thread table) before tearing down the VM. - * This prevents hv_vm_destroy from racing with active vCPUs that may still - * be running if thread join timed out during exit_group. + /* Quiesce worker vCPUs before unmapping stage-2. thread_destroy_all_vcpus + * only releases vCPU handles; it does not wait for the owning pthread to + * leave hv_vcpu_run. A worker still inside the guest at unmap time takes + * a stage-2 translation fault on its next instruction fetch and surfaces + * as "unexpected exception EC=0x20" in the crash report. PR #89's foot + * reproduction tripped exactly that race. The exit_group syscall handler + * already runs request, interrupt, and join before its own teardown; the + * destroy path needs the same prefix because forkipc.c:vcpu_run_loop + * returns straight into guest_destroy without going through the guest + * exit_group handler. The request is guarded on the prior state so a + * process that already chose its exit code keeps it intact. + * + * The wake signals cover workers blocked outside hv_vcpu_run: futex + * waiters poll futex_interrupt_requested, and any thread parked in + * epoll or poll wakes off the shared pipe. Without them, host-blocked + * workers miss the hv_vcpus_exit kick (which only affects threads + * inside hv_vcpu_run) and the 100ms join cap in thread_join_workers + * detaches them, leaving live pthreads to crash on the imminent munmap. + */ + if (!proc_exit_group_requested()) + proc_request_exit_group(0); + futex_interrupt_request(); + wakeup_pipe_signal(); + thread_interrupt_all(); + thread_join_workers(); + /* Destroy all remaining worker vCPUs (thread table) before tearing down + * the VM. This prevents hv_vm_destroy from racing with active vCPUs that + * may still be running if thread join timed out during exit_group. */ thread_destroy_all_vcpus(); if (g->vcpu) { diff --git a/src/debug/crashreport.c b/src/debug/crashreport.c index 2c4b789..a5d5ef5 100644 --- a/src/debug/crashreport.c +++ b/src/debug/crashreport.c @@ -19,6 +19,19 @@ #include "syscall/proc.h" +/* Page-table descriptor bit definitions used by the diagnostic walker below. + * The full set lives in src/core/guest.c next to the helpers that build + * descriptors; the crash walker needs only the fields it prints, so keep the + * duplication minimal rather than promoting a private header. + */ +#define CR_PT_VALID 1ULL +#define CR_PT_TABLE 2ULL +#define CR_PT_ADDR_MASK 0xFFFFFFFFF000ULL +#define CR_L2_BLOCK_ADDR_MASK 0xFFFFFFE00000ULL +#define CR_BLOCK_1GIB (1024ULL * 1024 * 1024) +#define CR_BLOCK_2MIB (2ULL * 1024 * 1024) +#define CR_PAGE_SIZE 4096ULL + /* Read a sysctl string into buf (NUL-terminated). Returns 0 on success. */ static int sysctl_str(const char *name, char *buf, size_t bufsz) { @@ -111,6 +124,201 @@ static const char *esr_ec_name(uint64_t esr) } } +static bool esr_is_data_abort(uint64_t esr) +{ + unsigned ec = (unsigned) ((esr >> 26) & 0x3f); + return ec == 0x24 || ec == 0x25; +} + +/* Walk the guest stage-1 page tables for `va` and print L0/L1/L2/L3 entries + * in raw form so the crash report localises an "unmapped" claim either to the + * guest PT (entry is 0 or PT_VALID clear) or to a downstream stage-2 hole. + * The walker mirrors gva_translate_perm in src/core/guest.c but accepts any + * PT_VALID descriptor instead of enforcing requested permissions, so a + * non-executable or EL1-only page still prints with its actual contents. + * Silently skips when g, host_base, or ttbr0 are missing -- the data needed + * for the dump is gone before the walker can stage anything useful. + */ +static bool dump_pt_walk_for_va(const guest_t *g, + uint64_t va, + uint64_t *ipa_out) +{ + if (!g || !g->host_base || !g->ttbr0) + return false; + + uint64_t base = g->ipa_base; + if (g->ttbr0 < base || g->ttbr0 - base >= g->guest_size) { + fprintf(stderr, " PT walk: TTBR0 0x%llx out of slab range\n", + (unsigned long long) g->ttbr0); + return false; + } + uint64_t l0_off = g->ttbr0 - base; + const uint64_t *l0 = + (const uint64_t *) ((const uint8_t *) g->host_base + l0_off); + unsigned l0_idx = (unsigned) (va / (512ULL * CR_BLOCK_1GIB)); + if (l0_idx >= 512) { + fprintf(stderr, " PT walk: L0 index %u out of range\n", l0_idx); + return false; + } + uint64_t l0_entry = l0[l0_idx]; + fprintf(stderr, " L0[%u]=0x%llx", l0_idx, (unsigned long long) l0_entry); + if (!(l0_entry & CR_PT_VALID)) { + fprintf(stderr, " INVALID\n"); + return false; + } + + uint64_t l1_ipa = l0_entry & CR_PT_ADDR_MASK; + if (l1_ipa < base || l1_ipa - base >= g->guest_size) { + fprintf(stderr, " L1@0x%llx out-of-slab\n", + (unsigned long long) l1_ipa); + return false; + } + const uint64_t *l1 = + (const uint64_t *) ((const uint8_t *) g->host_base + (l1_ipa - base)); + unsigned l1_idx = (unsigned) ((va / CR_BLOCK_1GIB) % 512); + uint64_t l1_entry = l1[l1_idx]; + fprintf(stderr, " L1[%u]=0x%llx", l1_idx, (unsigned long long) l1_entry); + if (!(l1_entry & CR_PT_VALID)) { + fprintf(stderr, " INVALID\n"); + return false; + } + + uint64_t l2_ipa = l1_entry & CR_PT_ADDR_MASK; + if (l2_ipa < base || l2_ipa - base >= g->guest_size) { + fprintf(stderr, " L2@0x%llx out-of-slab\n", + (unsigned long long) l2_ipa); + return false; + } + const uint64_t *l2 = + (const uint64_t *) ((const uint8_t *) g->host_base + (l2_ipa - base)); + unsigned l2_idx = (unsigned) ((va / CR_BLOCK_2MIB) % 512); + uint64_t l2_entry = l2[l2_idx]; + fprintf(stderr, " L2[%u]=0x%llx", l2_idx, (unsigned long long) l2_entry); + if (!(l2_entry & CR_PT_VALID)) { + fprintf(stderr, " INVALID\n"); + return false; + } + + /* A valid L2 entry is either a 2MiB block (bit1=0) or a table descriptor + * pointing at a 4KiB L3 page. Only the table case requires another walk. + */ + if (!(l2_entry & CR_PT_TABLE)) { + uint64_t block_ipa = l2_entry & CR_L2_BLOCK_ADDR_MASK; + uint64_t translated_ipa = block_ipa + (va & (CR_BLOCK_2MIB - 1)); + fprintf(stderr, " (2MiB block -> IPA 0x%llx)\n", + (unsigned long long) translated_ipa); + if (ipa_out) + *ipa_out = translated_ipa; + return true; + } + + uint64_t l3_ipa = l2_entry & CR_PT_ADDR_MASK; + if (l3_ipa < base || l3_ipa - base >= g->guest_size) { + fprintf(stderr, " L3@0x%llx out-of-slab\n", + (unsigned long long) l3_ipa); + return false; + } + const uint64_t *l3 = + (const uint64_t *) ((const uint8_t *) g->host_base + (l3_ipa - base)); + unsigned l3_idx = (unsigned) ((va / CR_PAGE_SIZE) % 512); + uint64_t l3_entry = l3[l3_idx]; + if (!(l3_entry & CR_PT_VALID)) { + fprintf(stderr, " L3[%u]=0x%llx INVALID\n", l3_idx, + (unsigned long long) l3_entry); + return false; + } + + uint64_t page_ipa = l3_entry & CR_PT_ADDR_MASK; + uint64_t translated_ipa = page_ipa + (va & (CR_PAGE_SIZE - 1)); + fprintf(stderr, " L3[%u]=0x%llx -> IPA 0x%llx\n", l3_idx, + (unsigned long long) l3_entry, (unsigned long long) translated_ipa); + if (ipa_out) + *ipa_out = translated_ipa; + return true; +} + +/* Print the HVF stage-2 backing range covering `ipa` plus the region-tracker + * entry whose VA range includes the same address. A missing stage-2 backing is + * the downstream analogue of an INVALID L3 entry above; a missing region + * tracker is normal for non-tracked ranges (vDSO, shim) but useful for "is + * this VA in a known ELF segment" reasoning. + */ +static void dump_segment_and_region_for(const guest_t *g, + uint64_t va, + bool have_ipa, + uint64_t ipa) +{ + if (!g) + return; + + if (have_ipa) { + bool found_seg = false; + for (int i = 0; i < g->n_segments; i++) { + const hvf_segment_t *s = &g->segments[i]; + if (ipa >= s->ipa && ipa < s->ipa + s->len) { + fprintf(stderr, + " HVF segment[%d]: ipa=0x%llx len=0x%llx " + "(covers translated IPA 0x%llx)\n", + i, (unsigned long long) s->ipa, + (unsigned long long) s->len, (unsigned long long) ipa); + found_seg = true; + break; + } + } + if (!found_seg) { + const guest_mapping_t *m = guest_find_mapping(g, ipa); + if (m) { + fprintf(stderr, + " HVF mapping: gpa=0x%llx size=0x%zx " + "(covers translated IPA 0x%llx)\n", + (unsigned long long) m->gpa, m->size, + (unsigned long long) ipa); + found_seg = true; + } + } + if (!found_seg) { + const guest_overflow_t *o = guest_find_overflow(g, ipa); + if (o) { + fprintf(stderr, + " HVF overflow: ipa=0x%llx size=0x%llx " + "(covers translated IPA 0x%llx)\n", + (unsigned long long) o->ipa_start, + (unsigned long long) o->size, (unsigned long long) ipa); + found_seg = true; + } + } + if (!found_seg) + fprintf(stderr, + " HVF backing: NONE (stage-2 hole for IPA 0x%llx)\n", + (unsigned long long) ipa); + } else { + fprintf(stderr, + " HVF backing: not checked (no valid stage-1 translation)\n"); + } + + const guest_region_t *r = guest_region_find(g, va); + if (r) { + fprintf(stderr, " region: [0x%llx, 0x%llx) prot=%c%c%c name=\"%s\"\n", + (unsigned long long) r->start, (unsigned long long) r->end, + (r->prot & 1) ? 'r' : '-', (r->prot & 2) ? 'w' : '-', + (r->prot & 4) ? 'x' : '-', r->name[0] ? r->name : "(unnamed)"); + } else { + fprintf(stderr, " region: NONE\n"); + } +} + +static void dump_translation_diagnostics_for(const guest_t *g, + const char *label, + uint64_t va) +{ + fprintf(stderr, "## Translation diagnostics (%s=0x%llx)\n", label, + (unsigned long long) va); + uint64_t ipa = 0; + bool have_ipa = dump_pt_walk_for_va(g, va, &ipa); + dump_segment_and_region_for(g, va, have_ipa, ipa); + fprintf(stderr, "\n"); +} + void crash_report(hv_vcpu_t vcpu, const guest_t *g, crash_type_t type, @@ -247,7 +455,28 @@ void crash_report(hv_vcpu_t vcpu, fprintf(stderr, "interp_base = 0x%llx mmap_limit = 0x%llx\n", (unsigned long long) g->interp_base, (unsigned long long) g->mmap_limit); - fprintf(stderr, "nregions = %d\n\n", g->nregions); + fprintf(stderr, "nregions = %d hvf_segments = %d\n", g->nregions, + g->n_segments); + fprintf(stderr, "ttbr0 = 0x%llx\n\n", + (unsigned long long) g->ttbr0); + + /* Translation diagnostics for the faulting PC. Helps decide whether + * an inst-abort or data-abort came from a stage-1 PT entry that went + * away (L3 INVALID after an unrelated mprotect / munmap) or from a + * stage-2 backing hole (for example, an hvf_segment_split that left no + * covering entry). Data aborts also dump FAR when it differs from PC + * because FAR is the actual load/store fault address. + */ + if (vcpu) { + uint64_t pc = 0, esr = 0, far_reg = 0; + hv_vcpu_get_reg(vcpu, HV_REG_PC, &pc); + hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_ESR_EL1, &esr); + hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_FAR_EL1, &far_reg); + if (pc) + dump_translation_diagnostics_for(g, "PC", pc); + if (esr_is_data_abort(esr) && far_reg && far_reg != pc) + dump_translation_diagnostics_for(g, "FAR", far_reg); + } } fprintf(stderr, diff --git a/src/runtime/futex.c b/src/runtime/futex.c index 683d95c..c167c53 100644 --- a/src/runtime/futex.c +++ b/src/runtime/futex.c @@ -34,6 +34,7 @@ #include "syscall/abi.h" #include "syscall/proc.h" +#include "syscall/signal.h" #include "debug/log.h" @@ -103,10 +104,12 @@ static _Atomic int futex_interrupt_requested = 0; * * The wait quantum is capped at 100 ms so proc_exit_group_requested() and * futex_interrupt_pending() get noticed promptly without a process-wide - * broadcast channel. The 1-second EINTR simulation that the bucket path uses - * for shutdown-stalled multi-threaded runtimes is preserved here, but only - * once more than one guest thread is active. Single-threaded guests should not - * see synthetic EINTR churn on indefinite waits. + * broadcast channel. EINTR is only synthesized when an actual deliverable + * signal is queued for this thread (signal_pending_lockfree) or when a guest + * itimer expires under the poll loop's signal_check_timer poke. Earlier + * revisions returned -EINTR after one unconditional second of waiting to + * unblock shutdown-stalled multi-threaded runtimes, but that broke POSIX + * sem_wait callers that do not retry on EINTR (e.g. foot's render worker). */ #if ELFUSE_HAVE_OS_SYNC_WAIT_ON_ADDRESS static bool os_sync_available; @@ -114,12 +117,6 @@ static bool os_sync_wait_enabled; #endif #define FUTEX_OS_SYNC_POLL_CAP_NS (100ULL * 1000 * 1000) -#define FUTEX_OS_SYNC_EINTR_SIM_MS 1000 - -static inline bool futex_should_simulate_periodic_eintr(void) -{ - return !thread_is_single_active(); -} /* Hash table */ @@ -222,6 +219,22 @@ int futex_interrupt_pending(void) return atomic_load(&futex_interrupt_requested); } +/* Test-and-clear: returns 1 if the interrupt request was pending and atomically + * clears it, 0 otherwise. The interrupt is a one-shot edge: forkipc.c sets it + * when the last clone-thread exits so the main thread observes EINTR in its + * next blocking wait, mirroring how real Linux delivers SIGCHLD. Without the + * clear, the flag stays set and every subsequent epoll_pwait, ppoll, futex + * wait, etc. spins on EINTR until execve clears it -- in foot's case it never + * does, and the spinning main thread eventually faults in a code path the + * guest never expects to reach. + */ +int futex_interrupt_consume(void) +{ + int expected = 1; + return atomic_compare_exchange_strong(&futex_interrupt_requested, &expected, + 0); +} + /* Cap on guest-supplied tv_sec. The cap exists purely so the int64_t / time_t * arithmetic in the deadline conversion (now.tv_sec + delta_sec, where * delta_sec = lts.tv_sec - mono.tv_sec) cannot overflow even for adversarial @@ -392,19 +405,13 @@ static int64_t futex_os_sync_wait(guest_t *g, if (current != expected) return -LINUX_EAGAIN; - struct timeval wait_start; - bool simulate_periodic_eintr = - !has_timeout && futex_should_simulate_periodic_eintr(); - if (simulate_periodic_eintr) - gettimeofday(&wait_start, NULL); - /* Bound consecutive EFAULT retries. Apple documents EFAULT as transient * (kernel copyin failure under memory pressure), so a few retries are fine; * but a genuinely bad page would otherwise cause the loop to spin with no * real sleep -- timeout_ns is supplied to syscall that returns immediately - * -- until either the user deadline or the 1-second EINTR simulation - * finally bails out. Surface EFAULT to the guest after this many - * back-to-back failures so the host CPU does not burn for ~1 s. + * -- until the user deadline finally bails out. Surface EFAULT to the + * guest after this many back-to-back failures so the host CPU does not + * burn for ~1 s. */ int efault_retries = 0; @@ -436,17 +443,22 @@ static int64_t futex_os_sync_wait(guest_t *g, efault_retries = 0; } - if (proc_exit_group_requested() || futex_interrupt_pending()) + if (proc_exit_group_requested() || futex_interrupt_consume()) return -LINUX_EINTR; - if (simulate_periodic_eintr) { - struct timeval now; - gettimeofday(&now, NULL); - long elapsed_ms = (now.tv_sec - wait_start.tv_sec) * 1000 + - (now.tv_usec - wait_start.tv_usec) / 1000; - if (elapsed_ms >= FUTEX_OS_SYNC_EINTR_SIM_MS) - return -LINUX_EINTR; - } + /* Drain any expired guest itimer so its SIGALRM / SIGVTALRM / SIGPROF + * queues into sig_state.pending; without this poke, a guest with all + * threads parked in futex_wait would never advance the timers. + */ + signal_check_timer(); + + /* Return EINTR only when a real deliverable signal is queued for + * this thread. POSIX callers (e.g. glibc sem_wait, foot's render + * worker) often do not retry on EINTR, so synthetic spurious + * wakeups cannot be issued here. + */ + if (signal_pending_lockfree()) + return -LINUX_EINTR; /* For has_timeout: futex_remaining_ns returns 0 next iteration once * the user deadline elapses, so the loop exits with -ETIMEDOUT. */ @@ -528,22 +540,6 @@ static int64_t futex_wait(guest_t *g, /* Wait until woken or timeout */ int ret = 0; - /* Record start time for the no-timeout path. On real Linux, any pending - * signal interrupts futex_wait with -EINTR. Without a timer signal - * (SIGVTALRM from timer_create/setitimer), some multi-threaded runtimes - * can deadlock when a thread blocks in futex_wait and no wakeup arrives - * (e.g., a shutdown signal delivered to the wrong I/O manager). - * FUTEX_WAIT returns -EINTR after 1 second of blocking to simulate - * periodic signal delivery. All real futex callers (musl, glibc, and - * other managed runtimes) handle -EINTR correctly by re-checking their - * condition and retrying. - */ - struct timeval wait_start; - bool simulate_periodic_eintr = - !has_timeout && futex_should_simulate_periodic_eintr(); - if (simulate_periodic_eintr) - gettimeofday(&wait_start, NULL); - while (!__atomic_load_n(&waiter.woken, __ATOMIC_ACQUIRE)) { if (has_timeout) { int rc = pthread_cond_timedwait(&waiter.cond, &b->lock, &deadline); @@ -552,35 +548,43 @@ static int64_t futex_wait(guest_t *g, ret = -LINUX_ETIMEDOUT; break; } - } else { - /* No timeout specified: poll every 100ms to check for exit_group, - * futex_interrupt (simulated SIGCHLD), or excessive wait time - * (simulated signal interruption). - */ - struct timespec poll_ts; - timespec_deadline_in_ms(&poll_ts, 100); - pthread_cond_timedwait(&waiter.cond, &b->lock, &poll_ts); + continue; + } - if (proc_exit_group_requested() || futex_interrupt_pending()) { - ret = -LINUX_EINTR; - break; - } + /* No timeout specified: poll every 100 ms to check for exit_group, + * futex_interrupt, expired guest itimers, and queued signals. + */ + struct timespec poll_ts; + timespec_deadline_in_ms(&poll_ts, 100); + pthread_cond_timedwait(&waiter.cond, &b->lock, &poll_ts); - /* Simulate periodic signal delivery only for multi-threaded - * guests. Single-threaded glibc startup paths can legitimately - * park in FUTEX_WAIT forever until a real wake arrives, and - * synthetic EINTR here breaks that contract. - */ - if (simulate_periodic_eintr) { - struct timeval now; - gettimeofday(&now, NULL); - long elapsed_ms = (now.tv_sec - wait_start.tv_sec) * 1000 + - (now.tv_usec - wait_start.tv_usec) / 1000; - if (elapsed_ms >= FUTEX_OS_SYNC_EINTR_SIM_MS) { - ret = -LINUX_EINTR; - break; - } - } + if (proc_exit_group_requested() || futex_interrupt_consume()) { + ret = -LINUX_EINTR; + break; + } + + /* Lock-order: bucket lock(7) outranks sig_lock(4), so signal_pending() + * and signal_check_timer() may only be called once the bucket lock has + * been released. Drop it, poke the itimers, observe queued signals, + * then re-acquire and re-check waiter.woken in case a wake landed in + * the window. + */ + pthread_mutex_unlock(&b->lock); + signal_check_timer(); + bool sig_ready = signal_pending_lockfree(); + pthread_mutex_lock(&b->lock); + + if (__atomic_load_n(&waiter.woken, __ATOMIC_ACQUIRE)) + break; + + /* Return EINTR only when a real deliverable signal is queued for + * this thread. POSIX callers (e.g. glibc sem_wait, foot's render + * worker) often do not retry on EINTR, so synthetic spurious + * wakeups cannot be issued here. + */ + if (sig_ready) { + ret = -LINUX_EINTR; + break; } } diff --git a/src/runtime/futex.h b/src/runtime/futex.h index 79f60eb..186b6a8 100644 --- a/src/runtime/futex.h +++ b/src/runtime/futex.h @@ -27,6 +27,7 @@ void futex_init(void); void futex_interrupt_request(void); void futex_interrupt_clear(void); int futex_interrupt_pending(void); +int futex_interrupt_consume(void); /* Main futex syscall entry point. * op: futex operation (FUTEX_WAIT, FUTEX_WAKE, etc.) diff --git a/src/syscall/abi.h b/src/syscall/abi.h index cecc5c1..c15121a 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -398,6 +398,13 @@ typedef struct { #define LINUX_O_CLOEXEC 0x80000 /* 02000000 octal */ #define LINUX_O_PATH 0x200000 /* 010000000 octal */ +/* Linux fallocate(2) mode bits (linux/falloc.h). PUNCH_HOLE requires the + * caller to also set KEEP_SIZE per the manpage; collapse/insert/zero/unshare + * range modes are recognised numerically but elsewhere unsupported. + */ +#define LINUX_FALLOC_FL_KEEP_SIZE 0x01 +#define LINUX_FALLOC_FL_PUNCH_HOLE 0x02 + /* Linux AT_* constants. */ #define LINUX_AT_FDCWD (-100) #define LINUX_AT_SYMLINK_NOFOLLOW 0x100 diff --git a/src/syscall/io.c b/src/syscall/io.c index eafc798..879db75 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -1878,8 +1878,83 @@ int64_t sys_fallocate(int fd, int mode, int64_t offset, int64_t len) return -LINUX_EINVAL; } - /* mode 0 = basic allocation -> ftruncate fallback. - * Other modes (FALLOC_FL_PUNCH_HOLE etc.) not supported. + /* FALLOC_FL_PUNCH_HOLE always requires FALLOC_FL_KEEP_SIZE on Linux; + * map both to macOS F_PUNCHHOLE on the host fd, with a pwrite-zeros + * fallback for misalignment. + * + * The Linux semantic is "reads in [offset, offset+len) return zero; + * file size unchanged". macOS F_PUNCHHOLE enforces filesystem block + * alignment on both ends and rejects sub-block requests with EINVAL -- + * that one-byte probe (offset=0 len=1) foot's wl_shm pool issues + * surfaces as "fallocate(FALLOC_FL_PUNCH_HOLE) not supported (Invalid + * argument)" otherwise, and foot disables punch-hole for the whole + * session. + * + * Writing zeros over the region produces the same observable result: + * reads return zero, file size unchanged. The disk-space deallocation + * optimisation is lost on the pwrite path, but the probe succeeds, so + * foot keeps punch-hole enabled and the later, properly aligned calls + * (page-sized buffers) still take the F_PUNCHHOLE fast path. + */ + const int kPunchHole = + LINUX_FALLOC_FL_PUNCH_HOLE | LINUX_FALLOC_FL_KEEP_SIZE; + if (mode == kPunchHole) { + struct fpunchhole hole = { + .fp_flags = 0, + .reserved = 0, + .fp_offset = (off_t) offset, + .fp_length = (off_t) len, + }; + if (fcntl(host_ref.fd, F_PUNCHHOLE, &hole) == 0) { + host_fd_ref_close(&host_ref); + return 0; + } + /* EINVAL: misaligned, sub-block, or non-regular file. pwrite zeros + * only through the current EOF so KEEP_SIZE remains guest-visible. + * Any other host errno propagates verbatim. + */ + if (errno != EINVAL) { + host_fd_ref_close(&host_ref); + return linux_errno(); + } + struct stat st; + if (fstat(host_ref.fd, &st) < 0) { + host_fd_ref_close(&host_ref); + return linux_errno(); + } + if (offset >= st.st_size) { + host_fd_ref_close(&host_ref); + return 0; + } + int64_t remaining = st.st_size - offset; + if (remaining > len) + remaining = len; + + static const char zeros[4096]; + off_t cur = (off_t) offset; + while (remaining > 0) { + size_t chunk = remaining > (int64_t) sizeof(zeros) + ? sizeof(zeros) + : (size_t) remaining; + ssize_t nw = pwrite(host_ref.fd, zeros, chunk, cur); + if (nw < 0) { + if (errno == EINTR) + continue; + host_fd_ref_close(&host_ref); + return linux_errno(); + } + if (nw == 0) + break; /* defensive; pwrite on a regular file should not 0 */ + cur += nw; + remaining -= nw; + } + host_fd_ref_close(&host_ref); + return 0; + } + + /* mode 0 = basic allocation -> ftruncate fallback. Anything else + * (collapse range, zero range, insert range, unshare range) stays + * unsupported and surfaces as -EOPNOTSUPP for the guest to handle. */ if (mode != 0) { host_fd_ref_close(&host_ref); diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 415fa2e..b01c84c 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -1485,12 +1485,30 @@ static int hvf_apply_file_overlay_quiesced(guest_t *g, * quiesces siblings around the unmap+remap window so concurrent vCPUs * cannot fault on the temporarily-unmapped IPA range. */ +/* True when the backing fd cannot be written through. The overlay path + * replaces the slab's RW host VA with MAP_SHARED|MAP_FIXED of this fd, and + * Apple HVF refuses hv_vm_map of any permission onto a host VA whose write + * capability does not cover the requested stage-2 perms. A read-only fd + * lands here with the kernel rejecting either PROT_WRITE on the host mmap + * or, after a PROT_READ downgrade, the post-overlay hv_vm_map with + * HV_DENIED. Callers must use the snapshot pread path instead. + */ +static bool overlay_fd_writable(int fd) +{ + int fl = fcntl(fd, F_GETFL); + if (fl < 0) + return true; + return (fl & O_ACCMODE) != O_RDONLY; +} + static int hvf_apply_file_overlay(guest_t *g, uint64_t ipa, uint64_t len, int fd, off_t file_off) { + if (!overlay_fd_writable(fd)) + return -LINUX_EACCES; thread_quiesce_siblings(); int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off); thread_resume_siblings(); @@ -2267,7 +2285,14 @@ int64_t sys_mmap(guest_t *g, * gap-finder advances the hint to the next host-page boundary * after each allocation. */ + /* overlay_fd_writable rejects read-only backing fds inside + * hvf_apply_file_overlay; mirror the check here so a read-only + * mmap takes the snapshot pread path directly, skipping the + * thread_quiesce / segment_split cycle the overlay would + * otherwise perform before returning EACCES. + */ bool overlay_aligned = (flags & LINUX_MAP_SHARED) && + overlay_fd_writable(host_backing_fd) && (result_off % hps == 0) && ((uint64_t) offset % hps == 0); if (overlay_aligned) { @@ -3977,9 +4002,25 @@ int mmap_fork_restore_overlays(guest_t *g, int err = hvf_apply_file_overlay(g, ovl_s, ovl_e - ovl_s, r->backing_fd, (off_t) file_off); if (err < 0) { - log_warn( - "fork-child: overlay re-install [0x%llx, 0x%llx) failed: %d", - (unsigned long long) ovl_s, (unsigned long long) ovl_e, err); + /* -LINUX_EACCES is the writable-fd gate in hvf_apply_file_overlay + * rejecting a read-only backing fd (foot's fontconfig caches, + * shared library file-backed regions, etc.). The fallback to + * snapshot semantics is correct for those: the child reads the + * pre-fork bytes and never writes back, which is what the parent + * already did. Log at debug level so the success path stays + * quiet. Any other failure is unexpected and stays at warn. + */ + if (err == -LINUX_EACCES) + log_debug( + "fork-child: read-only backing fd, skipping overlay " + "[0x%llx, 0x%llx) (snapshot semantics)", + (unsigned long long) ovl_s, (unsigned long long) ovl_e); + else + log_warn( + "fork-child: overlay re-install [0x%llx, 0x%llx) failed: " + "%d", + (unsigned long long) ovl_s, (unsigned long long) ovl_e, + err); rc = err; continue; } diff --git a/src/syscall/poll.c b/src/syscall/poll.c index 705a902..aa65d0b 100644 --- a/src/syscall/poll.c +++ b/src/syscall/poll.c @@ -210,7 +210,7 @@ int64_t sys_ppoll(guest_t *g, poll_timeout_ms < 0 ? 200 : poll_timeout_ms); /* Check for exit_group / futex_interrupt after waking */ - if (proc_exit_group_requested() || futex_interrupt_pending()) { + if (proc_exit_group_requested() || futex_interrupt_consume()) { ret = -1; errno = EINTR; break; @@ -549,7 +549,7 @@ int64_t sys_pselect6(guest_t *g, has_timeout ? &ts : &poll_ts, NULL); } - if (proc_exit_group_requested() || futex_interrupt_pending()) { + if (proc_exit_group_requested() || futex_interrupt_consume()) { ret = -1; errno = EINTR; break; @@ -1030,7 +1030,7 @@ int64_t sys_epoll_pwait(guest_t *g, nready = kevent(epoll_ref.fd, NULL, 0, kevents, cap, has_timeout ? &ts : &poll_ts); - if (proc_exit_group_requested() || futex_interrupt_pending()) { + if (proc_exit_group_requested() || futex_interrupt_consume()) { nready = -1; errno = EINTR; break; diff --git a/src/syscall/signal.c b/src/syscall/signal.c index 5e4c820..50749e7 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -474,6 +474,16 @@ int signal_pending(void) return result; } +bool signal_pending_lockfree(void) +{ + uint64_t hint = + atomic_load_explicit(&sig_pending_hint, memory_order_acquire); + if (hint == 0) + return false; + uint64_t blocked = __atomic_load_n(thread_blocked_ptr(), __ATOMIC_ACQUIRE); + return (hint & ~blocked) != 0; +} + bool signal_attention_needed(void) { /* Cheap atomic load on the sig-pending hint first; if a signal is diff --git a/src/syscall/signal.h b/src/syscall/signal.h index f5792f5..c1848d4 100644 --- a/src/syscall/signal.h +++ b/src/syscall/signal.h @@ -244,6 +244,17 @@ void signal_set_fault_info(int si_code, uint64_t addr, uint64_t esr); int signal_pending(void); bool signal_pending_interruption(bool *restart_out); +/* Lock-free variant: consults the atomic pending hint and this thread's + * blocked mask. Never blocks, never acquires sig_lock. May report a + * stale true (the hint races with rt_sigprocmask), but never a stale + * false: signal_queue updates the hint with SEQ_CST before unlock, and + * the per-thread blocked field is published with RELEASE in + * rt_sigprocmask. Suitable for hot paths that need a yes/no answer + * without risking lock-order inversions (futex bucket lock outranks + * sig_lock). + */ +bool signal_pending_lockfree(void); + /* True if anything that would normally be drained by signal_check_timer is * currently live: an unblocked pending signal, OR any of the three guest * itimers is armed. The shim's identity fast path consults this (indirectly diff --git a/tests/test-futex-pi.c b/tests/test-futex-pi.c index 66b84b7..ad3ebbf 100644 --- a/tests/test-futex-pi.c +++ b/tests/test-futex-pi.c @@ -12,9 +12,12 @@ * 2. FUTEX_LOCK_PI + FUTEX_UNLOCK_PI round-trip: acquire and * release a PI lock from the same thread. * - * 3. futex_wait EINTR injection (commit 18bdd0f): futex_wait with - * no timeout returns -EINTR within ~1-2 seconds when no waker - * exists (simulated periodic signal delivery). + * 3. futex_wait blocks indefinitely without a signal: an earlier + * revision returned synthetic -EINTR after ~1 s of unconditional + * blocking, which violated POSIX sem_wait callers that do not + * retry on EINTR (e.g. foot's render_worker_thread). The wait + * must only return when a real wake arrives or a signal is + * genuinely queued for the thread. * * Syscalls exercised: futex(98), clone(220), gettid(178), exit(93) */ @@ -198,92 +201,77 @@ static void test_pi_dead_owner(void) PASS(); } -/* Test 3: EINTR injection after ~1s */ +/* Test 3: futex_wait without a signal blocks until woken */ -/* Sibling that keeps the guest in a multi-threaded state for the duration of - * the EINTR probe. The synthetic EINTR injection in futex_wait only fires - * while thread_is_single_active() is false; a single-threaded guest must be - * allowed to park in FUTEX_WAIT indefinitely so it does not break glibc - * startup paths. The probe therefore has to run with at least one other guest - * thread alive. - * - * The sibling sleeps on a timed futex_wait against keepalive_word with a - * 5-second timeout. The timeout dodges the EINTR injection ('!has_timeout' is - * what gates the sim), and 5 s is long enough to outlast the worst-case parent - * EINTR window (1 s with up to 100 ms poll jitter, plus a safety margin). After - * the parent's probe returns, the parent flips keepalive_word and wakes the - * sibling. +/* Sibling that waits ~1.2 s, flips the futex word, and issues FUTEX_WAKE on + * the parent's address. Used to drive the parent out of an indefinite + * futex_wait via a real wake (not synthetic EINTR). */ -static volatile int sibling_keepalive __attribute__((aligned(4))) = 1; -static char sibling_stack_buf[8192] __attribute__((aligned(16))); +static volatile int waker_word __attribute__((aligned(4))) = 0; +static char waker_stack_buf[8192] __attribute__((aligned(16))); -static void sibling_alive_thread(void) +static void waker_thread(void) { - struct timespec ts = {5, 0}; - while (__atomic_load_n(&sibling_keepalive, __ATOMIC_SEQ_CST) == 1) { - raw_syscall6(__NR_futex, (long) &sibling_keepalive, - FUTEX_WAIT | FUTEX_PRIVATE, 1, (long) &ts, 0, 0); - } + struct timespec ts = {1, 200 * 1000 * 1000}; + raw_syscall6(__NR_nanosleep, (long) &ts, 0, 0, 0, 0, 0); + + __atomic_store_n(&waker_word, 1, __ATOMIC_SEQ_CST); + raw_futex_wake((int *) &waker_word, 1); raw_exit(0); } static void test_futex_eintr(void) { - TEST("futex_wait EINTR after ~1s"); + TEST("futex_wait blocks until real wake, no synthetic EINTR"); - /* Spawn the sibling so thread_is_single_active() is false during the wait. - * CLONE flags match test_pi_dead_owner. - */ - sibling_keepalive = 1; - void *sibling_top = sibling_stack_buf + sizeof(sibling_stack_buf); - int sibling_tid_val = 0; - long sret = raw_clone(0x7d0f00, sibling_top, &sibling_tid_val, 0, - (int *) &sibling_tid_val); + waker_word = 0; + void *waker_top = waker_stack_buf + sizeof(waker_stack_buf); + int waker_tid_val = 0; + long sret = raw_clone(0x7d0f00, waker_top, &waker_tid_val, 0, + (int *) &waker_tid_val); if (sret < 0) { - FAIL("sibling clone failed"); + FAIL("waker clone failed"); return; } if (sret == 0) { - sibling_alive_thread(); + waker_thread(); raw_exit(1); /* unreachable */ } - /* Create a futex word that no one will wake. - * futex_wait with no timeout should return -EINTR after ~1 second - * (elfuse's simulated periodic signal delivery). - */ - volatile int unwoken = 42; - struct timeval t0, t1; gettimeofday(&t0, NULL); - long r = raw_futex_wait((int *) &unwoken, 42); + /* No timeout. With no signal queued, the wait must NOT return until the + * waker thread issues FUTEX_WAKE; earlier revisions returned synthetic + * -EINTR after ~1 s, which broke glibc sem_wait callers. + */ + long r = raw_futex_wait((int *) &waker_word, 0); gettimeofday(&t1, NULL); long elapsed_ms = (t1.tv_sec - t0.tv_sec) * 1000 + (t1.tv_usec - t0.tv_usec) / 1000; - /* Tear down the sibling now that the EINTR check is done. */ - __atomic_store_n(&sibling_keepalive, 0, __ATOMIC_SEQ_CST); - raw_futex_wake((int *) &sibling_keepalive, 1); + /* Reap the waker. */ for (int i = 0; i < 100; i++) { - if (__atomic_load_n(&sibling_tid_val, __ATOMIC_SEQ_CST) == 0) + if (__atomic_load_n(&waker_tid_val, __ATOMIC_SEQ_CST) == 0) break; usleep(10000); } - /* Expect -EINTR (Linux errno 4) after 800ms-3000ms. - * The 1s timeout has jitter from 100ms polling intervals. + /* The waker sleeps ~1200 ms before waking the parent. Accept either rc==0 + * (woken by FUTEX_WAKE after the wait observed waker_word change) or + * -EAGAIN (woken between the waker_word store and the FUTEX_WAKE; the + * value-mismatch path is the documented race for FUTEX_WAIT). Either + * outcome proves the parent did not bail out on synthetic EINTR. */ - if (r == -4 /* -EINTR */ && elapsed_ms >= 800 && elapsed_ms <= 3000) { + if ((r == 0 || r == -11 /* -EAGAIN */) && elapsed_ms >= 1000 && + elapsed_ms <= 4000) { PASS(); - } else if (r == -4) { - /* Got EINTR but timing seems off; still passing, but note it */ - printf("OK (EINTR after %ldms)\n", elapsed_ms); - passes++; } else { - printf("FAIL: expected -EINTR(-4) got %ld, elapsed %ldms\n", r, - elapsed_ms); + printf( + "FAIL: expected rc=0 or -EAGAIN after ~1200ms, got %ld at " + "%ldms\n", + r, elapsed_ms); fails++; } } diff --git a/tests/test-io-opt.c b/tests/test-io-opt.c index e1691c1..b139a60 100644 --- a/tests/test-io-opt.c +++ b/tests/test-io-opt.c @@ -165,6 +165,37 @@ int main(void) FAIL("open failed"); } + TEST("fallocate punch hole keeps size past EOF"); + { + const char *punch_path = "/tmp/elfuse-test-punch.bin"; + unlink(punch_path); + int fd = open(punch_path, O_CREAT | O_RDWR, 0644); + if (fd >= 0) { + const char data[] = "abc"; + const off_t data_size = (off_t) sizeof(data) - 1; + if (write(fd, data, sizeof(data) - 1) == (ssize_t) data_size && + fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1, + 4096) == 0) { + struct stat st; + char buf[sizeof(data) - 1]; + int stat_ok = fstat(fd, &st) == 0; + ssize_t nr = -1; + if (stat_ok) + nr = pread(fd, buf, sizeof(buf), 0); + if (stat_ok && st.st_size == data_size && + nr == (ssize_t) sizeof(buf) && buf[0] == 'a' && + buf[1] == '\0' && buf[2] == '\0') + PASS(); + else + FAIL("punch hole did not preserve size and zero bytes"); + } else + FAIL("punch hole setup failed"); + close(fd); + unlink(punch_path); + } else + FAIL("open failed"); + } + /* Test copy_file_range (via off_t-based API) */ TEST("copy_file_range"); { From 12096f38594e55351c8fc9a1e9a288646a192f93 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Tue, 9 Jun 2026 09:28:46 +0800 Subject: [PATCH 6/7] Translate macOS ENOATTR/ENODATA to Linux ENODATA A targeted lgetxattr probe surfaced that elfuse's errno mapping table covered every other divergent macOS errno in the 35..102 range but lost the xattr-specific pair: macOS ENOATTR(93) and ENODATA(96) both fell into the linux_errno() default and surfaced as Linux EINVAL(22). fontconfig and glibc treat "attribute not found" as ENODATA(61), so the wrong errno made lgetxattr on a missing attr look like a malformed call and short-circuited their probes. Add a LINUX_ENODATA constant, route both macOS values through it in translate.c (guarded on the alias case so duplicate switch labels do not break the compile when ENOATTR == ENODATA on a given SDK), and add a tests/test-xattr.c that pins five round-trips: lgetxattr on a regular file returns the stored value, lgetxattr on a symlink without its own attr reports ENODATA (not EINVAL), getxattr on the same symlink follows to the target, lgetxattr after lsetxattr installs a symlink-owned attr returns the link value, and lgetxattr on a missing attr reports ENODATA. The dispatch wiring at syscall.c:285 was already in place; only the translation cap was missing. --- src/syscall/abi.h | 1 + src/syscall/translate.c | 16 ++++ tests/manifest.txt | 3 + tests/test-xattr.c | 177 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 197 insertions(+) create mode 100644 tests/test-xattr.c diff --git a/src/syscall/abi.h b/src/syscall/abi.h index c15121a..3aa50ed 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -333,6 +333,7 @@ typedef struct { #define LINUX_EMULTIHOP 72 /* Multihop attempted */ #define LINUX_EILSEQ 84 /* Illegal byte sequence */ #define LINUX_EHOSTDOWN 112 /* Host is down */ +#define LINUX_ENODATA 61 /* No data available (xattr missing, stream) */ /* Linux FD flags. */ #define LINUX_FD_CLOEXEC 1 diff --git a/src/syscall/translate.c b/src/syscall/translate.c index 018690d..7cf2981 100644 --- a/src/syscall/translate.c +++ b/src/syscall/translate.c @@ -93,6 +93,22 @@ int64_t linux_errno(void) #if ENOTSUP != EOPNOTSUPP case ENOTSUP: return -LINUX_EOPNOTSUPP; +#endif + /* macOS xattr "attribute not found" lives at 93 (ENOATTR) on modern + * SDKs; on some versions ENOATTR is a synonym for ENODATA(96). Map + * both to Linux ENODATA(61) so getxattr/lgetxattr/fgetxattr report + * missing attrs correctly. Guarded by #if to avoid duplicate cases + * when the headers alias the two macros. + */ +#ifdef ENOATTR +#if !defined(ENODATA) || ENOATTR != ENODATA + case ENOATTR: + return -LINUX_ENODATA; +#endif +#endif +#ifdef ENODATA + case ENODATA: + return -LINUX_ENODATA; #endif #ifdef ENOTRECOVERABLE case ENOTRECOVERABLE: diff --git a/tests/manifest.txt b/tests/manifest.txt index e9ce10f..b8e779f 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -108,6 +108,9 @@ test-cloexec [section] O_PATH semantics tests test-opath +[section] xattr semantics tests +test-xattr + [section] Guard page / mmap edge cases test-guard-page test-mmap-hint diff --git a/tests/test-xattr.c b/tests/test-xattr.c new file mode 100644 index 0000000..9ee055e --- /dev/null +++ b/tests/test-xattr.c @@ -0,0 +1,177 @@ +/* lgetxattr / getxattr / setxattr semantics tests + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Pins three properties of the elfuse xattr surface that the host + * shim has to translate: + * + * 1. lgetxattr returns the stored value on a regular file. + * 2. lgetxattr on a symlink does not follow the link, so requesting + * an attr stored on the target reports ENODATA. getxattr on the + * same symlink follows and returns the target's value. + * 3. A missing attribute reports ENODATA, not EINVAL. macOS returns + * ENOATTR(93) or its synonym ENODATA(96); both must translate to + * Linux ENODATA(61). + * + * Regression: an earlier revision lacked the ENOATTR translation + * entry, so the default in linux_errno() fell through to EINVAL, + * which masked real "attribute not present" outcomes and broke + * fontconfig / glibc xattr probes. + * + * Syscalls exercised: setxattr(5), lsetxattr(6), getxattr(8), + * lgetxattr(9) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +int passes = 0, fails = 0; + +#define NR_setxattr 5 +#define NR_lsetxattr 6 +#define NR_getxattr 8 +#define NR_lgetxattr 9 + +static long do_setxattr(const char *path, + const char *name, + const void *val, + size_t sz, + int flags) +{ + return syscall(NR_setxattr, path, name, val, sz, flags); +} + +static long do_lsetxattr(const char *path, + const char *name, + const void *val, + size_t sz, + int flags) +{ + return syscall(NR_lsetxattr, path, name, val, sz, flags); +} + +static long do_getxattr(const char *path, + const char *name, + void *out, + size_t cap) +{ + return syscall(NR_getxattr, path, name, out, cap); +} + +static long do_lgetxattr(const char *path, + const char *name, + void *out, + size_t cap) +{ + return syscall(NR_lgetxattr, path, name, out, cap); +} + +static const char tmp_file[] = "/tmp/elfuse-xattr-target"; +static const char tmp_link[] = "/tmp/elfuse-xattr-link"; + +static void setup(void) +{ + unlink(tmp_link); + unlink(tmp_file); + int fd = open(tmp_file, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd < 0) + return; + (void) !write(fd, "hello\n", 6); + close(fd); + symlink(tmp_file, tmp_link); +} + +static void teardown(void) +{ + unlink(tmp_link); + unlink(tmp_file); +} + +static void test_lgetxattr_regular_file(void) +{ + TEST("lgetxattr on regular file returns value"); + const char *attr = "user.elfuse_probe"; + const char *val = "wired"; + if (do_setxattr(tmp_file, attr, val, strlen(val), 0) != 0) { + FAIL("setxattr seed failed"); + return; + } + char buf[64] = {0}; + long r = do_lgetxattr(tmp_file, attr, buf, sizeof(buf)); + EXPECT_TRUE(r == (long) strlen(val) && memcmp(buf, val, strlen(val)) == 0, + "lgetxattr value mismatch"); +} + +static void test_lgetxattr_symlink_no_follow(void) +{ + TEST("lgetxattr on symlink reports ENODATA, not EINVAL"); + char buf[64] = {0}; + errno = 0; + long r = do_lgetxattr(tmp_link, "user.elfuse_probe", buf, sizeof(buf)); + EXPECT_TRUE(r == -1 && errno == ENODATA, + "expected ENODATA from lgetxattr on bare symlink"); +} + +static void test_getxattr_symlink_follows(void) +{ + TEST("getxattr on symlink follows to target"); + char buf[64] = {0}; + long r = do_getxattr(tmp_link, "user.elfuse_probe", buf, sizeof(buf)); + const char *val = "wired"; + EXPECT_TRUE(r == (long) strlen(val) && memcmp(buf, val, strlen(val)) == 0, + "getxattr value mismatch"); +} + +static void test_lgetxattr_symlink_with_attr(void) +{ + TEST("lgetxattr returns symlink-owned attr after lsetxattr"); + const char *attr = "user.elfuse_probe"; + const char *lval = "link-val"; + if (do_lsetxattr(tmp_link, attr, lval, strlen(lval), 0) != 0) { + printf("SKIP (lsetxattr on symlink unsupported: errno=%d)\n", errno); + passes++; + return; + } + char buf[64] = {0}; + long r = do_lgetxattr(tmp_link, attr, buf, sizeof(buf)); + EXPECT_TRUE( + r == (long) strlen(lval) && memcmp(buf, lval, strlen(lval)) == 0, + "lgetxattr did not return symlink-owned value"); +} + +static void test_lgetxattr_missing(void) +{ + TEST("lgetxattr on missing attr reports ENODATA"); + char buf[64] = {0}; + errno = 0; + long r = do_lgetxattr(tmp_file, "user.no_such_attr_xyz", buf, sizeof(buf)); + EXPECT_TRUE(r == -1 && errno == ENODATA, + "expected ENODATA from lgetxattr on missing attr"); +} + +int main(void) +{ + printf("test-xattr: lgetxattr / getxattr / setxattr semantics\n"); + + setup(); + + test_lgetxattr_regular_file(); + test_lgetxattr_symlink_no_follow(); + test_getxattr_symlink_follows(); + test_lgetxattr_symlink_with_attr(); + test_lgetxattr_missing(); + + teardown(); + + SUMMARY("test-xattr"); + return fails == 0 ? 0 : 1; +} From f5c640f9d3da08f1d4db1867c23918b2438f8b9c Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Tue, 9 Jun 2026 10:12:51 +0800 Subject: [PATCH 7/7] Refine futex EINTR gate futex.c: signal_pending_lockfree consults the atomic sig_pending_hint without confirming under sig_lock, which the helper itself documented as a stale-true source -- rt_sigprocmask masking a queued signal does not update the hint, so the next reader sees pending && unblocked and synthesizes -EINTR even though signal_pending() (the slow-path confirm) would have reported no deliverable signal. Both futex_wait call sites already release the bucket lock before consulting signal state (the OS-sync path holds no bucket lock at all; the bucket path explicitly drops b->lock around the signal check and re-checks waiter.woken after re-acquiring), so the slow-path call is safe at both sites without lock-order risk. Route both through signal_pending() and delete the lockfree variant. --- src/runtime/futex.c | 22 ++++++++++++++-------- src/syscall/mem.c | 32 ++++++++++++++++++-------------- src/syscall/signal.c | 10 ---------- src/syscall/signal.h | 11 ----------- 4 files changed, 32 insertions(+), 43 deletions(-) diff --git a/src/runtime/futex.c b/src/runtime/futex.c index c167c53..4351cb2 100644 --- a/src/runtime/futex.c +++ b/src/runtime/futex.c @@ -104,8 +104,10 @@ static _Atomic int futex_interrupt_requested = 0; * * The wait quantum is capped at 100 ms so proc_exit_group_requested() and * futex_interrupt_pending() get noticed promptly without a process-wide - * broadcast channel. EINTR is only synthesized when an actual deliverable - * signal is queued for this thread (signal_pending_lockfree) or when a guest + * broadcast channel. EINTR is only returned when an actual deliverable + * signal is queued for this thread (confirmed under sig_lock via + * signal_pending(), not the atomic hint, so that rt_sigprocmask masking the + * queued signal cannot leave a stale-true edge behind), or when a guest * itimer expires under the poll loop's signal_check_timer poke. Earlier * revisions returned -EINTR after one unconditional second of waiting to * unblock shutdown-stalled multi-threaded runtimes, but that broke POSIX @@ -455,9 +457,11 @@ static int64_t futex_os_sync_wait(guest_t *g, /* Return EINTR only when a real deliverable signal is queued for * this thread. POSIX callers (e.g. glibc sem_wait, foot's render * worker) often do not retry on EINTR, so synthetic spurious - * wakeups cannot be issued here. + * wakeups cannot be issued here. signal_pending() confirms under + * sig_lock so the atomic hint cannot produce a stale-true edge + * after rt_sigprocmask masked the queued signal. */ - if (signal_pending_lockfree()) + if (signal_pending()) return -LINUX_EINTR; /* For has_timeout: futex_remaining_ns returns 0 next iteration once * the user deadline elapses, so the loop exits with -ETIMEDOUT. @@ -565,13 +569,15 @@ static int64_t futex_wait(guest_t *g, /* Lock-order: bucket lock(7) outranks sig_lock(4), so signal_pending() * and signal_check_timer() may only be called once the bucket lock has - * been released. Drop it, poke the itimers, observe queued signals, - * then re-acquire and re-check waiter.woken in case a wake landed in - * the window. + * been released. Drop it, poke the itimers, observe queued signals + * under sig_lock (the slow-path confirm avoids the stale-true edge + * that the atomic hint can carry after rt_sigprocmask masks the + * queued signal), then re-acquire and re-check waiter.woken in case + * a wake landed in the window. */ pthread_mutex_unlock(&b->lock); signal_check_timer(); - bool sig_ready = signal_pending_lockfree(); + bool sig_ready = signal_pending() != 0; pthread_mutex_lock(&b->lock); if (__atomic_load_n(&waiter.woken, __ATOMIC_ACQUIRE)) diff --git a/src/syscall/mem.c b/src/syscall/mem.c index b01c84c..b344816 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -1478,20 +1478,17 @@ static int hvf_apply_file_overlay_quiesced(guest_t *g, return 0; } -/* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd, - * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB - * segment is split out first if it is not already isolated. Caller - * holds mmap_lock and has not quiesced siblings yet. The function - * quiesces siblings around the unmap+remap window so concurrent vCPUs - * cannot fault on the temporarily-unmapped IPA range. - */ -/* True when the backing fd cannot be written through. The overlay path - * replaces the slab's RW host VA with MAP_SHARED|MAP_FIXED of this fd, and - * Apple HVF refuses hv_vm_map of any permission onto a host VA whose write - * capability does not cover the requested stage-2 perms. A read-only fd - * lands here with the kernel rejecting either PROT_WRITE on the host mmap - * or, after a PROT_READ downgrade, the post-overlay hv_vm_map with - * HV_DENIED. Callers must use the snapshot pread path instead. +/* True when the backing fd allows writes through. The overlay path replaces + * the slab's RW host VA with MAP_SHARED|MAP_FIXED of this fd, and Apple HVF + * refuses hv_vm_map of any permission onto a host VA whose write capability + * does not cover the requested stage-2 perms. A read-only fd lands there + * with the kernel rejecting either PROT_WRITE on the host mmap or, after a + * PROT_READ downgrade, the post-overlay hv_vm_map with HV_DENIED. + * Centralises that decision: both the overlay entry (hvf_apply_file_overlay) + * and the sys_mmap fast-path skip share this gate so read-only backers are + * routed straight to the snapshot pread path. Returns true on the optimistic + * path when fcntl itself fails: the subsequent mmap / hv_vm_map will surface + * the real error rather than this helper synthesising one. */ static bool overlay_fd_writable(int fd) { @@ -1501,6 +1498,13 @@ static bool overlay_fd_writable(int fd) return (fl & O_ACCMODE) != O_RDONLY; } +/* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd, + * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB + * segment is split out first if it is not already isolated. Caller + * holds mmap_lock and has not quiesced siblings yet. The function + * quiesces siblings around the unmap+remap window so concurrent vCPUs + * cannot fault on the temporarily-unmapped IPA range. + */ static int hvf_apply_file_overlay(guest_t *g, uint64_t ipa, uint64_t len, diff --git a/src/syscall/signal.c b/src/syscall/signal.c index 50749e7..5e4c820 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -474,16 +474,6 @@ int signal_pending(void) return result; } -bool signal_pending_lockfree(void) -{ - uint64_t hint = - atomic_load_explicit(&sig_pending_hint, memory_order_acquire); - if (hint == 0) - return false; - uint64_t blocked = __atomic_load_n(thread_blocked_ptr(), __ATOMIC_ACQUIRE); - return (hint & ~blocked) != 0; -} - bool signal_attention_needed(void) { /* Cheap atomic load on the sig-pending hint first; if a signal is diff --git a/src/syscall/signal.h b/src/syscall/signal.h index c1848d4..f5792f5 100644 --- a/src/syscall/signal.h +++ b/src/syscall/signal.h @@ -244,17 +244,6 @@ void signal_set_fault_info(int si_code, uint64_t addr, uint64_t esr); int signal_pending(void); bool signal_pending_interruption(bool *restart_out); -/* Lock-free variant: consults the atomic pending hint and this thread's - * blocked mask. Never blocks, never acquires sig_lock. May report a - * stale true (the hint races with rt_sigprocmask), but never a stale - * false: signal_queue updates the hint with SEQ_CST before unlock, and - * the per-thread blocked field is published with RELEASE in - * rt_sigprocmask. Suitable for hot paths that need a yes/no answer - * without risking lock-order inversions (futex bucket lock outranks - * sig_lock). - */ -bool signal_pending_lockfree(void); - /* True if anything that would normally be drained by signal_check_timer is * currently live: an unblocked pending signal, OR any of the three guest * itimers is armed. The shim's identity fast path consults this (indirectly