diff --git a/src/core/guest.c b/src/core/guest.c index fca75b3..b5afcbf 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -42,7 +42,10 @@ #include "core/startup-trace.h" #include "debug/log.h" #include "utils.h" +#include "runtime/futex.h" /* futex_interrupt_request */ #include "runtime/thread.h" /* thread_destroy_all_vcpus */ +#include "syscall/poll.h" /* wakeup_pipe_signal */ +#include "syscall/proc.h" /* proc_request_exit_group */ /* Per-vCPU pending TLBI request. Zero-initialized in every host pthread * by virtue of TLS default-zeroing, which maps to TLBI_NONE. @@ -575,9 +578,34 @@ static void release_extra_mappings(guest_t *g) void guest_destroy(guest_t *g) { - /* Destroy all worker vCPUs (thread table) before tearing down the VM. - * This prevents hv_vm_destroy from racing with active vCPUs that may still - * be running if thread join timed out during exit_group. + /* Quiesce worker vCPUs before unmapping stage-2. thread_destroy_all_vcpus + * only releases vCPU handles; it does not wait for the owning pthread to + * leave hv_vcpu_run. A worker still inside the guest at unmap time takes + * a stage-2 translation fault on its next instruction fetch and surfaces + * as "unexpected exception EC=0x20" in the crash report. PR #89's foot + * reproduction tripped exactly that race. The exit_group syscall handler + * already runs request, interrupt, and join before its own teardown; the + * destroy path needs the same prefix because forkipc.c:vcpu_run_loop + * returns straight into guest_destroy without going through the guest + * exit_group handler. The request is guarded on the prior state so a + * process that already chose its exit code keeps it intact. + * + * The wake signals cover workers blocked outside hv_vcpu_run: futex + * waiters poll futex_interrupt_requested, and any thread parked in + * epoll or poll wakes off the shared pipe. Without them, host-blocked + * workers miss the hv_vcpus_exit kick (which only affects threads + * inside hv_vcpu_run) and the 100ms join cap in thread_join_workers + * detaches them, leaving live pthreads to crash on the imminent munmap. + */ + if (!proc_exit_group_requested()) + proc_request_exit_group(0); + futex_interrupt_request(); + wakeup_pipe_signal(); + thread_interrupt_all(); + thread_join_workers(); + /* Destroy all remaining worker vCPUs (thread table) before tearing down + * the VM. This prevents hv_vm_destroy from racing with active vCPUs that + * may still be running if thread join timed out during exit_group. */ thread_destroy_all_vcpus(); if (g->vcpu) { diff --git a/src/debug/crashreport.c b/src/debug/crashreport.c index 2c4b789..a5d5ef5 100644 --- a/src/debug/crashreport.c +++ b/src/debug/crashreport.c @@ -19,6 +19,19 @@ #include "syscall/proc.h" +/* Page-table descriptor bit definitions used by the diagnostic walker below. + * The full set lives in src/core/guest.c next to the helpers that build + * descriptors; the crash walker needs only the fields it prints, so keep the + * duplication minimal rather than promoting a private header. + */ +#define CR_PT_VALID 1ULL +#define CR_PT_TABLE 2ULL +#define CR_PT_ADDR_MASK 0xFFFFFFFFF000ULL +#define CR_L2_BLOCK_ADDR_MASK 0xFFFFFFE00000ULL +#define CR_BLOCK_1GIB (1024ULL * 1024 * 1024) +#define CR_BLOCK_2MIB (2ULL * 1024 * 1024) +#define CR_PAGE_SIZE 4096ULL + /* Read a sysctl string into buf (NUL-terminated). Returns 0 on success. */ static int sysctl_str(const char *name, char *buf, size_t bufsz) { @@ -111,6 +124,201 @@ static const char *esr_ec_name(uint64_t esr) } } +static bool esr_is_data_abort(uint64_t esr) +{ + unsigned ec = (unsigned) ((esr >> 26) & 0x3f); + return ec == 0x24 || ec == 0x25; +} + +/* Walk the guest stage-1 page tables for `va` and print L0/L1/L2/L3 entries + * in raw form so the crash report localises an "unmapped" claim either to the + * guest PT (entry is 0 or PT_VALID clear) or to a downstream stage-2 hole. + * The walker mirrors gva_translate_perm in src/core/guest.c but accepts any + * PT_VALID descriptor instead of enforcing requested permissions, so a + * non-executable or EL1-only page still prints with its actual contents. + * Silently skips when g, host_base, or ttbr0 are missing -- the data needed + * for the dump is gone before the walker can stage anything useful. + */ +static bool dump_pt_walk_for_va(const guest_t *g, + uint64_t va, + uint64_t *ipa_out) +{ + if (!g || !g->host_base || !g->ttbr0) + return false; + + uint64_t base = g->ipa_base; + if (g->ttbr0 < base || g->ttbr0 - base >= g->guest_size) { + fprintf(stderr, " PT walk: TTBR0 0x%llx out of slab range\n", + (unsigned long long) g->ttbr0); + return false; + } + uint64_t l0_off = g->ttbr0 - base; + const uint64_t *l0 = + (const uint64_t *) ((const uint8_t *) g->host_base + l0_off); + unsigned l0_idx = (unsigned) (va / (512ULL * CR_BLOCK_1GIB)); + if (l0_idx >= 512) { + fprintf(stderr, " PT walk: L0 index %u out of range\n", l0_idx); + return false; + } + uint64_t l0_entry = l0[l0_idx]; + fprintf(stderr, " L0[%u]=0x%llx", l0_idx, (unsigned long long) l0_entry); + if (!(l0_entry & CR_PT_VALID)) { + fprintf(stderr, " INVALID\n"); + return false; + } + + uint64_t l1_ipa = l0_entry & CR_PT_ADDR_MASK; + if (l1_ipa < base || l1_ipa - base >= g->guest_size) { + fprintf(stderr, " L1@0x%llx out-of-slab\n", + (unsigned long long) l1_ipa); + return false; + } + const uint64_t *l1 = + (const uint64_t *) ((const uint8_t *) g->host_base + (l1_ipa - base)); + unsigned l1_idx = (unsigned) ((va / CR_BLOCK_1GIB) % 512); + uint64_t l1_entry = l1[l1_idx]; + fprintf(stderr, " L1[%u]=0x%llx", l1_idx, (unsigned long long) l1_entry); + if (!(l1_entry & CR_PT_VALID)) { + fprintf(stderr, " INVALID\n"); + return false; + } + + uint64_t l2_ipa = l1_entry & CR_PT_ADDR_MASK; + if (l2_ipa < base || l2_ipa - base >= g->guest_size) { + fprintf(stderr, " L2@0x%llx out-of-slab\n", + (unsigned long long) l2_ipa); + return false; + } + const uint64_t *l2 = + (const uint64_t *) ((const uint8_t *) g->host_base + (l2_ipa - base)); + unsigned l2_idx = (unsigned) ((va / CR_BLOCK_2MIB) % 512); + uint64_t l2_entry = l2[l2_idx]; + fprintf(stderr, " L2[%u]=0x%llx", l2_idx, (unsigned long long) l2_entry); + if (!(l2_entry & CR_PT_VALID)) { + fprintf(stderr, " INVALID\n"); + return false; + } + + /* A valid L2 entry is either a 2MiB block (bit1=0) or a table descriptor + * pointing at a 4KiB L3 page. Only the table case requires another walk. + */ + if (!(l2_entry & CR_PT_TABLE)) { + uint64_t block_ipa = l2_entry & CR_L2_BLOCK_ADDR_MASK; + uint64_t translated_ipa = block_ipa + (va & (CR_BLOCK_2MIB - 1)); + fprintf(stderr, " (2MiB block -> IPA 0x%llx)\n", + (unsigned long long) translated_ipa); + if (ipa_out) + *ipa_out = translated_ipa; + return true; + } + + uint64_t l3_ipa = l2_entry & CR_PT_ADDR_MASK; + if (l3_ipa < base || l3_ipa - base >= g->guest_size) { + fprintf(stderr, " L3@0x%llx out-of-slab\n", + (unsigned long long) l3_ipa); + return false; + } + const uint64_t *l3 = + (const uint64_t *) ((const uint8_t *) g->host_base + (l3_ipa - base)); + unsigned l3_idx = (unsigned) ((va / CR_PAGE_SIZE) % 512); + uint64_t l3_entry = l3[l3_idx]; + if (!(l3_entry & CR_PT_VALID)) { + fprintf(stderr, " L3[%u]=0x%llx INVALID\n", l3_idx, + (unsigned long long) l3_entry); + return false; + } + + uint64_t page_ipa = l3_entry & CR_PT_ADDR_MASK; + uint64_t translated_ipa = page_ipa + (va & (CR_PAGE_SIZE - 1)); + fprintf(stderr, " L3[%u]=0x%llx -> IPA 0x%llx\n", l3_idx, + (unsigned long long) l3_entry, (unsigned long long) translated_ipa); + if (ipa_out) + *ipa_out = translated_ipa; + return true; +} + +/* Print the HVF stage-2 backing range covering `ipa` plus the region-tracker + * entry whose VA range includes the same address. A missing stage-2 backing is + * the downstream analogue of an INVALID L3 entry above; a missing region + * tracker is normal for non-tracked ranges (vDSO, shim) but useful for "is + * this VA in a known ELF segment" reasoning. + */ +static void dump_segment_and_region_for(const guest_t *g, + uint64_t va, + bool have_ipa, + uint64_t ipa) +{ + if (!g) + return; + + if (have_ipa) { + bool found_seg = false; + for (int i = 0; i < g->n_segments; i++) { + const hvf_segment_t *s = &g->segments[i]; + if (ipa >= s->ipa && ipa < s->ipa + s->len) { + fprintf(stderr, + " HVF segment[%d]: ipa=0x%llx len=0x%llx " + "(covers translated IPA 0x%llx)\n", + i, (unsigned long long) s->ipa, + (unsigned long long) s->len, (unsigned long long) ipa); + found_seg = true; + break; + } + } + if (!found_seg) { + const guest_mapping_t *m = guest_find_mapping(g, ipa); + if (m) { + fprintf(stderr, + " HVF mapping: gpa=0x%llx size=0x%zx " + "(covers translated IPA 0x%llx)\n", + (unsigned long long) m->gpa, m->size, + (unsigned long long) ipa); + found_seg = true; + } + } + if (!found_seg) { + const guest_overflow_t *o = guest_find_overflow(g, ipa); + if (o) { + fprintf(stderr, + " HVF overflow: ipa=0x%llx size=0x%llx " + "(covers translated IPA 0x%llx)\n", + (unsigned long long) o->ipa_start, + (unsigned long long) o->size, (unsigned long long) ipa); + found_seg = true; + } + } + if (!found_seg) + fprintf(stderr, + " HVF backing: NONE (stage-2 hole for IPA 0x%llx)\n", + (unsigned long long) ipa); + } else { + fprintf(stderr, + " HVF backing: not checked (no valid stage-1 translation)\n"); + } + + const guest_region_t *r = guest_region_find(g, va); + if (r) { + fprintf(stderr, " region: [0x%llx, 0x%llx) prot=%c%c%c name=\"%s\"\n", + (unsigned long long) r->start, (unsigned long long) r->end, + (r->prot & 1) ? 'r' : '-', (r->prot & 2) ? 'w' : '-', + (r->prot & 4) ? 'x' : '-', r->name[0] ? r->name : "(unnamed)"); + } else { + fprintf(stderr, " region: NONE\n"); + } +} + +static void dump_translation_diagnostics_for(const guest_t *g, + const char *label, + uint64_t va) +{ + fprintf(stderr, "## Translation diagnostics (%s=0x%llx)\n", label, + (unsigned long long) va); + uint64_t ipa = 0; + bool have_ipa = dump_pt_walk_for_va(g, va, &ipa); + dump_segment_and_region_for(g, va, have_ipa, ipa); + fprintf(stderr, "\n"); +} + void crash_report(hv_vcpu_t vcpu, const guest_t *g, crash_type_t type, @@ -247,7 +455,28 @@ void crash_report(hv_vcpu_t vcpu, fprintf(stderr, "interp_base = 0x%llx mmap_limit = 0x%llx\n", (unsigned long long) g->interp_base, (unsigned long long) g->mmap_limit); - fprintf(stderr, "nregions = %d\n\n", g->nregions); + fprintf(stderr, "nregions = %d hvf_segments = %d\n", g->nregions, + g->n_segments); + fprintf(stderr, "ttbr0 = 0x%llx\n\n", + (unsigned long long) g->ttbr0); + + /* Translation diagnostics for the faulting PC. Helps decide whether + * an inst-abort or data-abort came from a stage-1 PT entry that went + * away (L3 INVALID after an unrelated mprotect / munmap) or from a + * stage-2 backing hole (for example, an hvf_segment_split that left no + * covering entry). Data aborts also dump FAR when it differs from PC + * because FAR is the actual load/store fault address. + */ + if (vcpu) { + uint64_t pc = 0, esr = 0, far_reg = 0; + hv_vcpu_get_reg(vcpu, HV_REG_PC, &pc); + hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_ESR_EL1, &esr); + hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_FAR_EL1, &far_reg); + if (pc) + dump_translation_diagnostics_for(g, "PC", pc); + if (esr_is_data_abort(esr) && far_reg && far_reg != pc) + dump_translation_diagnostics_for(g, "FAR", far_reg); + } } fprintf(stderr, diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index 1c31bc0..3b2847a 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -17,6 +17,7 @@ #include "utils.h" #include "runtime/fork-state.h" +#include "runtime/procemu.h" #include "debug/log.h" #include "syscall/abi.h" @@ -447,6 +448,156 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g) return 0; } +/* Wire payload for one pty keepalive. The slave fd travels separately via + * SCM_RIGHTS; the parent's master_host_fd is intentionally omitted because the + * child's number will differ. The child re-derives it from fd_table[gfd]. + */ +typedef struct { + int32_t guest_fd; + uint32_t linux_pts_num; + char slave_path[64]; +} ipc_pty_keepalive_t; + +int fork_ipc_send_pty_keepalives(int ipc_sock) +{ + /* PTY_KEEPALIVE_MAX upper bound on entries; allocate to that. */ + proc_pty_ipc_entry_t snapshot[256]; + int snapshot_slave_fds[256]; + int num_snap = proc_pty_snapshot_keepalive(snapshot, snapshot_slave_fds, + ARRAY_SIZE(snapshot)); + + /* Match each keepalive's master_host_fd against a live fd_table entry to + * recover the guest_fd, which is the stable identifier across fork. + */ + ipc_pty_keepalive_t payload[256]; + int payload_slave_fds[256]; + uint32_t num_send = 0; + + pthread_mutex_lock(&fd_lock); + for (int i = 0; i < num_snap; i++) { + int matched_gfd = -1; + for (int gfd = 0; gfd < FD_TABLE_SIZE; gfd++) { + if (fd_table[gfd].type == FD_CLOSED) + continue; + if (fd_table[gfd].host_fd != snapshot[i].master_host_fd) + continue; + if (fd_type_is_synthetic(fd_table[gfd].type)) + continue; + matched_gfd = gfd; + break; + } + + if (matched_gfd < 0) { + /* Master was closed between snapshot and lookup, or never tracked + * in the guest fd table (defensive). Drop the duped slave; nothing + * else holds it. + */ + close(snapshot_slave_fds[i]); + continue; + } + + payload[num_send].guest_fd = matched_gfd; + payload[num_send].linux_pts_num = snapshot[i].linux_pts_num; + _Static_assert( + sizeof(payload[0].slave_path) == sizeof(snapshot[0].slave_path), + "keepalive slave_path size must match payload"); + memcpy(payload[num_send].slave_path, snapshot[i].slave_path, + sizeof(payload[0].slave_path)); + payload_slave_fds[num_send] = snapshot_slave_fds[i]; + num_send++; + } + pthread_mutex_unlock(&fd_lock); + + int rc = 0; + if (fork_ipc_write_all(ipc_sock, &num_send, sizeof(num_send)) < 0) { + rc = -1; + } else if (num_send > 0) { + if (fork_ipc_write_all(ipc_sock, payload, + num_send * sizeof(payload[0])) < 0) { + rc = -1; + } else if (fork_ipc_send_fds(ipc_sock, payload_slave_fds, + (int) num_send) < 0) { + log_error("clone: failed to send pty keepalive fds"); + rc = -1; + } + } + for (uint32_t i = 0; i < num_send; i++) + close(payload_slave_fds[i]); + return rc; +} + +int fork_ipc_recv_pty_keepalives(int ipc_fd) +{ + uint32_t num; + if (fork_ipc_read_all(ipc_fd, &num, sizeof(num)) < 0) { + log_error("fork-child: failed to read pty keepalive count"); + return -1; + } + if (num == 0) + return 0; + if (num > FD_TABLE_SIZE) { + log_error("fork-child: pty keepalive count %u exceeds FD_TABLE_SIZE", + num); + return -1; + } + + ipc_pty_keepalive_t *payload = calloc(num, sizeof(*payload)); + if (!payload) + return -1; + + if (fork_ipc_read_all(ipc_fd, payload, num * sizeof(*payload)) < 0) { + free(payload); + return -1; + } + + int *slave_fds = calloc(num, sizeof(int)); + if (!slave_fds) { + free(payload); + return -1; + } + int got = 0; + if (fork_ipc_recv_fds(ipc_fd, slave_fds, (int) num, &got) < 0 || + got != (int) num) { + log_error("fork-child: pty keepalive recv mismatch: got %d expected %u", + got, num); + for (int i = 0; i < got; i++) + close(slave_fds[i]); + free(slave_fds); + free(payload); + return -1; + } + + for (uint32_t i = 0; i < num; i++) { + int gfd = payload[i].guest_fd; + int child_master = -1; + if (RANGE_CHECK(gfd, 0, FD_TABLE_SIZE)) { + pthread_mutex_lock(&fd_lock); + if (fd_table[gfd].type != FD_CLOSED) + child_master = fd_table[gfd].host_fd; + pthread_mutex_unlock(&fd_lock); + } + if (child_master < 0) { + /* Master fd did not survive the fd_table batch (synthetic-type + * filter, or the slot was rejected). Drop the keepalive cleanly. + */ + close(slave_fds[i]); + continue; + } + + /* Force-NUL the path before passing it on so a malformed sender cannot + * trick the child into reading past the buffer. + */ + payload[i].slave_path[sizeof(payload[i].slave_path) - 1] = '\0'; + proc_pty_restore_keepalive(child_master, slave_fds[i], + payload[i].linux_pts_num, + payload[i].slave_path); + } + + free(slave_fds); + free(payload); + return 0; +} + static int fork_ipc_send_backing_fds(int ipc_sock, const guest_region_t *regions_snapshot, uint32_t num_guest_regions) @@ -714,12 +865,14 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig) log_error("fork-child: failed to read region count"); return -1; } + uint8_t regions_tracker_stale = 0; if (fork_ipc_read_all(ipc_fd, ®ions_tracker_stale, sizeof(regions_tracker_stale)) < 0) { log_error("fork-child: failed to read region tracker state"); return -1; } + uint32_t recv_regions = num_guest_regions; if (recv_regions > GUEST_MAX_REGIONS) recv_regions = GUEST_MAX_REGIONS; @@ -729,15 +882,17 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig) log_error("fork-child: failed to read regions"); return -1; } + /* Drain any excess records the parent serialized beyond the local cap. * Without this drain, the next read (num_preannounced) consumes stale - * region bytes and desynchronizes the rest of the IPC payload. Mirrors - * the preannounced-region drain below. + * region bytes and desynchronizes the rest of the IPC payload. Mirrors the + * preannounced-region drain below. */ if (num_guest_regions > recv_regions && fork_ipc_drain_bytes(ipc_fd, (num_guest_regions - recv_regions) * sizeof(guest_region_t)) < 0) return -1; + g->nregions = (int) recv_regions; g->regions_tracker_stale = (regions_tracker_stale != 0) || (num_guest_regions > recv_regions); diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h index e9de0bc..27c2080 100644 --- a/src/runtime/fork-state.h +++ b/src/runtime/fork-state.h @@ -99,6 +99,17 @@ int fork_ipc_recv_memory_regions(int ipc_fd, guest_t *g); int fork_ipc_send_fd_table(int ipc_sock); int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g); +/* Carry the /dev/ptmx keepalive slave fds across the fork boundary. The fd + * table batch sends master fds without their hidden keepalive companions, so + * a child that inherits a master would otherwise hit the macOS ENOTTY / + * winsize-reset cliff that proc_pty_close_keepalive papers over in the + * parent. send/recv must run AFTER fork_ipc_send_fd_table / _recv_fd_table + * so the child can look up its new master host fd from the just-installed + * fd_table entry. + */ +int fork_ipc_send_pty_keepalives(int ipc_sock); +int fork_ipc_recv_pty_keepalives(int ipc_fd); + int fork_ipc_send_process_state(int ipc_sock, const guest_region_t *regions_snapshot, uint32_t num_guest_regions, diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index aef1d24..7b89dcc 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -261,6 +261,15 @@ int fork_child_main(int ipc_fd, return 1; } + /* Must follow fork_ipc_recv_fd_table: the keepalive recv resolves each + * payload guest_fd to its (now installed) child-side host master fd. + */ + if (fork_ipc_recv_pty_keepalives(ipc_fd) < 0) { + log_error("fork-child: failed to receive pty keepalives"); + guest_destroy(&g); + return 1; + } + signal_state_t sig; if (fork_ipc_recv_process_state(ipc_fd, &g, &sig) < 0) { log_error("fork-child: failed to receive process state"); @@ -1598,6 +1607,15 @@ int64_t sys_clone(hv_vcpu_t vcpu, goto fail_snapshot; } + /* Must follow fork_ipc_send_fd_table because the keepalive payload + * carries a guest_fd that the child resolves through its just-installed + * fd_table to recover the child-side master host fd. + */ + if (fork_ipc_send_pty_keepalives(ipc_sock) < 0) { + log_error("clone: failed to send pty keepalives"); + goto fail_snapshot; + } + uint32_t num_guest_regions = (uint32_t) nregions_snapshot; uint32_t num_preannounced = (uint32_t) npreannounced_snapshot; if (fork_ipc_send_process_state( diff --git a/src/runtime/futex.c b/src/runtime/futex.c index 683d95c..4351cb2 100644 --- a/src/runtime/futex.c +++ b/src/runtime/futex.c @@ -34,6 +34,7 @@ #include "syscall/abi.h" #include "syscall/proc.h" +#include "syscall/signal.h" #include "debug/log.h" @@ -103,10 +104,14 @@ static _Atomic int futex_interrupt_requested = 0; * * The wait quantum is capped at 100 ms so proc_exit_group_requested() and * futex_interrupt_pending() get noticed promptly without a process-wide - * broadcast channel. The 1-second EINTR simulation that the bucket path uses - * for shutdown-stalled multi-threaded runtimes is preserved here, but only - * once more than one guest thread is active. Single-threaded guests should not - * see synthetic EINTR churn on indefinite waits. + * broadcast channel. EINTR is only returned when an actual deliverable + * signal is queued for this thread (confirmed under sig_lock via + * signal_pending(), not the atomic hint, so that rt_sigprocmask masking the + * queued signal cannot leave a stale-true edge behind), or when a guest + * itimer expires under the poll loop's signal_check_timer poke. Earlier + * revisions returned -EINTR after one unconditional second of waiting to + * unblock shutdown-stalled multi-threaded runtimes, but that broke POSIX + * sem_wait callers that do not retry on EINTR (e.g. foot's render worker). */ #if ELFUSE_HAVE_OS_SYNC_WAIT_ON_ADDRESS static bool os_sync_available; @@ -114,12 +119,6 @@ static bool os_sync_wait_enabled; #endif #define FUTEX_OS_SYNC_POLL_CAP_NS (100ULL * 1000 * 1000) -#define FUTEX_OS_SYNC_EINTR_SIM_MS 1000 - -static inline bool futex_should_simulate_periodic_eintr(void) -{ - return !thread_is_single_active(); -} /* Hash table */ @@ -222,6 +221,22 @@ int futex_interrupt_pending(void) return atomic_load(&futex_interrupt_requested); } +/* Test-and-clear: returns 1 if the interrupt request was pending and atomically + * clears it, 0 otherwise. The interrupt is a one-shot edge: forkipc.c sets it + * when the last clone-thread exits so the main thread observes EINTR in its + * next blocking wait, mirroring how real Linux delivers SIGCHLD. Without the + * clear, the flag stays set and every subsequent epoll_pwait, ppoll, futex + * wait, etc. spins on EINTR until execve clears it -- in foot's case it never + * does, and the spinning main thread eventually faults in a code path the + * guest never expects to reach. + */ +int futex_interrupt_consume(void) +{ + int expected = 1; + return atomic_compare_exchange_strong(&futex_interrupt_requested, &expected, + 0); +} + /* Cap on guest-supplied tv_sec. The cap exists purely so the int64_t / time_t * arithmetic in the deadline conversion (now.tv_sec + delta_sec, where * delta_sec = lts.tv_sec - mono.tv_sec) cannot overflow even for adversarial @@ -392,19 +407,13 @@ static int64_t futex_os_sync_wait(guest_t *g, if (current != expected) return -LINUX_EAGAIN; - struct timeval wait_start; - bool simulate_periodic_eintr = - !has_timeout && futex_should_simulate_periodic_eintr(); - if (simulate_periodic_eintr) - gettimeofday(&wait_start, NULL); - /* Bound consecutive EFAULT retries. Apple documents EFAULT as transient * (kernel copyin failure under memory pressure), so a few retries are fine; * but a genuinely bad page would otherwise cause the loop to spin with no * real sleep -- timeout_ns is supplied to syscall that returns immediately - * -- until either the user deadline or the 1-second EINTR simulation - * finally bails out. Surface EFAULT to the guest after this many - * back-to-back failures so the host CPU does not burn for ~1 s. + * -- until the user deadline finally bails out. Surface EFAULT to the + * guest after this many back-to-back failures so the host CPU does not + * burn for ~1 s. */ int efault_retries = 0; @@ -436,17 +445,24 @@ static int64_t futex_os_sync_wait(guest_t *g, efault_retries = 0; } - if (proc_exit_group_requested() || futex_interrupt_pending()) + if (proc_exit_group_requested() || futex_interrupt_consume()) return -LINUX_EINTR; - if (simulate_periodic_eintr) { - struct timeval now; - gettimeofday(&now, NULL); - long elapsed_ms = (now.tv_sec - wait_start.tv_sec) * 1000 + - (now.tv_usec - wait_start.tv_usec) / 1000; - if (elapsed_ms >= FUTEX_OS_SYNC_EINTR_SIM_MS) - return -LINUX_EINTR; - } + /* Drain any expired guest itimer so its SIGALRM / SIGVTALRM / SIGPROF + * queues into sig_state.pending; without this poke, a guest with all + * threads parked in futex_wait would never advance the timers. + */ + signal_check_timer(); + + /* Return EINTR only when a real deliverable signal is queued for + * this thread. POSIX callers (e.g. glibc sem_wait, foot's render + * worker) often do not retry on EINTR, so synthetic spurious + * wakeups cannot be issued here. signal_pending() confirms under + * sig_lock so the atomic hint cannot produce a stale-true edge + * after rt_sigprocmask masked the queued signal. + */ + if (signal_pending()) + return -LINUX_EINTR; /* For has_timeout: futex_remaining_ns returns 0 next iteration once * the user deadline elapses, so the loop exits with -ETIMEDOUT. */ @@ -528,22 +544,6 @@ static int64_t futex_wait(guest_t *g, /* Wait until woken or timeout */ int ret = 0; - /* Record start time for the no-timeout path. On real Linux, any pending - * signal interrupts futex_wait with -EINTR. Without a timer signal - * (SIGVTALRM from timer_create/setitimer), some multi-threaded runtimes - * can deadlock when a thread blocks in futex_wait and no wakeup arrives - * (e.g., a shutdown signal delivered to the wrong I/O manager). - * FUTEX_WAIT returns -EINTR after 1 second of blocking to simulate - * periodic signal delivery. All real futex callers (musl, glibc, and - * other managed runtimes) handle -EINTR correctly by re-checking their - * condition and retrying. - */ - struct timeval wait_start; - bool simulate_periodic_eintr = - !has_timeout && futex_should_simulate_periodic_eintr(); - if (simulate_periodic_eintr) - gettimeofday(&wait_start, NULL); - while (!__atomic_load_n(&waiter.woken, __ATOMIC_ACQUIRE)) { if (has_timeout) { int rc = pthread_cond_timedwait(&waiter.cond, &b->lock, &deadline); @@ -552,35 +552,45 @@ static int64_t futex_wait(guest_t *g, ret = -LINUX_ETIMEDOUT; break; } - } else { - /* No timeout specified: poll every 100ms to check for exit_group, - * futex_interrupt (simulated SIGCHLD), or excessive wait time - * (simulated signal interruption). - */ - struct timespec poll_ts; - timespec_deadline_in_ms(&poll_ts, 100); - pthread_cond_timedwait(&waiter.cond, &b->lock, &poll_ts); + continue; + } - if (proc_exit_group_requested() || futex_interrupt_pending()) { - ret = -LINUX_EINTR; - break; - } + /* No timeout specified: poll every 100 ms to check for exit_group, + * futex_interrupt, expired guest itimers, and queued signals. + */ + struct timespec poll_ts; + timespec_deadline_in_ms(&poll_ts, 100); + pthread_cond_timedwait(&waiter.cond, &b->lock, &poll_ts); - /* Simulate periodic signal delivery only for multi-threaded - * guests. Single-threaded glibc startup paths can legitimately - * park in FUTEX_WAIT forever until a real wake arrives, and - * synthetic EINTR here breaks that contract. - */ - if (simulate_periodic_eintr) { - struct timeval now; - gettimeofday(&now, NULL); - long elapsed_ms = (now.tv_sec - wait_start.tv_sec) * 1000 + - (now.tv_usec - wait_start.tv_usec) / 1000; - if (elapsed_ms >= FUTEX_OS_SYNC_EINTR_SIM_MS) { - ret = -LINUX_EINTR; - break; - } - } + if (proc_exit_group_requested() || futex_interrupt_consume()) { + ret = -LINUX_EINTR; + break; + } + + /* Lock-order: bucket lock(7) outranks sig_lock(4), so signal_pending() + * and signal_check_timer() may only be called once the bucket lock has + * been released. Drop it, poke the itimers, observe queued signals + * under sig_lock (the slow-path confirm avoids the stale-true edge + * that the atomic hint can carry after rt_sigprocmask masks the + * queued signal), then re-acquire and re-check waiter.woken in case + * a wake landed in the window. + */ + pthread_mutex_unlock(&b->lock); + signal_check_timer(); + bool sig_ready = signal_pending() != 0; + pthread_mutex_lock(&b->lock); + + if (__atomic_load_n(&waiter.woken, __ATOMIC_ACQUIRE)) + break; + + /* Return EINTR only when a real deliverable signal is queued for + * this thread. POSIX callers (e.g. glibc sem_wait, foot's render + * worker) often do not retry on EINTR, so synthetic spurious + * wakeups cannot be issued here. + */ + if (sig_ready) { + ret = -LINUX_EINTR; + break; } } diff --git a/src/runtime/futex.h b/src/runtime/futex.h index 79f60eb..186b6a8 100644 --- a/src/runtime/futex.h +++ b/src/runtime/futex.h @@ -27,6 +27,7 @@ void futex_init(void); void futex_interrupt_request(void); void futex_interrupt_clear(void); int futex_interrupt_pending(void); +int futex_interrupt_consume(void); /* Main futex syscall entry point. * op: futex operation (FUTEX_WAIT, FUTEX_WAKE, etc.) diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index c1b2e31..297d102 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -19,6 +19,8 @@ */ #define MAPS_NAME_COLUMN 73 +#include +#include #include #include #include @@ -40,6 +42,8 @@ #include #include #include +#include +#include #include "utils.h" @@ -1479,11 +1483,805 @@ static void proc_task_collect_cb(thread_entry_t *t, void *arg) c->tids[c->ntids++] = t->guest_tid; } +/* Pseudoterminal master side-table. + * + * Bridges two host vs guest mismatches in one place: + * + * 1. The macOS /dev/ptmx master is not itself a tty. TIOCSWINSZ / TIOCGWINSZ + * on the bare master return ENOTTY until something has opened the + * corresponding slave once, and the stored winsize gets cleared whenever + * the slave refcount drops to zero (verified empirically on macOS 15). + * Linux ptmx masters are tty fds in their own right, so guests assume those + * ioctls work without an open slave. To bridge the gap, every /dev/ptmx + * open eagerly opens one slave host fd that elfuse holds for the lifetime + * of the master and never exposes to the guest. + * + * 2. macOS slaves live at /dev/ttysNNN; Linux glibc looks for /dev/pts/N where + * N comes from TIOCGPTN. Guest opens of /dev/pts/N route back to the + * macOS path captured from ptsname(3) at /dev/ptmx open time, not a + * re-formatted guess, so format changes in macOS (or unusual minor + * encodings) cannot strand the guest with the wrong slave. + * + * Entries are keyed by the host master fd because that is what fd_cleanup_entry + * has when the guest closes a master. Capacity matches the macOS default UNIX98 + * slave count; overflow leaves the entry empty and the guest gets the pre-fix + * degraded behavior for that one pair instead of an open failure. + * + * Fork-restored entries may outlive their master for one /dev/pts/N open. A + * foot / sshd / posix-compliant child closes the master fd after fork before + * opening the slave (the child has no use for the master); without retaining + * the path mapping past close, the subsequent /dev/pts/N open in the child + * loses its translation and fails with ENOENT even though the parent still + * holds the master and the macOS slave node is openable. Those stale entries + * keep the received slave fd until the first translated open attempt, then + * expire before the minor can be reused for an unrelated host tty. Ordinary + * local master closes clear the mapping immediately. + */ +#define PTY_KEEPALIVE_MAX 256 +#define PTY_KEEPALIVE_FREE (-1) +/* PTY_SLAVE_PATH_MAX lives in procemu.h so this table and the fork-IPC + * payload (proc_pty_ipc_entry_t) cannot drift apart. + */ +static struct { + int master_host_fd; + int slave_host_fd; + uint32_t linux_pts_num; + bool stale_open_once; + char slave_path[PTY_SLAVE_PATH_MAX]; +} pty_keepalive_table[PTY_KEEPALIVE_MAX]; +static pthread_mutex_t pty_keepalive_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_once_t pty_keepalive_once = PTHREAD_ONCE_INIT; + +/* Sentinel-init. Other fields stay BSS-zero; without sentinels a host fd 0 + * close would match slot 0 and close the wrong fd inside elfuse. + */ +static void pty_keepalive_init(void) +{ + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + pty_keepalive_table[i].master_host_fd = PTY_KEEPALIVE_FREE; + pty_keepalive_table[i].slave_host_fd = PTY_KEEPALIVE_FREE; + } +} + +static void pty_keepalive_lock_acquire(void) +{ + pthread_once(&pty_keepalive_once, pty_keepalive_init); + pthread_mutex_lock(&pty_keepalive_lock); +} + +/* Find a slot by master_host_fd; -1 if none. Caller holds the lock. */ +static int pty_keepalive_find_master_locked(int master_host_fd) +{ + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) + if (pty_keepalive_table[i].master_host_fd == master_host_fd) + return i; + return -1; +} + +static int pty_keepalive_clear_slot_locked(int slot) +{ + int slave = pty_keepalive_table[slot].slave_host_fd; + pty_keepalive_table[slot].master_host_fd = PTY_KEEPALIVE_FREE; + pty_keepalive_table[slot].slave_host_fd = PTY_KEEPALIVE_FREE; + pty_keepalive_table[slot].linux_pts_num = 0; + pty_keepalive_table[slot].stale_open_once = false; + pty_keepalive_table[slot].slave_path[0] = '\0'; + return slave; +} + +static uint32_t pty_extract_pts_num(const char *slave_path) +{ + /* macOS canonical slave paths are /dev/ttysNNN with a decimal tail. Read + * the longest decimal suffix and return it as the Linux pts number used + * by guest /dev/pts/N. Returns UINT32_MAX on parse failure so callers + * can reject ambiguous names rather than silently aliasing. + */ + if (!slave_path) + return UINT32_MAX; + const char *p = slave_path + strlen(slave_path); + while (p > slave_path && isdigit((unsigned char) p[-1])) + p--; + if (!*p || !isdigit((unsigned char) *p)) + return UINT32_MAX; + char *endp; + unsigned long n = strtoul(p, &endp, 10); + if (endp == p || *endp != '\0' || n > UINT32_MAX) + return UINT32_MAX; + return (uint32_t) n; +} + +/* Result codes for the locked register helper. */ +#define PTY_REG_INSERTED 0 /* new entry installed */ +#define PTY_REG_EXISTS 1 /* a matching entry already existed */ +#define PTY_REG_FULL (-1) /* table out of free slots */ + +/* Caller-holds-lock variant. Returns one of PTY_REG_* and, on PTY_REG_EXISTS, + * writes the existing entry's pts number to *existing_pts_num. The lock-held + * variant exists so proc_pty_master_adopt can atomically pair fd-table slot + * validation with keepalive insertion under fd_lock + pty_keepalive_lock, + * eliminating the race window where a sibling close+recycle between validate + * and register would attach the keepalive to the wrong file. + */ +static int pty_keepalive_register_locked(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path, + bool stale_open_once, + uint32_t *existing_pts_num) +{ + int empty_slot = -1; + int stale_path_slot = -1; + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd == master_host_fd) { + if (existing_pts_num) + *existing_pts_num = pty_keepalive_table[i].linux_pts_num; + return PTY_REG_EXISTS; + } + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) + continue; + /* Prefer a stale-path slot with the same pts number: the macOS minor + * deterministically maps to the same slave_path string, so reusing + * keeps lookups path-correct and bounds the table at one slot per + * live minor instead of accumulating a new entry on every reopen. + */ + if (pty_keepalive_table[i].slave_path[0] != '\0' && + pty_keepalive_table[i].linux_pts_num == linux_pts_num) { + stale_path_slot = i; + } else if (empty_slot < 0 && + pty_keepalive_table[i].slave_path[0] == '\0') { + empty_slot = i; + } + } + int slot = (stale_path_slot >= 0) ? stale_path_slot : empty_slot; + if (slot < 0) { + /* Out of empty slots and no stale-path match: evict the lowest-index + * stale-path entry so the live registration cannot starve. Live entries + * are never evicted. The eviction policy is approximately LRU: empty + * slots fill from low indices, so the lowest-index stale slot tends to + * be the oldest closed. A theoretical race exists with the + * close-before-open child pattern (a child stales slot K under + * pty_keepalive_lock and races into open("/dev/pts/N") just as another + * thread evicts slot K to register a different minor) but needs the + * keepalive table to be full -- live and stale entries both count -- + * with the staling thread's slot being the lowest-index stale. Well + * outside the foot / sshd workload that motivated this code. + */ + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE && + pty_keepalive_table[i].slave_path[0] != '\0') { + slot = i; + break; + } + } + if (slot < 0) + return PTY_REG_FULL; + } + pty_keepalive_table[slot].master_host_fd = master_host_fd; + if (pty_keepalive_table[slot].slave_host_fd >= 0 && + pty_keepalive_table[slot].slave_host_fd != slave_host_fd) + close(pty_keepalive_table[slot].slave_host_fd); + pty_keepalive_table[slot].slave_host_fd = slave_host_fd; + pty_keepalive_table[slot].linux_pts_num = linux_pts_num; + pty_keepalive_table[slot].stale_open_once = stale_open_once; + if (slave_path) + str_copy_trunc(pty_keepalive_table[slot].slave_path, slave_path, + PTY_SLAVE_PATH_MAX); + else + pty_keepalive_table[slot].slave_path[0] = '\0'; + return PTY_REG_INSERTED; +} + +/* Lock-acquiring convenience wrapper used by the open-time and fork-restore + * paths where atomicity with fd_table is not required. Returns 0 on success + * (including PTY_REG_EXISTS, in which case the caller should close its own + * redundant slave_host_fd), -1 with errno set on table-full (ENOSPC). + */ +static int pty_keepalive_register(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path, + bool stale_open_once) +{ + pty_keepalive_lock_acquire(); + int rc = pty_keepalive_register_locked(master_host_fd, slave_host_fd, + linux_pts_num, slave_path, + stale_open_once, NULL); + pthread_mutex_unlock(&pty_keepalive_lock); + if (rc == PTY_REG_FULL) { + errno = ENOSPC; + return -1; + } + if (rc == PTY_REG_EXISTS) + errno = EEXIST; + return 0; +} + +uint32_t proc_pty_master_pts_num(int master_host_fd) +{ + if (master_host_fd < 0) + return UINT32_MAX; + pty_keepalive_lock_acquire(); + int slot = pty_keepalive_find_master_locked(master_host_fd); + uint32_t pts_num = + (slot < 0) ? UINT32_MAX : pty_keepalive_table[slot].linux_pts_num; + pthread_mutex_unlock(&pty_keepalive_lock); + return pts_num; +} + +/* Re-validate that fd_table[guest_fd] still refers to (host_fd, generation). + * Returns true when both match the snapshot, false otherwise (slot closed or + * recycled). Used by proc_pty_master_adopt to bracket every host-fd-number + * access against the closing-and-reuse race. + */ +static bool pty_fd_still_canonical(int guest_fd, + int canonical_host_fd, + uint64_t canonical_gen) +{ + fd_entry_t snap; + if (!fd_snapshot(guest_fd, &snap)) + return false; + return snap.host_fd == canonical_host_fd && + snap.generation == canonical_gen; +} + +uint32_t proc_pty_master_adopt(int guest_fd) +{ + /* Step 1: atomically snapshot (host_fd, generation) and dup the + * canonical fd in a single fd_lock window. fd_snapshot_and_dup pins + * the file object behind the canonical host fd, so even if a sibling + * closes the guest fd and the host fd number is recycled by an + * unrelated open, host syscalls against the probe still operate on + * the right tty. The generation captured here is the witness for the + * subsequent table lookup and register validations. + */ + fd_entry_t snap; + int probe = fd_snapshot_and_dup(guest_fd, &snap); + if (probe < 0) + return UINT32_MAX; + int canonical_host_fd = snap.host_fd; + uint64_t canonical_gen = snap.generation; + + /* Fast path: a keepalive was already registered for this canonical fd + * (typical case for /dev/ptmx opens that went through pty_open_master). + * The keepalive table is keyed by host fd number, so re-validate the + * slot identity before trusting the returned pts_num. If the fd has + * been recycled to a different file (generation mismatch), the + * existing entry belongs to that file, not ours, and the slow path + * below must register a fresh entry for our pinned probe. + */ + uint32_t existing = proc_pty_master_pts_num(canonical_host_fd); + if (existing != UINT32_MAX && + pty_fd_still_canonical(guest_fd, canonical_host_fd, canonical_gen)) { + close(probe); + return existing; + } + + /* Step 2: confirm the file really is a /dev/ptmx master. ptsname(3) + * returns NULL/ENOTTY on non-pty descriptors, so a stray TIOCGPTN + * against a regular file is rejected without any side effect. + */ + char slave_path[PTY_SLAVE_PATH_MAX]; + uint32_t pts_num = UINT32_MAX; + int slave = -1; + if (ptsname_r(probe, slave_path, sizeof(slave_path)) != 0) + goto out; + pts_num = pty_extract_pts_num(slave_path); + if (pts_num == UINT32_MAX) + goto out; + + /* unlockpt(3) is harmless if the sender already unlocked. EINVAL means + * already unlocked; anything else means the slave will not open and + * we give up cleanly. + */ + if (unlockpt(probe) < 0 && errno != EINVAL) { + pts_num = UINT32_MAX; + goto out; + } + slave = open(slave_path, O_RDWR | O_NOCTTY | O_CLOEXEC); + if (slave < 0) { + pts_num = UINT32_MAX; + goto out; + } + + /* Step 3: re-validate AND publish under the joint pty_keepalive_lock + + * fd_lock window. Lock order is pty_keepalive_lock first; + * duplicate_guest_fd uses the same order when bracketing + * fd_snapshot_and_dup + proc_pty_dup_keepalive_locked, so the two paths + * cannot deadlock. With both held, no sibling can flip the fd_table slot + * between the validation read and the keepalive insert, so the keepalive + * cannot attach to a recycled canonical host fd. + */ + pty_keepalive_lock_acquire(); + pthread_mutex_lock(&fd_lock); + if (fd_table[guest_fd].type == FD_CLOSED || + fd_table[guest_fd].host_fd != canonical_host_fd || + fd_table[guest_fd].generation != canonical_gen) { + pthread_mutex_unlock(&fd_lock); + pthread_mutex_unlock(&pty_keepalive_lock); + close(slave); + pts_num = UINT32_MAX; + goto out; + } + uint32_t existing_pts = UINT32_MAX; + int rc = pty_keepalive_register_locked(canonical_host_fd, slave, pts_num, + slave_path, false, &existing_pts); + pthread_mutex_unlock(&fd_lock); + pthread_mutex_unlock(&pty_keepalive_lock); + if (rc == PTY_REG_FULL) { + close(slave); + pts_num = UINT32_MAX; + } else if (rc == PTY_REG_EXISTS) { + /* Another adopter registered first; their slave keeps the tty alive. + * The pts_num came from the locked scan above, so it is the value the + * winning entry holds and is not subject to a lookup-after-recycle + * race. + */ + close(slave); + pts_num = existing_pts; + } + +out: + close(probe); + return pts_num; +} + +/* Look up the captured macOS slave path for a Linux pts number. Returns 0 and + * writes the path on hit, -1 with errno=ENOENT on miss. Used by the /dev/pts/N + * open and stat intercepts so they hit the exact path returned by ptsname(3) + * rather than a guessed /dev/ttys%03lu reformat that breaks if macOS changes + * its naming scheme or uses an unexpected minor encoding. + */ +static int pty_lookup_slave_path(uint32_t linux_pts_num, + char *out, + size_t out_sz) +{ + if (!out || out_sz == 0) { + errno = EINVAL; + return -1; + } + int hit = -1; + pty_keepalive_lock_acquire(); + /* Prefer a live entry (master still open in this process) over a stale + * path entry. Both encode the same slave_path for a given minor on macOS, + * so the preference only matters if a future change ever lets the two + * diverge - live wins by breaking out of the scan on first match. + */ + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].linux_pts_num != linux_pts_num) + continue; + if (pty_keepalive_table[i].slave_path[0] == '\0') + continue; + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) { + hit = i; + break; + } + if (!pty_keepalive_table[i].stale_open_once || + pty_keepalive_table[i].slave_host_fd < 0) + continue; + if (hit < 0) + hit = i; + } + if (hit < 0) { + pthread_mutex_unlock(&pty_keepalive_lock); + errno = ENOENT; + return -1; + } + size_t len = strlen(pty_keepalive_table[hit].slave_path); + if (len >= out_sz) { + pthread_mutex_unlock(&pty_keepalive_lock); + errno = ENAMETOOLONG; + return -1; + } + memcpy(out, pty_keepalive_table[hit].slave_path, len + 1); + pthread_mutex_unlock(&pty_keepalive_lock); + return 0; +} + +static int pty_open_slave(uint32_t linux_pts_num, int linux_flags) +{ + int oflags = translate_open_flags(linux_flags) & + (O_ACCMODE | O_NONBLOCK | O_CLOEXEC | O_NOCTTY); + char host_path[PTY_SLAVE_PATH_MAX]; + int stale_hit = -1; + int retained_slaves[PTY_KEEPALIVE_MAX]; + int nretained = 0; + int fd = -1; + + pty_keepalive_lock_acquire(); + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].linux_pts_num != linux_pts_num) + continue; + if (pty_keepalive_table[i].slave_path[0] == '\0') + continue; + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) { + size_t len = strlen(pty_keepalive_table[i].slave_path); + if (len >= sizeof(host_path)) { + pthread_mutex_unlock(&pty_keepalive_lock); + errno = ENAMETOOLONG; + return -1; + } + memcpy(host_path, pty_keepalive_table[i].slave_path, len + 1); + pthread_mutex_unlock(&pty_keepalive_lock); + return open(host_path, oflags); + } + if (stale_hit < 0 && pty_keepalive_table[i].stale_open_once && + pty_keepalive_table[i].slave_host_fd >= 0) + stale_hit = i; + } + + if (stale_hit < 0) { + pthread_mutex_unlock(&pty_keepalive_lock); + errno = ENOENT; + return -1; + } + + /* Stale fork-child entries are one-shot. The retained slave fd pins the + * macOS tty while we translate the close-before-open sequence, preventing + * the cached path from resolving to a reused unrelated minor. Regardless + * of open success, consume the stale mapping before returning. + */ + size_t len = strlen(pty_keepalive_table[stale_hit].slave_path); + if (len >= sizeof(host_path)) { + int retained_slave = pty_keepalive_clear_slot_locked(stale_hit); + pthread_mutex_unlock(&pty_keepalive_lock); + if (retained_slave >= 0) + close(retained_slave); + errno = ENAMETOOLONG; + return -1; + } + memcpy(host_path, pty_keepalive_table[stale_hit].slave_path, len + 1); + fd = open(host_path, oflags); + int saved = errno; + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) + continue; + if (!pty_keepalive_table[i].stale_open_once) + continue; + if (strncmp(pty_keepalive_table[i].slave_path, host_path, + PTY_SLAVE_PATH_MAX) != 0) + continue; + int retained_slave = pty_keepalive_clear_slot_locked(i); + if (retained_slave >= 0 && nretained < PTY_KEEPALIVE_MAX) + retained_slaves[nretained++] = retained_slave; + } + pthread_mutex_unlock(&pty_keepalive_lock); + for (int i = 0; i < nretained; i++) + close(retained_slaves[i]); + errno = saved; + return fd; +} + +static int pty_open_pts_dir(int linux_flags) +{ + char dir[80]; + uint32_t pts_nums[PTY_KEEPALIVE_MAX]; + int pts_count = 0; + int n = snprintf(dir, sizeof(dir), "/tmp/elfuse-pts-XXXXXX"); + if (n < 0 || (size_t) n >= sizeof(dir)) { + errno = ENAMETOOLONG; + return -1; + } + if (!mkdtemp(dir)) + return -1; + + pty_keepalive_lock_acquire(); + /* Enumerate live masters and fork-child one-shot stale entries. The stale + * entries retain a slave fd until the first open attempt consumes them, so + * they cannot name a reused unrelated tty while they appear in readdir. + */ + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].slave_path[0] == '\0') + continue; + if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE && + (!pty_keepalive_table[i].stale_open_once || + pty_keepalive_table[i].slave_host_fd < 0)) + continue; + /* The recycle/reuse-by-pts_num invariant in + * pty_keepalive_register_locked keeps at most one entry per minor, so + * no de-duplication pass is needed here. + */ + pts_nums[pts_count++] = pty_keepalive_table[i].linux_pts_num; + } + pthread_mutex_unlock(&pty_keepalive_lock); + + for (int i = 0; i < pts_count; i++) { + char entry[160]; + int en = snprintf(entry, sizeof(entry), "%s/%u", dir, pts_nums[i]); + if (en <= 0 || (size_t) en >= sizeof(entry)) + continue; + int tfd = open(entry, O_CREAT | O_WRONLY, 0444); + if (tfd >= 0) + close(tfd); + } + + pthread_once(&proc_scratch_atexit_once, proc_scratch_register_atexit); + + pthread_mutex_lock(&proc_scratch_lock); + if (proc_scratch_dirs_count < PROC_SCRATCH_DIRS_MAX) { + str_copy_trunc(proc_scratch_dirs[proc_scratch_dirs_count++], dir, + sizeof(proc_scratch_dirs[0])); + } + pthread_mutex_unlock(&proc_scratch_lock); + + int fd = proc_open_dir_fd(dir, linux_flags); + if (fd < 0) { + int saved = errno; + proc_scratch_remove_one(dir); + errno = saved; + } + return fd; +} + +void proc_pty_lock_for_dup(void) +{ + pty_keepalive_lock_acquire(); +} + +void proc_pty_unlock_for_dup(void) +{ + pthread_mutex_unlock(&pty_keepalive_lock); +} + +void proc_pty_dup_keepalive_locked(int src_master_host_fd, + int dst_master_host_fd) +{ + /* Caller-holds-lock variant; see header for the dup race this guards. */ + if (src_master_host_fd < 0 || dst_master_host_fd < 0) + return; + + int slot = pty_keepalive_find_master_locked(src_master_host_fd); + if (slot < 0) + return; + int dst_slave = dup(pty_keepalive_table[slot].slave_host_fd); + if (dst_slave < 0) + return; + uint32_t src_pts_num = pty_keepalive_table[slot].linux_pts_num; + char src_slave_path[PTY_SLAVE_PATH_MAX]; + memcpy(src_slave_path, pty_keepalive_table[slot].slave_path, + PTY_SLAVE_PATH_MAX); + + /* dup(2) clears FD_CLOEXEC; the keepalive must not survive exec into + * a guest child that has no map back to it. + */ + int fdflags = fcntl(dst_slave, F_GETFD); + if (fdflags < 0 || fcntl(dst_slave, F_SETFD, fdflags | FD_CLOEXEC) < 0) { + close(dst_slave); + return; + } + int rc = + pty_keepalive_register_locked(dst_master_host_fd, dst_slave, + src_pts_num, src_slave_path, false, NULL); + if (rc != PTY_REG_INSERTED) { + /* Table full or duplicate entry for dst_master_host_fd; drop the + * redundant slave. Duplicate is unexpected: dst is a freshly-duped + * host fd that should not already be in the table unless a prior + * close skipped proc_pty_close_keepalive. + */ + close(dst_slave); + } +} + +void proc_pty_close_keepalive(int master_host_fd) +{ + /* fd_cleanup_entry calls this for every guest fd close, not just pty + * masters; pty_keepalive_lock_acquire guarantees sentinel-init first. + */ + if (master_host_fd < 0) + return; + + int slave = -1; + pty_keepalive_lock_acquire(); + int slot = pty_keepalive_find_master_locked(master_host_fd); + if (slot >= 0) { + if (pty_keepalive_table[slot].stale_open_once) { + /* Fork-restored child entry: retain the slave fd and path for one + * /dev/pts/N open after close(master). pty_open_slave consumes and + * closes it on the first translated open attempt. + */ + pty_keepalive_table[slot].master_host_fd = PTY_KEEPALIVE_FREE; + } else { + slave = pty_keepalive_clear_slot_locked(slot); + } + } + pthread_mutex_unlock(&pty_keepalive_lock); + if (slave >= 0) + close(slave); +} + +static void proc_pty_expire_stale_by_path(const char *slave_path) +{ + if (!slave_path || slave_path[0] == '\0') + return; + + int stale_slaves[PTY_KEEPALIVE_MAX]; + int nslaves = 0; + pty_keepalive_lock_acquire(); + for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) { + if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) + continue; + if (!pty_keepalive_table[i].stale_open_once) + continue; + if (strncmp(pty_keepalive_table[i].slave_path, slave_path, + PTY_SLAVE_PATH_MAX) != 0) + continue; + int slave = pty_keepalive_clear_slot_locked(i); + if (slave >= 0 && nslaves < PTY_KEEPALIVE_MAX) + stale_slaves[nslaves++] = slave; + } + pthread_mutex_unlock(&pty_keepalive_lock); + for (int i = 0; i < nslaves; i++) + close(stale_slaves[i]); +} + +static int pty_keepalive_register_recycled(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path, + bool stale_open_once) +{ + proc_pty_expire_stale_by_path(slave_path); + return pty_keepalive_register(master_host_fd, slave_host_fd, linux_pts_num, + slave_path, stale_open_once); +} + +int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries, + int *out_slave_fds, + int max_entries) +{ + if (!out_entries || !out_slave_fds || max_entries <= 0) + return 0; + + int n = 0; + pty_keepalive_lock_acquire(); + for (int i = 0; i < PTY_KEEPALIVE_MAX && n < max_entries; i++) { + if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE) + continue; + + /* dup under the lock so the slave fd cannot be closed and the host fd + * number recycled before SCM_RIGHTS reads it. The caller closes the dup + * after the send completes. + */ + int duped = dup(pty_keepalive_table[i].slave_host_fd); + if (duped < 0) + continue; + + out_entries[n].master_host_fd = pty_keepalive_table[i].master_host_fd; + out_entries[n].linux_pts_num = pty_keepalive_table[i].linux_pts_num; + _Static_assert(sizeof(out_entries[n].slave_path) == PTY_SLAVE_PATH_MAX, + "ipc slave_path size must match keepalive table"); + memcpy(out_entries[n].slave_path, pty_keepalive_table[i].slave_path, + PTY_SLAVE_PATH_MAX); + out_slave_fds[n] = duped; + n++; + } + pthread_mutex_unlock(&pty_keepalive_lock); + return n; +} + +void proc_pty_restore_keepalive(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path) +{ + /* fork-IPC hand-off. SCM_RIGHTS drops FD_CLOEXEC; set it here so the + * keepalive does not survive exec. Any failure drops the slave fd. + */ + if (master_host_fd < 0) + goto drop; + + if (slave_host_fd >= 0) { + int fdflags = fcntl(slave_host_fd, F_GETFD); + if (fdflags < 0 || + fcntl(slave_host_fd, F_SETFD, fdflags | FD_CLOEXEC) < 0) + goto drop; + } + + /* Trust the parent's linux_pts_num verbatim instead of re-parsing + * slave_path. The wire-format string is bounded to PTY_SLAVE_PATH_MAX - 1 + * bytes; if a future macOS canonical form ever exceeded that, the parent + * would have truncated and reparsing here would yield the wrong number. On + * EEXIST the child's fd_table-restore path replayed master_host_fd over a + * prior recv-keepalive entry; drop the redundant slave so it does not leak. + */ + errno = 0; + if (pty_keepalive_register_recycled(master_host_fd, slave_host_fd, + linux_pts_num, slave_path, true) < 0 || + errno == EEXIST) + goto drop; + return; + +drop: + if (slave_host_fd >= 0) + close(slave_host_fd); +} + +/* Open /dev/ptmx, unlock the slave, and instantiate a keepalive slave fd so the + * master's tty ioctls work before the guest opens the slave itself. + * Returns the master host fd on success, -1 with errno set on failure. + */ +static int pty_open_master(int linux_flags) +{ + /* /dev/ptmx is a character device; O_CREAT / O_TRUNC / O_EXCL make no sense + * here. Strip them and only honor accmode + descriptor flags so the host + * open(2) never sees a variadic-mode-required combination without a mode + * arg. + */ + int oflags = translate_open_flags(linux_flags) & + (O_ACCMODE | O_NONBLOCK | O_CLOEXEC | O_NOCTTY); + int master = open("/dev/ptmx", oflags); + if (master < 0) + return -1; + + /* grantpt(3) is a no-op on a unix98 pty mount, but call it for clarity + * and to match what posix_openpt(3)'s callers expect to have happened. + */ + char slave_path[PTY_SLAVE_PATH_MAX]; + if (grantpt(master) < 0 || unlockpt(master) < 0 || + ptsname_r(master, slave_path, sizeof(slave_path)) != 0) { + close_keep_errno(master); + return -1; + } + + /* Establish the (linux_pts_num, slave_path) mapping that /dev/pts/N opens + * and stats resolve through. If table or slave-fd registration fails after + * the master is open, report EMFILE rather than silently returning a master + * fd whose pts number cannot be opened back through /dev/pts/N. The caller + * can close other pty pairs and retry instead of dealing with a half-broken + * descriptor. + */ + uint32_t linux_pts_num = pty_extract_pts_num(slave_path); + if (linux_pts_num == UINT32_MAX) { + close(master); + errno = ENOTTY; + return -1; + } + int slave = open(slave_path, O_RDWR | O_NOCTTY | O_CLOEXEC); + if (slave < 0) { + close_keep_errno(master); + return -1; + } + errno = 0; + if (pty_keepalive_register_recycled(master, slave, linux_pts_num, + slave_path, false) < 0) { + close(slave); + close(master); + errno = EMFILE; + return -1; + } + /* Defense-in-depth: the freshly-opened master fd should not already have + * a keepalive (would indicate a stale entry from a prior close that did + * not run proc_pty_close_keepalive). Drop the redundant slave so it does + * not leak. + */ + if (errno == EEXIST) + close(slave); + return master; +} + int proc_intercept_open(const guest_t *g, const char *path, int linux_flags, int mode) { + /* /dev/ptmx -> host /dev/ptmx + keepalive slave (see pty_open_master). + * O_PATH is path-only on Linux: it must not run the device open hook or + * allocate a pty pair. Use a harmless backing fd; FD_PATH gates I/O and + * ioctl, while proc_intercept_stat supplies the visible device metadata. + */ + if (!strcmp(path, "/dev/ptmx")) { + if (linux_flags & LINUX_O_PATH) { + if (linux_flags & LINUX_O_DIRECTORY) { + errno = ENOTDIR; + return -1; + } + int oflags = O_RDONLY; + if (linux_flags & LINUX_O_CLOEXEC) + oflags |= O_CLOEXEC; + return open("/dev/null", oflags); + } + return pty_open_master(linux_flags); + } + /* /dev/null, /dev/zero, /dev/(u)random, /dev/tty */ const char *host_dev = NULL; int host_accmode = translate_open_flags(linux_flags) & O_ACCMODE; @@ -1525,6 +2323,7 @@ int proc_intercept_open(const guest_t *g, const char *shm = shm_dir_path(); return shm ? proc_open_dir_fd(shm, linux_flags) : -1; } + if (!strncmp(path, "/dev/shm/", 9)) { char host_path[512]; if (dev_shm_resolve_path(path + 9, host_path, sizeof(host_path)) < 0) @@ -1548,6 +2347,40 @@ int proc_intercept_open(const guest_t *g, if (!strncmp(path, "/dev/fd/", 8)) return dev_fd_dup(path, 8); + /* /dev/pts -> synthetic devpts directory. stat/access advertise this + * directory even on macOS hosts without /dev/pts, so open must be + * intercepted too or callers that probe then enumerate see inconsistent + * Linux-visible behavior. + */ + if (!strcmp(path, "/dev/pts") || !strcmp(path, "/dev/pts/")) + return pty_open_pts_dir(linux_flags); + + /* /dev/pts/N -> the macOS slave path captured at /dev/ptmx open time. + * Looking up the exact ptsname(3) string (rather than reformatting + * /dev/ttys%03lu) keeps the guest correct against any future macOS format + * change and against tty minor encodings that do not round-trip through + * plain zero-padding. ENOENT until the owning master is opened matches + * Linux devpts behavior for an unallocated slave number. + */ + if (!strncmp(path, "/dev/pts/", 9)) { + const char *digits = path + 9; + if (!*digits) { + errno = ENOENT; + return -1; + } + char *endp; + unsigned long n = strtoul(digits, &endp, 10); + if (endp == digits || *endp != '\0' || n > UINT32_MAX) { + errno = ENOENT; + return -1; + } + /* /dev/pts/N is a character device; strip O_CREAT and friends so + * the two-argument open(2) never sees a creation-mode-required + * combination without a mode arg. + */ + return pty_open_slave((uint32_t) n, linux_flags); + } + /* /proc -> synthetic directory with PID entries for busybox ps, top, etc. * Creates a temp dir once (cached for the process lifetime) with entries * matching the current single-process model: the current PID directory + @@ -1573,9 +2406,8 @@ int proc_intercept_open(const guest_t *g, * Each open gets its own scratch dir so concurrent enumerations cannot * mutate one another (see proc_open_fd_scratch). */ - if (!strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/")) { + if (!strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/")) return proc_open_fd_scratch("elfuse-fd", linux_flags); - } if (!strcmp(path, "/proc/net") || !strcmp(path, "/proc/net/")) { const char *dir = ensure_proc_tmpdir(g); @@ -1590,9 +2422,9 @@ int proc_intercept_open(const guest_t *g, return proc_open_dir_fd(netdir, linux_flags); } - /* /proc/[/...] -> /proc/self[...]. Returns -1 on - * ENAMETOOLONG so the guest sees the same error a real Linux kernel - * would produce instead of falling through to a host syscall. + /* /proc/[/...] -> /proc/self[...]. + * Returns -1 on ENAMETOOLONG so the guest sees the same error a real Linux + * kernel would produce instead of falling through to a host syscall. */ { char alias[LINUX_PATH_MAX]; @@ -1621,9 +2453,9 @@ int proc_intercept_open(const guest_t *g, * return an actual file descriptor to the binary. * Under rosetta, the binfmt_misc convention treats rosetta as the * interpreter visible to the guest: rosetta opens /proc/self/fd/X - * via /proc/self/exe to identify itself and then issues the VZ - * ioctls on that descriptor. Return ROSETTA_PATH so the VZ ioctl - * gate (rosetta_ioctl_target_fd) recognises the fd. + * via /proc/self/exe to identify itself and then issues the VZ ioctls on + * that descriptor. Return ROSETTA_PATH so the VZ ioctl gate + * (rosetta_ioctl_target_fd) recognises the fd. */ if (!strcmp(path, "/proc/self/exe")) { if (g && g->is_rosetta) @@ -1637,8 +2469,8 @@ int proc_intercept_open(const guest_t *g, } /* /proc/cpuinfo -> synthetic file with CPU count. - * Buffer sized dynamically from ncpu (~200 bytes/entry) to avoid - * silent truncation on hosts with >16 CPUs. + * Buffer sized dynamically from ncpu (~200 bytes/entry) to avoid silent + * truncation on hosts with >16 CPUs. */ if (!strcmp(path, "/proc/cpuinfo")) { int ncpu = (int) sysconf(_SC_NPROCESSORS_ONLN); @@ -1746,8 +2578,8 @@ int proc_intercept_open(const guest_t *g, } /* /proc/self/task -> directory with per-thread TID entries. - * Debuggers and runtimes (GDB, LLDB, JVM, Go runtime) probe this at - * startup to discover thread count and per-thread state. + * Debuggers and runtimes (GDB, LLDB, JVM, Go runtime) probe this at startup + * to discover thread count and per-thread state. * * Rebuilds a temp directory on each open (thread set is dynamic). * Cannot rmdir before returning the fd because macOS getdents on unlinked @@ -1815,8 +2647,8 @@ int proc_intercept_open(const guest_t *g, } /* /proc/self/task/ directory itself: synthesize a dir with - * stat/status placeholder entries. Persistent so getdents sees - * the entries on macOS (which cannot enumerate unlinked dirs). + * stat/status placeholder entries. Persistent so getdents sees the + * entries on macOS (which cannot enumerate unlinked dirs). */ if (*endp == '\0' || !strcmp(endp, "/")) { static proc_persistent_dir_t tiddir = @@ -1881,9 +2713,9 @@ int proc_intercept_open(const guest_t *g, /* Add preannounced entries only while they still have an uncovered * tail. Once the union of live regions covers the full advertised - * interval, suppress the shadow entry so /proc/self/maps shows only - * the realized split VMAs. A partial union must stay visible because - * some reserved-but-not-realized span remains to advertise. + * interval, suppress the shadow entry so /proc/self/maps shows only the + * realized split VMAs. A partial union must stay visible because some + * reserved-but-not-realized span remains to advertise. */ for (int i = 0; i < g->npreannounced && nentries < MAPS_ENTRY_MAX; i++) { @@ -2185,8 +3017,9 @@ int proc_intercept_open(const guest_t *g, uint64_t mask; /* fs/signalfd.c uses a tab after the colon (matching the * pos:/flags:/mnt_id: convention in fs/proc/fd.c, not the - * single-space style of eventfd/timerfd). Verified against a - * real Linux 6.x /proc/self/fdinfo dump. */ + * single-space style of eventfd/timerfd). Verified against a real + * Linux 6.x /proc/self/fdinfo dump. + */ if (signalfd_fdinfo_snapshot(n, &mask)) snprintf(extra, sizeof(extra), "sigmask:\t%016llx\n", (unsigned long long) mask); @@ -2196,11 +3029,12 @@ int proc_intercept_open(const guest_t *g, int64_t value_ns, interval_ns; if (timerfd_fdinfo_snapshot(n, &clockid, &ticks, &value_ns, &interval_ns)) { - /* Linux fs/timerfd.c emits these fields with single - * spaces after the colon, not tabs (unlike pos:/flags:/ - * mnt_id: in fs/proc/fd.c, which do use tabs). Match the - * upstream format so guest readers parsing fdinfo via a - * "it_value: (" prefix find the field. */ + /* Linux fs/timerfd.c emits these fields with single spaces + * after the colon, not tabs (unlike pos:/flags:/mnt_id: in + * fs/proc/fd.c, which do use tabs). Match the upstream format + * so guest readers parsing fdinfo via a "it_value: (" prefix + * find the field. + */ snprintf(extra, sizeof(extra), "clockid: %d\n" "ticks: %llu\n" @@ -2227,9 +3061,9 @@ int proc_intercept_open(const guest_t *g, } /* /proc/self/fdinfo -> directory listing. Each open gets its own scratch - * dir so concurrent getdents on independent dirfds cannot interfere - * (the previous shared-dir design unlinked entries under a sibling - * enumerator). The dirs are tracked for atexit cleanup. + * dir so concurrent getdents on independent dirfds cannot interfere (the + * previous shared-dir design unlinked entries under a sibling enumerator). + * The dirs are tracked for atexit cleanup. */ if (!strcmp(path, "/proc/self/fdinfo") || !strcmp(path, "/proc/self/fdinfo/")) { @@ -2276,6 +3110,7 @@ int proc_intercept_open(const guest_t *g, buffers_kb = total_kb / 20; cached_kb = total_kb / 4; } + return proc_emit_fmt( "MemTotal: %llu kB\n" "MemFree: %llu kB\n" @@ -2313,9 +3148,9 @@ int proc_intercept_open(const guest_t *g, } /* /proc/self/io -> synthetic I/O counters. - * Some node-style observability runtimes read this for resource - * monitoring metrics. procfs emulation returns zeroed counters because - * it does not track per-guest I/O. + * Some node-style observability runtimes read this for resource monitoring + * metrics. procfs emulation returns zeroed counters because it does not + * track per-guest I/O. */ if (!strcmp(path, "/proc/self/io")) { return proc_emit_literal( @@ -2474,6 +3309,23 @@ int proc_intercept_stat(const char *path, struct stat *st) if (!strcmp(path, "/dev/fuse")) return fuse_proc_stat(st); + /* Linux /dev/ptmx is the Unix98 pty multiplexer character device (5:2). + * Keep this synthetic so O_PATH probes can fstat the path fd without + * forcing a real host /dev/ptmx open, which would allocate a pty. + */ + if (!strcmp(path, "/dev/ptmx")) { + memset(st, 0, sizeof(*st)); + st->st_mode = S_IFCHR | 0666; + st->st_nlink = 1; + st->st_dev = PROC_SYNTH_DEV; + st->st_ino = proc_synth_ino(path); + st->st_uid = proc_get_uid(); + st->st_gid = proc_get_gid(); + st->st_rdev = ((dev_t) 5u << 24) | (dev_t) 2u; + st->st_blksize = 1024; + return 0; + } + /* /dev/shm is a directory */ if (!strcmp(path, "/dev/shm") || !strcmp(path, "/dev/shm/")) { stat_fill_proc_dir(st, 01777, 2, @@ -2488,6 +3340,63 @@ int proc_intercept_stat(const char *path, struct stat *st) return stat(host_path, st); } + /* /dev/pts directory and /dev/pts/N slave entries. glibc ptsname(3) + * stats /dev/pts/N after TIOCGPTN and rejects with ENOENT if absent. + * Synthesize a minimal char-device stat whose st_rdev decodes to Linux's + * standard pts major (136) so glibc's major(rdev) == UNIX98_PTY_SLAVE_MAJOR + * check passes. The numeric tail must round-trip with /dev/ttysN via the + * open intercept (see proc_intercept_open). + */ + if (!strcmp(path, "/dev/pts") || !strcmp(path, "/dev/pts/")) { + stat_fill_proc_dir(st, 0755, 2, path); + return 0; + } + if (!strncmp(path, "/dev/pts/", 9)) { + const char *digits = path + 9; + if (!*digits) { + errno = ENOENT; + return -1; + } + char *endp; + unsigned long n = strtoul(digits, &endp, 10); + if (endp == digits || *endp != '\0' || n > UINT32_MAX) { + errno = ENOENT; + return -1; + } + /* Resolve through the captured-path table: ENOENT unless the + * corresponding master is currently open. This avoids the host + * stat false-positive where /dev/ttysNNN happens to exist for an + * unrelated tty allocated outside elfuse. + */ + char host_path[PTY_SLAVE_PATH_MAX]; + if (pty_lookup_slave_path((uint32_t) n, host_path, sizeof(host_path)) < + 0) + return -1; + struct stat host_st; + if (stat(host_path, &host_st) < 0) { + errno = ENOENT; + return -1; + } + memset(st, 0, sizeof(*st)); + st->st_mode = S_IFCHR | 0620; + st->st_nlink = 1; + st->st_uid = host_st.st_uid; + st->st_gid = host_st.st_gid; + /* macOS dev_t = (major << 24) | minor; the fs-stat translation layer + * (mac_to_linux_dev) re-encodes that into Linux's split major/minor + * layout, so storing 136 in the macOS-major slot makes glibc's + * major(rdev) yield UNIX98_PTY_SLAVE_MAJOR. + */ + st->st_rdev = ((dev_t) 136u << 24) | (dev_t) (n & 0xFFFFFFu); + st->st_size = 0; + st->st_blksize = 1024; + st->st_blocks = 0; + st->st_atime = host_st.st_atime; + st->st_mtime = host_st.st_mtime; + st->st_ctime = host_st.st_ctime; + return 0; + } + /* /proc and /proc/ are directories */ if (!strcmp(path, "/proc") || !strcmp(path, "/proc/")) { stat_fill_proc_dir(st, 0555, 3, path); @@ -2645,10 +3554,11 @@ int proc_intercept_stat(const char *path, struct stat *st) struct stat host_st; if (lstat(host_path, &host_st) < 0) return -1; + /* Replace host inode/dev with the synthetic-procfs convention so - * the guest sees a stable identity that does not collide with - * real host files (and so st_size reads as 0 for cpumask files, - * matching real sysfs). + * the guest sees a stable identity that does not collide with real + * host files (and so st_size reads as 0 for cpumask files, matching + * real sysfs). */ if (S_ISDIR(host_st.st_mode)) stat_fill_proc_dir(st, 0555, 2, path); @@ -2699,9 +3609,9 @@ int proc_intercept_readlink(const char *path, char *buf, size_t bufsiz) char sysroot_snap[LINUX_PATH_MAX]; if (proc_sysroot_snapshot(sysroot_snap, sizeof(sysroot_snap))) { /* proc_set_sysroot stores a realpath()-canonicalized form, so - * canonicalize exe before the prefix check or the strip fails - * when /var -> /private/var (and similar macOS symlinks) make - * the two strings diverge. + * canonicalize exe before the prefix check or the strip fails when + * /var -> /private/var (and similar macOS symlinks) make the two + * strings diverge. */ const char *exe_cmp = exe; if (realpath(exe, exe_real)) @@ -2843,10 +3753,11 @@ int proc_intercept_write(int guest_fd, return 0; int kind = proc_oom_path_kind(snap.proc_path); if (kind == OOM_PATH_SCORE) { - /* Linux: oom_score has no write handler. proc_reg_write returns - * -EIO when the underlying proc_dir_entry exposes no write op, - * not -EINVAL. Match that so guests probing the error code see - * the same value as on a real kernel. */ + /* Linux: oom_score has no write handler. proc_reg_write returns -EIO + * when the underlying proc_dir_entry exposes no write op, not -EINVAL. + * Match that so guests probing the error code see the same value as on + * a real kernel. + */ errno = EIO; return -1; } @@ -2854,8 +3765,8 @@ int proc_intercept_write(int guest_fd, return 0; /* Linux: zero-byte writes to proc nodes succeed without side effects. - * Without this short-circuit, sys_writev would funnel a zero-length - * vector through proc_parse_int_write and get -EINVAL. + * Without this short-circuit, sys_writev would funnel a zero-length vector + * through proc_parse_int_write and get -EINVAL. */ if (count == 0) { *written_out = 0; @@ -2902,6 +3813,7 @@ int proc_intercept_write(int guest_fd, goto unlock; if (!use_pwrite && lseek(host_fd, offset + (int64_t) count, SEEK_SET) < 0) goto unlock; + atomic_store(&oom_score_adj_value, score_adj); proc_oom_refresh_live_fds_locked(); *written_out = (ssize_t) count; diff --git a/src/runtime/procemu.h b/src/runtime/procemu.h index 58f4f30..78227ae 100644 --- a/src/runtime/procemu.h +++ b/src/runtime/procemu.h @@ -11,6 +11,7 @@ #pragma once #include +#include #include #include #include "core/guest.h" @@ -86,3 +87,80 @@ const char *proc_get_shm_dir(void); int proc_dev_shm_resolve(const char *guest_suffix, char *host_path, size_t host_path_sz); + +/* Drop the keepalive slave fd paired with a /dev/ptmx master host fd. Called + * from fd_cleanup_entry when the guest closes the master so the host kernel + * eventually tears the tty down. Idempotent and safe to call on any host fd + * (no-op when no keepalive is registered). + */ +void proc_pty_close_keepalive(int master_host_fd); + +/* Caller-locked variant of pty keepalive duplication. Brackets the host + * fd_snapshot_and_dup and the keepalive mirror under one pty_keepalive_lock + * window so a sibling close cannot run proc_pty_close_keepalive in the + * gap and leave the alias without a keepalive entry. Caller must wrap the + * sequence with proc_pty_lock_for_dup / proc_pty_unlock_for_dup. + */ +void proc_pty_lock_for_dup(void); +void proc_pty_unlock_for_dup(void); +void proc_pty_dup_keepalive_locked(int src_master_host_fd, + int dst_master_host_fd); + +/* Return the captured Linux pts number for a host master fd, or UINT32_MAX + * when no keepalive is registered. Lets sys_ioctl TIOCGPTN report the value + * /dev/pts/N opens / stats round-trip through, instead of independently + * parsing the macOS slave path and risking divergence with the open table. + */ +uint32_t proc_pty_master_pts_num(int master_host_fd); + +/* Best-effort lazy registration for a /dev/ptmx master that elfuse did not + * open itself (e.g. one received from a peer via SCM_RIGHTS). Takes a guest + * fd because the canonical host fd would race with sibling close+reuse if + * passed in directly: this helper snapshots fd_table[guest_fd].host_fd and + * its generation, performs the slave open against a private dup, and only + * registers the keepalive after re-verifying the slot still holds the + * original (host_fd, generation). Returns the pts number on success, or + * UINT32_MAX if the fd is not a pty master, the slot got closed/recycled + * mid-adoption, or the keepalive table is full. Idempotent: if the master + * already has a keepalive entry, returns its stored linux_pts_num. + */ +uint32_t proc_pty_master_adopt(int guest_fd); + +/* Max bytes for a captured macOS slave path (e.g. "/dev/ttys004"). Lives in + * the header so proc_pty_ipc_entry_t below and the in-memory keepalive table + * in procemu.c share one source of truth; a divergence would silently corrupt + * the fork-IPC wire format. + */ +#define PTY_SLAVE_PATH_MAX 64 + +/* Serialized form of a pty keepalive entry, used by fork-IPC. The slave host + * fd travels separately via SCM_RIGHTS; this struct only carries the + * lifetime-independent metadata that the child needs to re-register. + */ +typedef struct { + int32_t master_host_fd; + uint32_t linux_pts_num; + char slave_path[PTY_SLAVE_PATH_MAX]; +} proc_pty_ipc_entry_t; + +/* Snapshot the current pty keepalive table into out_entries / out_slave_fds. + * dup()s every slave fd under the keepalive lock so the snapshot stays valid + * across the SCM_RIGHTS send even if the original master gets closed before + * the IPC drains. Returns the number of live entries written (always + * <= max_entries); on success the caller owns the duplicated slave fds and + * must close them after the IPC send completes. + */ +int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries, + int *out_slave_fds, + int max_entries); + +/* Re-register a single keepalive in the child after a fork-IPC. master_host_fd + * is the child-side host fd that just landed in fd_table (its number is + * different from the parent's). Takes ownership of slave_host_fd. The slave + * path is recorded so /dev/pts/N opens in the child resolve to the same macOS + * tty the parent has been talking to. + */ +void proc_pty_restore_keepalive(int master_host_fd, + int slave_host_fd, + uint32_t linux_pts_num, + const char *slave_path); diff --git a/src/syscall/abi.h b/src/syscall/abi.h index ac58574..3aa50ed 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -333,6 +333,7 @@ typedef struct { #define LINUX_EMULTIHOP 72 /* Multihop attempted */ #define LINUX_EILSEQ 84 /* Illegal byte sequence */ #define LINUX_EHOSTDOWN 112 /* Host is down */ +#define LINUX_ENODATA 61 /* No data available (xattr missing, stream) */ /* Linux FD flags. */ #define LINUX_FD_CLOEXEC 1 @@ -351,6 +352,7 @@ typedef struct { #define LINUX_TIOCSPGRP 0x5410 /* -> macOS TIOCSPGRP (same semantics) */ #define LINUX_TIOCSCTTY 0x540E /* -> macOS TIOCSCTTY (same semantics) */ #define LINUX_TIOCGWINSZ 0x5413 /* -> macOS TIOCGWINSZ (same struct) */ +#define LINUX_TIOCSWINSZ 0x5414 /* -> macOS TIOCSWINSZ (same struct) */ #define LINUX_FIONREAD 0x541B /* -> macOS FIONREAD (same semantics) */ #define LINUX_FIONBIO 0x5421 /* set/clear O_NONBLOCK (arg: int *) */ #define LINUX_FIONCLEX 0x5450 /* clear close-on-exec on fd */ @@ -362,6 +364,13 @@ typedef struct { #define LINUX_TCSETS2 0x402c542b /* termios2 set (TCSANOW) */ #define LINUX_TCSETSW2 0x402c542c /* termios2 set (TCSADRAIN) */ #define LINUX_TCSETSF2 0x402c542d /* termios2 set (TCSAFLUSH) */ +/* Pseudoterminal multiplexer ioctls. The numeric encodings match Linux + * include/uapi/asm-generic/ioctls.h regardless of architecture. macOS exposes + * an equivalent /dev/ptmx and unlockpt(3); ptsname(3) returns /dev/ttysNNN. + */ +#define LINUX_TIOCGPTN 0x80045430 /* _IOR('T', 0x30, unsigned int) */ +#define LINUX_TIOCSPTLCK 0x40045431 /* _IOW('T', 0x31, int) */ +#define LINUX_TIOCGPTPEER 0x5441 /* _IO('T', 0x41); arg is open flags */ /* Linux open flags. */ #define LINUX_O_RDONLY 0x0000 @@ -390,6 +399,13 @@ typedef struct { #define LINUX_O_CLOEXEC 0x80000 /* 02000000 octal */ #define LINUX_O_PATH 0x200000 /* 010000000 octal */ +/* Linux fallocate(2) mode bits (linux/falloc.h). PUNCH_HOLE requires the + * caller to also set KEEP_SIZE per the manpage; collapse/insert/zero/unshare + * range modes are recognised numerically but elsewhere unsupported. + */ +#define LINUX_FALLOC_FL_KEEP_SIZE 0x01 +#define LINUX_FALLOC_FL_PUNCH_HOLE 0x02 + /* Linux AT_* constants. */ #define LINUX_AT_FDCWD (-100) #define LINUX_AT_SYMLINK_NOFOLLOW 0x100 diff --git a/src/syscall/fdtable.c b/src/syscall/fdtable.c index a072cfb..13e95bd 100644 --- a/src/syscall/fdtable.c +++ b/src/syscall/fdtable.c @@ -21,6 +21,7 @@ #include "utils.h" #include "core/shim-globals.h" +#include "runtime/procemu.h" #include "syscall/abi.h" #include "syscall/internal.h" @@ -477,6 +478,12 @@ void fd_cleanup_entry(int guest_fd, const fd_entry_t *snap) if (snap->cleanup) snap->cleanup(guest_fd); + /* Drop any /dev/ptmx keepalive slave fd paired with this host fd. Must + * happen before close(snap->host_fd) because the side table is keyed by + * the still-live host master fd. No-op for non-pty fds. + */ + proc_pty_close_keepalive(snap->host_fd); + /* Keep stdin/stdout/stderr open on the host */ if (snap->type != FD_STDIO) close(snap->host_fd); diff --git a/src/syscall/fs-stat.c b/src/syscall/fs-stat.c index eb584b7..d246016 100644 --- a/src/syscall/fs-stat.c +++ b/src/syscall/fs-stat.c @@ -212,29 +212,45 @@ static int64_t stat_at_path(guest_t *g, return 0; } - host_fd_ref_t dir_ref; - if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) - return -LINUX_EBADF; - int64_t rc = 0; + host_fd_ref_t dir_ref = {.fd = -1, .owned = false}; if ((flags & LINUX_AT_EMPTY_PATH) && pathp[0] == '\0') { /* Linux: AT_EMPTY_PATH with dirfd == AT_FDCWD operates on the * current working directory. */ - if (dir_ref.fd == AT_FDCWD) { + if (dirfd == LINUX_AT_FDCWD) { + dir_ref.fd = AT_FDCWD; int mac_flags = translate_at_flags(flags); if (fstatat(AT_FDCWD, ".", mac_st, mac_flags) < 0) { rc = linux_errno(); goto done; } - } else if (dir_ref.fd < 0) { - rc = -LINUX_EBADF; - goto done; - } else if (fstat(dir_ref.fd, mac_st) < 0) { - rc = linux_errno(); - goto done; + } else { + fd_entry_t snap; + dir_ref.fd = fd_snapshot_and_dup(dirfd, &snap); + dir_ref.owned = true; + if (dir_ref.fd < 0) { + rc = -LINUX_EBADF; + goto done; + } + if (snap.type == FD_PATH && snap.proc_path[0] != '\0') { + int intercepted = proc_intercept_stat(snap.proc_path, mac_st); + if (intercepted == 0) + goto done; + if (intercepted == -1) { + rc = linux_errno(); + goto done; + } + } + if (fstat(dir_ref.fd, mac_st) < 0) { + rc = linux_errno(); + goto done; + } } } else { + if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) + return -LINUX_EBADF; + int intercepted = PROC_NOT_INTERCEPTED; if (path_might_use_stat_intercept(tx.intercept_path)) { intercepted = proc_intercept_stat(tx.intercept_path, mac_st); @@ -275,6 +291,19 @@ int64_t sys_fstat(guest_t *g, int fd, uint64_t stat_gva) if (frc != -LINUX_EBADF) return frc; + fd_entry_t snap; + if (fd_snapshot(fd, &snap) && snap.type == FD_PATH && + snap.proc_path[0] != '\0') { + int intercepted = proc_intercept_stat(snap.proc_path, &mac_st); + if (intercepted == 0) { + if (write_linux_stat(g, stat_gva, &mac_st) < 0) + return -LINUX_EFAULT; + return 0; + } + if (intercepted == -1) + return linux_errno(); + } + host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) { log_debug("fstat(%d): invalid guest fd", fd); diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 004f3b2..627f29b 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -105,19 +105,33 @@ static const char *proc_stateful_file_path(const char *path) return NULL; } -static void fd_note_proc_path(int guest_fd, const char *path) +/* Resolve the proc_path the fd table should record for an intercepted path. + * Returns true and fills *out when a mapping exists; false otherwise so the + * caller can skip the install entirely. Pure string work; safe to call before + * any lock acquisition. + */ +static bool resolve_virtual_path(const char *path, char *out, size_t out_size) { - if (!path || strncmp(path, "/proc", 5) != 0) - return; + if (!path || out_size == 0) + return false; + + if (!strcmp(path, "/dev/ptmx")) { + str_copy_trunc(out, path, out_size); + return true; + } + + if (strncmp(path, "/proc", 5) != 0) + return false; char virt_buf[64]; const char *virt = proc_virtual_dir_path(path, virt_buf, sizeof(virt_buf)); if (!virt) virt = proc_stateful_file_path(path); + if (!virt) + return false; - if (virt) - str_copy_trunc(fd_table[guest_fd].proc_path, virt, - sizeof(fd_table[guest_fd].proc_path)); + str_copy_trunc(out, virt, out_size); + return true; } static const char *proc_virtual_dir_path(const char *path, @@ -185,7 +199,8 @@ static int fd_alloc_opened_host(int host_fd, int type, int linux_flags, int min_guest_fd, - void (*cleanup)(int)) + void (*cleanup)(int), + const char *virtual_path) { DIR *dir = NULL; @@ -213,16 +228,23 @@ static int fd_alloc_opened_host(int host_fd, return -1; } - /* Publish linux_flags, dir, and the urandom bitmap bit atomically - * with respect to the slot's identity. fd_alloc_*_relaxed drops + /* Resolve the virtual-path stamp before taking fd_lock; the helper is pure + * string work and must not run inside the critical section. + */ + char proc_path_buf[FD_VIRTUAL_PATH_MAX]; + bool have_proc_path = resolve_virtual_path(virtual_path, proc_path_buf, + sizeof(proc_path_buf)); + + /* Publish linux_flags, dir, proc_path, and the urandom bitmap bit + * atomically with respect to the slot's identity. fd_alloc_*_relaxed drops * fd_lock before returning, so a sibling vCPU's pathological - * close(guest_fd) + open() could reuse the slot between alloc and - * the metadata install below. Re-acquire fd_lock and verify the - * (type, host_fd) tuple still matches what just got allocated; - * if it does not, the slot belongs to a different file now and - * any install would clobber the sibling's entry. The sibling's - * close path already cleaned up our host_fd via fd_cleanup_entry, - * so this side only owns dir, which gets closed below. + * close(guest_fd) + open() could reuse the slot between alloc and the + * metadata install below. Re-acquire fd_lock and verify the + * (type, host_fd) tuple still matches what just got allocated; if it does + * not, the slot belongs to a different file now and any install would + * clobber the sibling's entry. The sibling's close path already cleaned up + * the host_fd of this side via fd_cleanup_entry, so this side only owns + * dir, which gets closed below. */ bool installed = false; pthread_mutex_lock(&fd_lock); @@ -231,6 +253,9 @@ static int fd_alloc_opened_host(int host_fd, fd_table[guest_fd].linux_flags = linux_flags; if (dir) fd_table[guest_fd].dir = dir; + if (have_proc_path) + memcpy(fd_table[guest_fd].proc_path, proc_path_buf, + sizeof(proc_path_buf)); bool readable_urandom = type == FD_URANDOM && (linux_flags & LINUX_O_ACCMODE) != LINUX_O_WRONLY; @@ -283,8 +308,8 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(sidecar_fd); return linux_errno(); } - int guest_fd = - fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1, NULL); + int guest_fd = fd_alloc_opened_host(sidecar_fd, type, linux_flags, + -1, NULL, NULL); if (guest_fd < 0) { close_keep_errno(sidecar_fd); return linux_errno(); @@ -314,7 +339,7 @@ int64_t sys_openat_path(guest_t *g, return linux_errno(); } int guest_fd = - fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -342,19 +367,25 @@ int64_t sys_openat_path(guest_t *g, int type = intercepted_fd_type(tx.intercept_path, intercepted, linux_flags); if (type < 0) { + /* /dev/ptmx registers a keepalive slave under intercepted + * before this point; without dropping it here the slave fd + * leaks because nothing else has the master in fd_table. + * proc_pty_close_keepalive is a no-op for other paths. + */ + proc_pty_close_keepalive(intercepted); close_keep_errno(intercepted); return linux_errno(); } int min_guest_fd = (!strncmp(tx.intercept_path, "/dev/", 5)) ? -1 : 128; - int guest_fd = - fd_alloc_opened_host(intercepted, type, linux_flags, - min_guest_fd, fd_cleanup_for_type(type)); + int guest_fd = fd_alloc_opened_host( + intercepted, type, linux_flags, min_guest_fd, + fd_cleanup_for_type(type), tx.intercept_path); if (guest_fd < 0) { + proc_pty_close_keepalive(intercepted); close_keep_errno(intercepted); return linux_errno(); } - fd_note_proc_path(guest_fd, tx.intercept_path); return guest_fd; } if (intercepted == -1) { @@ -375,7 +406,7 @@ int64_t sys_openat_path(guest_t *g, return linux_errno(); } int guest_fd = - fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -397,7 +428,8 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(host_fd); return linux_errno(); } - int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); + int guest_fd = + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -430,6 +462,13 @@ int64_t sys_close(int fd) int host_fd = -1; if (fd_close_regular_relaxed(fd, &host_fd)) { + /* The fast path bypasses fd_cleanup_entry, so any side tables + * keyed by host_fd that the slow path drops must be drained here + * too. proc_pty_close_keepalive is a cheap no-op for non-pty fds + * and prevents the keepalive slave from leaking past a /dev/ptmx + * close when no per-type cleanup is registered. + */ + proc_pty_close_keepalive(host_fd); if (close(host_fd) < 0) return linux_errno(); return 0; @@ -542,18 +581,27 @@ static int duplicate_guest_fd(int src_fd, bool fixed_slot, int linux_flags) { - /* Snapshot the source entry and dup its host fd in a single fd_lock - * critical section so the type, host fd, and metadata captured here - * cannot drift apart under a racing close + reopen. + /* Hold pty_keepalive_lock across the source snapshot, host dup, and + * keepalive mirror so a concurrent sys_close on src_fd cannot remove + * the source's keepalive entry between fd_snapshot_and_dup and + * proc_pty_dup_keepalive_locked. Without this bracket the alias would + * land in fd_table with no keepalive of its own. + * + * Lock order is pty_keepalive_lock -> fd_lock (fd_snapshot_and_dup + * takes fd_lock internally); proc_pty_master_adopt's joint-locked + * publish uses the same order so the two paths do not deadlock. */ + proc_pty_lock_for_dup(); fd_entry_t src_snap; int new_host_fd = fd_snapshot_and_dup(src_fd, &src_snap); if (new_host_fd < 0 && src_snap.type == FD_CLOSED) { + proc_pty_unlock_for_dup(); errno = EBADF; return -1; } if (src_snap.type == FD_FUSE_DEV || src_snap.type == FD_FUSE_FILE || src_snap.type == FD_FUSE_DIR) { + proc_pty_unlock_for_dup(); if (new_host_fd >= 0) close_keep_errno(new_host_fd); return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot, @@ -566,13 +614,27 @@ static int duplicate_guest_fd(int src_fd, * bind there. */ if (src_snap.type == FD_EVENTFD) { + proc_pty_unlock_for_dup(); if (new_host_fd >= 0) close_keep_errno(new_host_fd); return eventfd_dup_fd(src_fd, src_snap.host_fd, min_guest_fd, fixed_guest_fd, fixed_slot, linux_flags); } - if (new_host_fd < 0) + if (new_host_fd < 0) { + proc_pty_unlock_for_dup(); return -1; + } + + /* Mirror any /dev/ptmx keepalive BEFORE fd_alloc publishes guest_fd. + * Once the guest fd exists, a sibling thread can close it; that runs + * fd_cleanup_entry which calls proc_pty_close_keepalive(new_host_fd). + * For that cleanup to drop the freshly-duped keepalive, the keepalive + * entry must already be in the table; registering after fd_alloc would + * lose the race and leak the slave fd. No-op when the source has no + * keepalive. + */ + proc_pty_dup_keepalive_locked(src_snap.host_fd, new_host_fd); + proc_pty_unlock_for_dup(); int new_type = (src_snap.type == FD_STDIO) ? FD_REGULAR : src_snap.type; void (*cleanup)(int) = fd_cleanup_for_type(new_type); @@ -583,6 +645,10 @@ static int duplicate_guest_fd(int src_fd, if (guest_fd < 0) { if (fixed_slot) errno = EBADF; + /* fd_cleanup_entry never ran on new_host_fd (no guest fd was + * registered), so the keepalive must be dropped explicitly here. + */ + proc_pty_close_keepalive(new_host_fd); close_keep_errno(new_host_fd); return -1; } diff --git a/src/syscall/io.c b/src/syscall/io.c index 67f02cd..879db75 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1551,6 +1552,7 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) } case LINUX_TIOCGWINSZ: { /* Get terminal window size */ + (void) proc_pty_master_adopt(fd); struct winsize ws; if (ioctl(host_fd, TIOCGWINSZ, &ws) < 0) { host_fd_ref_close(&host_ref); @@ -1569,6 +1571,33 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) host_fd_ref_close(&host_ref); return 0; } + case LINUX_TIOCSWINSZ: { + /* Set terminal window size. Same struct as TIOCGWINSZ; foot, sshd, + * tmux, and any libvte-derived emulator call this on the PTY master + * after spawning the slave child. Without it, terminal startup fails + * with -ENOTTY from the default arm below. + * + * A master received through SCM_RIGHTS bypasses /dev/ptmx open + * interception, so lazily create its keepalive before the host ioctl. + * The helper is a no-op for non-pty fds; the real ioctl below still + * supplies the final errno. + */ + linux_winsize_t lws; + if (guest_read_small(g, arg, &lws, sizeof(lws)) < 0) { + host_fd_ref_close(&host_ref); + return -LINUX_EFAULT; + } + struct winsize ws = { + .ws_row = lws.ws_row, + .ws_col = lws.ws_col, + .ws_xpixel = lws.ws_xpixel, + .ws_ypixel = lws.ws_ypixel, + }; + (void) proc_pty_master_adopt(fd); + int rc = ioctl(host_fd, TIOCSWINSZ, &ws); + host_fd_ref_close(&host_ref); + return rc < 0 ? linux_errno() : 0; + } case LINUX_TCGETS: { /* Get terminal attributes. c_cc index mapping is in file-scope @@ -1715,6 +1744,96 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) return 0; } + case LINUX_TIOCGPTN: { + /* Get the slave pty number associated with a /dev/ptmx master fd. + * Pass the guest fd: proc_pty_master_adopt snapshots the canonical + * (host_fd, generation) under fd_lock, performs the slave open on a + * private dup, then re-validates the slot before publishing the + * keepalive. Passing the per-syscall host_fd_ref dup or a raw host + * fd would race with sibling close+reuse. + */ + uint32_t val = proc_pty_master_adopt(fd); + if (val == UINT32_MAX) { + host_fd_ref_close(&host_ref); + return -LINUX_ENOTTY; + } + if (guest_write_small(g, arg, &val, sizeof(val)) < 0) { + host_fd_ref_close(&host_ref); + return -LINUX_EFAULT; + } + host_fd_ref_close(&host_ref); + return 0; + } + case LINUX_TIOCSPTLCK: { + /* Lock/unlock the slave side of a pty. glibc unlockpt() always passes + * 0 (unlock); util-linux's setlock(1) passes 1 to lock. macOS exposes + * unlockpt(3) but no re-lock primitive, so the lock branch is accepted + * as a best-effort no-op for real ptmx masters rather than surfacing + * as -EINVAL: an application probing the result would otherwise + * misread the failure as "this kernel has no devpts". + */ + int32_t lock = 0; + if (guest_read_small(g, arg, &lock, sizeof(lock)) < 0) { + host_fd_ref_close(&host_ref); + return -LINUX_EFAULT; + } + int rc = 0; + if (lock == 0) { + rc = unlockpt(host_fd); + } else { + char slave[64]; + if (ptsname_r(host_fd, slave, sizeof(slave)) != 0) { + host_fd_ref_close(&host_ref); + return -LINUX_ENOTTY; + } + } + host_fd_ref_close(&host_ref); + return rc < 0 ? linux_errno() : 0; + } + case LINUX_TIOCGPTPEER: { + /* Return a fresh fd referring to the slave side of a /dev/ptmx master. + * Linux added this in 4.13 so callers can avoid the ptsname(3) round + * trip and any /dev/pts visibility races. The arg holds open(2)-style + * flags. Restrict to the bits Linux's pty driver actually honors + * (accmode + O_NOCTTY + O_NONBLOCK + O_CLOEXEC); any other bit, in + * particular O_CREAT / O_TRUNC / O_EXCL / O_PATH, would be silently + * ignored on Linux and is rejected with EINVAL here so misuse does + * not leak nonsense flags into the guest fd table. + */ + int linux_flags = (int) arg; + const int allowed = LINUX_O_ACCMODE | LINUX_O_NOCTTY | + LINUX_O_NONBLOCK | LINUX_O_CLOEXEC; + if (linux_flags & ~allowed) { + host_fd_ref_close(&host_ref); + return -LINUX_EINVAL; + } + char slave[64]; + if (ptsname_r(host_fd, slave, sizeof(slave)) != 0) { + host_fd_ref_close(&host_ref); + return -LINUX_ENOTTY; + } + int oflags = translate_open_flags(linux_flags); + int host_slave_fd = open(slave, oflags); + if (host_slave_fd < 0) { + int saved_errno = errno; + host_fd_ref_close(&host_ref); + errno = saved_errno; + return linux_errno(); + } + host_fd_ref_close(&host_ref); + int guest_fd = fd_alloc(FD_REGULAR, host_slave_fd, NULL); + if (guest_fd < 0) { + close(host_slave_fd); + return -LINUX_EMFILE; + } + /* Track CLOEXEC + accmode in the guest table so exec honors them; the + * host fd's own FD_CLOEXEC is per-descriptor and would be lost on the + * dup that host_fd_ref hands multi-threaded callers. + */ + fd_publish_linux_flags(guest_fd, linux_flags); + return guest_fd; + } + case LINUX_FIONBIO: { /* Set/clear O_NONBLOCK on the fd. Linux FIONBIO takes an int* arg: * nonzero enables non-blocking, zero disables it. libuv's @@ -1759,8 +1878,83 @@ int64_t sys_fallocate(int fd, int mode, int64_t offset, int64_t len) return -LINUX_EINVAL; } - /* mode 0 = basic allocation -> ftruncate fallback. - * Other modes (FALLOC_FL_PUNCH_HOLE etc.) not supported. + /* FALLOC_FL_PUNCH_HOLE always requires FALLOC_FL_KEEP_SIZE on Linux; + * map both to macOS F_PUNCHHOLE on the host fd, with a pwrite-zeros + * fallback for misalignment. + * + * The Linux semantic is "reads in [offset, offset+len) return zero; + * file size unchanged". macOS F_PUNCHHOLE enforces filesystem block + * alignment on both ends and rejects sub-block requests with EINVAL -- + * that one-byte probe (offset=0 len=1) foot's wl_shm pool issues + * surfaces as "fallocate(FALLOC_FL_PUNCH_HOLE) not supported (Invalid + * argument)" otherwise, and foot disables punch-hole for the whole + * session. + * + * Writing zeros over the region produces the same observable result: + * reads return zero, file size unchanged. The disk-space deallocation + * optimisation is lost on the pwrite path, but the probe succeeds, so + * foot keeps punch-hole enabled and the later, properly aligned calls + * (page-sized buffers) still take the F_PUNCHHOLE fast path. + */ + const int kPunchHole = + LINUX_FALLOC_FL_PUNCH_HOLE | LINUX_FALLOC_FL_KEEP_SIZE; + if (mode == kPunchHole) { + struct fpunchhole hole = { + .fp_flags = 0, + .reserved = 0, + .fp_offset = (off_t) offset, + .fp_length = (off_t) len, + }; + if (fcntl(host_ref.fd, F_PUNCHHOLE, &hole) == 0) { + host_fd_ref_close(&host_ref); + return 0; + } + /* EINVAL: misaligned, sub-block, or non-regular file. pwrite zeros + * only through the current EOF so KEEP_SIZE remains guest-visible. + * Any other host errno propagates verbatim. + */ + if (errno != EINVAL) { + host_fd_ref_close(&host_ref); + return linux_errno(); + } + struct stat st; + if (fstat(host_ref.fd, &st) < 0) { + host_fd_ref_close(&host_ref); + return linux_errno(); + } + if (offset >= st.st_size) { + host_fd_ref_close(&host_ref); + return 0; + } + int64_t remaining = st.st_size - offset; + if (remaining > len) + remaining = len; + + static const char zeros[4096]; + off_t cur = (off_t) offset; + while (remaining > 0) { + size_t chunk = remaining > (int64_t) sizeof(zeros) + ? sizeof(zeros) + : (size_t) remaining; + ssize_t nw = pwrite(host_ref.fd, zeros, chunk, cur); + if (nw < 0) { + if (errno == EINTR) + continue; + host_fd_ref_close(&host_ref); + return linux_errno(); + } + if (nw == 0) + break; /* defensive; pwrite on a regular file should not 0 */ + cur += nw; + remaining -= nw; + } + host_fd_ref_close(&host_ref); + return 0; + } + + /* mode 0 = basic allocation -> ftruncate fallback. Anything else + * (collapse range, zero range, insert range, unshare range) stays + * unsupported and surfaces as -EOPNOTSUPP for the guest to handle. */ if (mode != 0) { host_fd_ref_close(&host_ref); diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 415fa2e..b344816 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -1478,6 +1478,26 @@ static int hvf_apply_file_overlay_quiesced(guest_t *g, return 0; } +/* True when the backing fd allows writes through. The overlay path replaces + * the slab's RW host VA with MAP_SHARED|MAP_FIXED of this fd, and Apple HVF + * refuses hv_vm_map of any permission onto a host VA whose write capability + * does not cover the requested stage-2 perms. A read-only fd lands there + * with the kernel rejecting either PROT_WRITE on the host mmap or, after a + * PROT_READ downgrade, the post-overlay hv_vm_map with HV_DENIED. + * Centralises that decision: both the overlay entry (hvf_apply_file_overlay) + * and the sys_mmap fast-path skip share this gate so read-only backers are + * routed straight to the snapshot pread path. Returns true on the optimistic + * path when fcntl itself fails: the subsequent mmap / hv_vm_map will surface + * the real error rather than this helper synthesising one. + */ +static bool overlay_fd_writable(int fd) +{ + int fl = fcntl(fd, F_GETFL); + if (fl < 0) + return true; + return (fl & O_ACCMODE) != O_RDONLY; +} + /* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd, * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB * segment is split out first if it is not already isolated. Caller @@ -1491,6 +1511,8 @@ static int hvf_apply_file_overlay(guest_t *g, int fd, off_t file_off) { + if (!overlay_fd_writable(fd)) + return -LINUX_EACCES; thread_quiesce_siblings(); int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off); thread_resume_siblings(); @@ -2267,7 +2289,14 @@ int64_t sys_mmap(guest_t *g, * gap-finder advances the hint to the next host-page boundary * after each allocation. */ + /* overlay_fd_writable rejects read-only backing fds inside + * hvf_apply_file_overlay; mirror the check here so a read-only + * mmap takes the snapshot pread path directly, skipping the + * thread_quiesce / segment_split cycle the overlay would + * otherwise perform before returning EACCES. + */ bool overlay_aligned = (flags & LINUX_MAP_SHARED) && + overlay_fd_writable(host_backing_fd) && (result_off % hps == 0) && ((uint64_t) offset % hps == 0); if (overlay_aligned) { @@ -3977,9 +4006,25 @@ int mmap_fork_restore_overlays(guest_t *g, int err = hvf_apply_file_overlay(g, ovl_s, ovl_e - ovl_s, r->backing_fd, (off_t) file_off); if (err < 0) { - log_warn( - "fork-child: overlay re-install [0x%llx, 0x%llx) failed: %d", - (unsigned long long) ovl_s, (unsigned long long) ovl_e, err); + /* -LINUX_EACCES is the writable-fd gate in hvf_apply_file_overlay + * rejecting a read-only backing fd (foot's fontconfig caches, + * shared library file-backed regions, etc.). The fallback to + * snapshot semantics is correct for those: the child reads the + * pre-fork bytes and never writes back, which is what the parent + * already did. Log at debug level so the success path stays + * quiet. Any other failure is unexpected and stays at warn. + */ + if (err == -LINUX_EACCES) + log_debug( + "fork-child: read-only backing fd, skipping overlay " + "[0x%llx, 0x%llx) (snapshot semantics)", + (unsigned long long) ovl_s, (unsigned long long) ovl_e); + else + log_warn( + "fork-child: overlay re-install [0x%llx, 0x%llx) failed: " + "%d", + (unsigned long long) ovl_s, (unsigned long long) ovl_e, + err); rc = err; continue; } diff --git a/src/syscall/path.c b/src/syscall/path.c index b8599ed..d2bf6d8 100644 --- a/src/syscall/path.c +++ b/src/syscall/path.c @@ -75,6 +75,13 @@ bool path_might_use_stat_intercept(const char *path) return true; if (!strcmp(path, "/dev/fuse")) return true; + /* glibc ptsname(3) stats /dev/pts/N after TIOCGPTN to confirm the slave + * exists and is a char device; without this the stat falls through to the + * host where /dev/pts is absent and ptsname returns ENOENT. + */ + if (!strncmp(path, "/dev/pts/", 9) || !strcmp(path, "/dev/pts") || + !strcmp(path, "/dev/pts/")) + return true; if (fuse_path_matches_mount(path)) return true; if (path_prefix_match(path, SYSFS_CPU_PREFIX, sizeof(SYSFS_CPU_PREFIX) - 1)) diff --git a/src/syscall/poll.c b/src/syscall/poll.c index 705a902..aa65d0b 100644 --- a/src/syscall/poll.c +++ b/src/syscall/poll.c @@ -210,7 +210,7 @@ int64_t sys_ppoll(guest_t *g, poll_timeout_ms < 0 ? 200 : poll_timeout_ms); /* Check for exit_group / futex_interrupt after waking */ - if (proc_exit_group_requested() || futex_interrupt_pending()) { + if (proc_exit_group_requested() || futex_interrupt_consume()) { ret = -1; errno = EINTR; break; @@ -549,7 +549,7 @@ int64_t sys_pselect6(guest_t *g, has_timeout ? &ts : &poll_ts, NULL); } - if (proc_exit_group_requested() || futex_interrupt_pending()) { + if (proc_exit_group_requested() || futex_interrupt_consume()) { ret = -1; errno = EINTR; break; @@ -1030,7 +1030,7 @@ int64_t sys_epoll_pwait(guest_t *g, nready = kevent(epoll_ref.fd, NULL, 0, kevents, cap, has_timeout ? &ts : &poll_ts); - if (proc_exit_group_requested() || futex_interrupt_pending()) { + if (proc_exit_group_requested() || futex_interrupt_consume()) { nready = -1; errno = EINTR; break; diff --git a/src/syscall/translate.c b/src/syscall/translate.c index 018690d..7cf2981 100644 --- a/src/syscall/translate.c +++ b/src/syscall/translate.c @@ -93,6 +93,22 @@ int64_t linux_errno(void) #if ENOTSUP != EOPNOTSUPP case ENOTSUP: return -LINUX_EOPNOTSUPP; +#endif + /* macOS xattr "attribute not found" lives at 93 (ENOATTR) on modern + * SDKs; on some versions ENOATTR is a synonym for ENODATA(96). Map + * both to Linux ENODATA(61) so getxattr/lgetxattr/fgetxattr report + * missing attrs correctly. Guarded by #if to avoid duplicate cases + * when the headers alias the two macros. + */ +#ifdef ENOATTR +#if !defined(ENODATA) || ENOATTR != ENODATA + case ENOATTR: + return -LINUX_ENODATA; +#endif +#endif +#ifdef ENODATA + case ENODATA: + return -LINUX_ENODATA; #endif #ifdef ENOTRECOVERABLE case ENOTRECOVERABLE: diff --git a/tests/manifest.txt b/tests/manifest.txt index 8d836eb..b8e779f 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -68,6 +68,7 @@ test-epoll-aba test-timerfd test-large-io-boundary test-ioctl-cloexec +test-pty [section] /proc and /dev emulation tests test-proc @@ -107,6 +108,9 @@ test-cloexec [section] O_PATH semantics tests test-opath +[section] xattr semantics tests +test-xattr + [section] Guard page / mmap edge cases test-guard-page test-mmap-hint diff --git a/tests/test-futex-pi.c b/tests/test-futex-pi.c index 66b84b7..ad3ebbf 100644 --- a/tests/test-futex-pi.c +++ b/tests/test-futex-pi.c @@ -12,9 +12,12 @@ * 2. FUTEX_LOCK_PI + FUTEX_UNLOCK_PI round-trip: acquire and * release a PI lock from the same thread. * - * 3. futex_wait EINTR injection (commit 18bdd0f): futex_wait with - * no timeout returns -EINTR within ~1-2 seconds when no waker - * exists (simulated periodic signal delivery). + * 3. futex_wait blocks indefinitely without a signal: an earlier + * revision returned synthetic -EINTR after ~1 s of unconditional + * blocking, which violated POSIX sem_wait callers that do not + * retry on EINTR (e.g. foot's render_worker_thread). The wait + * must only return when a real wake arrives or a signal is + * genuinely queued for the thread. * * Syscalls exercised: futex(98), clone(220), gettid(178), exit(93) */ @@ -198,92 +201,77 @@ static void test_pi_dead_owner(void) PASS(); } -/* Test 3: EINTR injection after ~1s */ +/* Test 3: futex_wait without a signal blocks until woken */ -/* Sibling that keeps the guest in a multi-threaded state for the duration of - * the EINTR probe. The synthetic EINTR injection in futex_wait only fires - * while thread_is_single_active() is false; a single-threaded guest must be - * allowed to park in FUTEX_WAIT indefinitely so it does not break glibc - * startup paths. The probe therefore has to run with at least one other guest - * thread alive. - * - * The sibling sleeps on a timed futex_wait against keepalive_word with a - * 5-second timeout. The timeout dodges the EINTR injection ('!has_timeout' is - * what gates the sim), and 5 s is long enough to outlast the worst-case parent - * EINTR window (1 s with up to 100 ms poll jitter, plus a safety margin). After - * the parent's probe returns, the parent flips keepalive_word and wakes the - * sibling. +/* Sibling that waits ~1.2 s, flips the futex word, and issues FUTEX_WAKE on + * the parent's address. Used to drive the parent out of an indefinite + * futex_wait via a real wake (not synthetic EINTR). */ -static volatile int sibling_keepalive __attribute__((aligned(4))) = 1; -static char sibling_stack_buf[8192] __attribute__((aligned(16))); +static volatile int waker_word __attribute__((aligned(4))) = 0; +static char waker_stack_buf[8192] __attribute__((aligned(16))); -static void sibling_alive_thread(void) +static void waker_thread(void) { - struct timespec ts = {5, 0}; - while (__atomic_load_n(&sibling_keepalive, __ATOMIC_SEQ_CST) == 1) { - raw_syscall6(__NR_futex, (long) &sibling_keepalive, - FUTEX_WAIT | FUTEX_PRIVATE, 1, (long) &ts, 0, 0); - } + struct timespec ts = {1, 200 * 1000 * 1000}; + raw_syscall6(__NR_nanosleep, (long) &ts, 0, 0, 0, 0, 0); + + __atomic_store_n(&waker_word, 1, __ATOMIC_SEQ_CST); + raw_futex_wake((int *) &waker_word, 1); raw_exit(0); } static void test_futex_eintr(void) { - TEST("futex_wait EINTR after ~1s"); + TEST("futex_wait blocks until real wake, no synthetic EINTR"); - /* Spawn the sibling so thread_is_single_active() is false during the wait. - * CLONE flags match test_pi_dead_owner. - */ - sibling_keepalive = 1; - void *sibling_top = sibling_stack_buf + sizeof(sibling_stack_buf); - int sibling_tid_val = 0; - long sret = raw_clone(0x7d0f00, sibling_top, &sibling_tid_val, 0, - (int *) &sibling_tid_val); + waker_word = 0; + void *waker_top = waker_stack_buf + sizeof(waker_stack_buf); + int waker_tid_val = 0; + long sret = raw_clone(0x7d0f00, waker_top, &waker_tid_val, 0, + (int *) &waker_tid_val); if (sret < 0) { - FAIL("sibling clone failed"); + FAIL("waker clone failed"); return; } if (sret == 0) { - sibling_alive_thread(); + waker_thread(); raw_exit(1); /* unreachable */ } - /* Create a futex word that no one will wake. - * futex_wait with no timeout should return -EINTR after ~1 second - * (elfuse's simulated periodic signal delivery). - */ - volatile int unwoken = 42; - struct timeval t0, t1; gettimeofday(&t0, NULL); - long r = raw_futex_wait((int *) &unwoken, 42); + /* No timeout. With no signal queued, the wait must NOT return until the + * waker thread issues FUTEX_WAKE; earlier revisions returned synthetic + * -EINTR after ~1 s, which broke glibc sem_wait callers. + */ + long r = raw_futex_wait((int *) &waker_word, 0); gettimeofday(&t1, NULL); long elapsed_ms = (t1.tv_sec - t0.tv_sec) * 1000 + (t1.tv_usec - t0.tv_usec) / 1000; - /* Tear down the sibling now that the EINTR check is done. */ - __atomic_store_n(&sibling_keepalive, 0, __ATOMIC_SEQ_CST); - raw_futex_wake((int *) &sibling_keepalive, 1); + /* Reap the waker. */ for (int i = 0; i < 100; i++) { - if (__atomic_load_n(&sibling_tid_val, __ATOMIC_SEQ_CST) == 0) + if (__atomic_load_n(&waker_tid_val, __ATOMIC_SEQ_CST) == 0) break; usleep(10000); } - /* Expect -EINTR (Linux errno 4) after 800ms-3000ms. - * The 1s timeout has jitter from 100ms polling intervals. + /* The waker sleeps ~1200 ms before waking the parent. Accept either rc==0 + * (woken by FUTEX_WAKE after the wait observed waker_word change) or + * -EAGAIN (woken between the waker_word store and the FUTEX_WAKE; the + * value-mismatch path is the documented race for FUTEX_WAIT). Either + * outcome proves the parent did not bail out on synthetic EINTR. */ - if (r == -4 /* -EINTR */ && elapsed_ms >= 800 && elapsed_ms <= 3000) { + if ((r == 0 || r == -11 /* -EAGAIN */) && elapsed_ms >= 1000 && + elapsed_ms <= 4000) { PASS(); - } else if (r == -4) { - /* Got EINTR but timing seems off; still passing, but note it */ - printf("OK (EINTR after %ldms)\n", elapsed_ms); - passes++; } else { - printf("FAIL: expected -EINTR(-4) got %ld, elapsed %ldms\n", r, - elapsed_ms); + printf( + "FAIL: expected rc=0 or -EAGAIN after ~1200ms, got %ld at " + "%ldms\n", + r, elapsed_ms); fails++; } } diff --git a/tests/test-io-opt.c b/tests/test-io-opt.c index e1691c1..b139a60 100644 --- a/tests/test-io-opt.c +++ b/tests/test-io-opt.c @@ -165,6 +165,37 @@ int main(void) FAIL("open failed"); } + TEST("fallocate punch hole keeps size past EOF"); + { + const char *punch_path = "/tmp/elfuse-test-punch.bin"; + unlink(punch_path); + int fd = open(punch_path, O_CREAT | O_RDWR, 0644); + if (fd >= 0) { + const char data[] = "abc"; + const off_t data_size = (off_t) sizeof(data) - 1; + if (write(fd, data, sizeof(data) - 1) == (ssize_t) data_size && + fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1, + 4096) == 0) { + struct stat st; + char buf[sizeof(data) - 1]; + int stat_ok = fstat(fd, &st) == 0; + ssize_t nr = -1; + if (stat_ok) + nr = pread(fd, buf, sizeof(buf), 0); + if (stat_ok && st.st_size == data_size && + nr == (ssize_t) sizeof(buf) && buf[0] == 'a' && + buf[1] == '\0' && buf[2] == '\0') + PASS(); + else + FAIL("punch hole did not preserve size and zero bytes"); + } else + FAIL("punch hole setup failed"); + close(fd); + unlink(punch_path); + } else + FAIL("open failed"); + } + /* Test copy_file_range (via off_t-based API) */ TEST("copy_file_range"); { diff --git a/tests/test-pty.c b/tests/test-pty.c new file mode 100644 index 0000000..ce6344f --- /dev/null +++ b/tests/test-pty.c @@ -0,0 +1,584 @@ +/* PTY ioctl regression test + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Foot, sshd, tmux, and any libvte-derived terminal need the multiplexer + * primitives glibc's posix_openpt(3) / ptsname(3) / openpty(3) stack rests + * on. Exercise the pieces glibc and posix-compliant pty children depend on: + * + * 1. TIOCSWINSZ on the /dev/ptmx master fd (the direct failure foot saw) + * 2. TIOCGPTN -> /dev/pts/N path round trip + * 3. TIOCSPTLCK(0) for unlockpt(), plus non-zero lock request fd typing + * 4. /dev/pts/N open + stat intercept and the slave fd's window size + * 5. /dev/pts/N open in a forked child after the child has already closed + * its master (foot/sshd/openssh sftp-server pattern) + * + * The test stays self-contained on the syscall surface (no libutil/openpty), + * so it runs the same way under elfuse-aarch64, qemu-aarch64, and any future + * elfuse-x86_64 reuse. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +#ifndef TIOCSWINSZ +#define TIOCSWINSZ 0x5414 +#endif +#ifndef TIOCGPTN +#define TIOCGPTN 0x80045430 +#endif +#ifndef TIOCSPTLCK +#define TIOCSPTLCK 0x40045431 +#endif +#ifndef TIOCGPTPEER +#define TIOCGPTPEER 0x5441 +#endif +#ifndef O_PATH +#define O_PATH 010000000 +#endif +#ifndef AT_EMPTY_PATH +#define AT_EMPTY_PATH 0x1000 +#endif +#ifndef SYS_statx +#define SYS_statx 291 +#endif + +int passes = 0, fails = 0; + +static int count_pts_entries(void) +{ + DIR *dir = opendir("/dev/pts"); + if (!dir) + return -1; + + int count = 0; + struct dirent *ent; + while ((ent = readdir(dir))) { + if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) + continue; + int numeric = 1; + for (const char *p = ent->d_name; *p; p++) { + if (!isdigit((unsigned char) *p)) { + numeric = 0; + break; + } + } + if (numeric) + count++; + } + closedir(dir); + return count; +} + +int main(void) +{ + printf("test-pty: PTY ioctl + /dev/pts/N path support\n"); + + /* Regression guard for the pty_keepalive_table BSS-zero collision: any + * close that runs before the very first /dev/ptmx open would walk the + * still-zero-initialized table and match a master_host_fd of zero, + * silently closing the wrong slave_host_fd (also zero). Closing stdin + * here forces fd_cleanup_entry to invoke proc_pty_close_keepalive in + * that vulnerable window. If the fix regresses, every subsequent test + * still passes locally but a future stdio fd in another vCPU may go + * missing. The cheap sentinel makes sure stdin's close itself does + * not corrupt elfuse's own file table. + */ + close(STDIN_FILENO); + + TEST("open(/dev/ptmx, O_PATH) does not allocate a pty"); + int before_path_pts = count_pts_entries(); + int path_fd = open("/dev/ptmx", O_PATH | O_CLOEXEC); + int after_path_pts = count_pts_entries(); + if (path_fd >= 0) { + struct stat ptmx_path_st; + int fstat_rc = fstat(path_fd, &ptmx_path_st); + struct stat ptmx_at_st; + int fstatat_rc = fstatat(path_fd, "", &ptmx_at_st, AT_EMPTY_PATH); + struct statx ptmx_sx; + memset(&ptmx_sx, 0, sizeof(ptmx_sx)); + long statx_rc = + syscall(SYS_statx, path_fd, "", AT_EMPTY_PATH, 0x7ff, &ptmx_sx); + errno = 0; + int fchdir_rc = fchdir(path_fd); + int fchdir_errno = errno; + int ok = before_path_pts >= 0 && after_path_pts == before_path_pts && + fstat_rc == 0 && S_ISCHR(ptmx_path_st.st_mode) && + major(ptmx_path_st.st_rdev) == 5 && + minor(ptmx_path_st.st_rdev) == 2 && fstatat_rc == 0 && + S_ISCHR(ptmx_at_st.st_mode) && + major(ptmx_at_st.st_rdev) == 5 && + minor(ptmx_at_st.st_rdev) == 2 && statx_rc == 0 && + S_ISCHR(ptmx_sx.stx_mode) && ptmx_sx.stx_rdev_major == 5 && + ptmx_sx.stx_rdev_minor == 2 && fchdir_rc < 0 && + fchdir_errno == ENOTDIR; + close(path_fd); + EXPECT_TRUE(ok, + "O_PATH open allocated a pty or exposed wrong path fd " + "semantics"); + } else { + FAIL("O_PATH /dev/ptmx open failed"); + } + + int ptmx = open("/dev/ptmx", O_RDWR | O_NOCTTY); + TEST("open(/dev/ptmx, O_RDWR | O_NOCTTY)"); + EXPECT_TRUE(ptmx >= 0, "open /dev/ptmx failed"); + if (ptmx < 0) { + SUMMARY("test-pty"); + return 1; + } + + /* TIOCSWINSZ on the master was the direct regression that broke foot's + * terminal startup: sys_ioctl had no case for it and fell through to the + * default -ENOTTY arm. + */ + TEST("TIOCSWINSZ on /dev/ptmx master"); + struct winsize ws_set = { + .ws_row = 40, + .ws_col = 132, + .ws_xpixel = 1056, + .ws_ypixel = 640, + }; + EXPECT_TRUE(ioctl(ptmx, TIOCSWINSZ, &ws_set) == 0, "TIOCSWINSZ failed"); + + TEST("TIOCGWINSZ round-trips the values set above"); + struct winsize ws_get = {0}; + int ok = ioctl(ptmx, TIOCGWINSZ, &ws_get) == 0 && ws_get.ws_row == 40 && + ws_get.ws_col == 132 && ws_get.ws_xpixel == 1056 && + ws_get.ws_ypixel == 640; + EXPECT_TRUE(ok, "TIOCGWINSZ round trip mismatch"); + + TEST("TIOCSPTLCK(0) unlocks the slave"); + int unlock = 0; + EXPECT_TRUE(ioctl(ptmx, TIOCSPTLCK, &unlock) == 0, "TIOCSPTLCK(0) failed"); + + /* Linux TIOCSPTLCK(non-zero) locks the slave and returns success. + * elfuse cannot actually enforce the lock on macOS (no re-lock primitive) + * but must still report success so callers do not misread the result as + * "this kernel has no devpts". + */ + TEST("TIOCSPTLCK(1) accepted as best-effort no-op"); + int lock = 1; + EXPECT_TRUE(ioctl(ptmx, TIOCSPTLCK, &lock) == 0, "TIOCSPTLCK(1)"); + + TEST("TIOCSPTLCK(1) rejects a regular file"); + char lock_template[] = "/tmp/elfuse-pty-lock-XXXXXX"; + int regular = mkstemp(lock_template); + if (regular < 0) { + FAIL("mkstemp failed"); + } else { + EXPECT_ERRNO(ioctl(regular, TIOCSPTLCK, &lock), ENOTTY, + "regular fd accepted TIOCSPTLCK(1)"); + close(regular); + unlink(lock_template); + } + + TEST("TIOCSPTLCK(1) rejects a pipe"); + int lock_pipe[2]; + if (pipe(lock_pipe) != 0) { + FAIL("pipe(lock_pipe) failed"); + } else { + EXPECT_ERRNO(ioctl(lock_pipe[0], TIOCSPTLCK, &lock), ENOTTY, + "pipe fd accepted TIOCSPTLCK(1)"); + close(lock_pipe[0]); + close(lock_pipe[1]); + } + + TEST("TIOCGPTN returns a numeric slave id"); + unsigned int ptyno = (unsigned int) -1; + EXPECT_TRUE(ioctl(ptmx, TIOCGPTN, &ptyno) == 0 && ptyno < 100000u, + "TIOCGPTN failed"); + + TEST("stat(/dev/pts) succeeds and reports a directory"); + struct stat pts_dir_st; + int pts_dir_statrc = stat("/dev/pts", &pts_dir_st); + EXPECT_TRUE(pts_dir_statrc == 0 && S_ISDIR(pts_dir_st.st_mode), + "stat /dev/pts failed"); + + TEST("open(/dev/pts, O_DIRECTORY) succeeds"); + int pts_dir_fd = open("/dev/pts", O_RDONLY | O_DIRECTORY); + EXPECT_TRUE(pts_dir_fd >= 0, "open /dev/pts directory failed"); + if (pts_dir_fd >= 0) + close(pts_dir_fd); + + TEST("readdir(/dev/pts) lists the active slave id"); + DIR *pts_dir = opendir("/dev/pts"); + if (!pts_dir) { + FAIL("opendir /dev/pts failed"); + } else { + char want[32]; + snprintf(want, sizeof(want), "%u", ptyno); + int saw_ptyno = 0; + struct dirent *ent; + while ((ent = readdir(pts_dir))) { + if (!strcmp(ent->d_name, want)) { + saw_ptyno = 1; + break; + } + } + closedir(pts_dir); + EXPECT_TRUE(saw_ptyno, "active pts id missing from /dev/pts"); + } + + char pts_path[32]; + snprintf(pts_path, sizeof(pts_path), "/dev/pts/%u", ptyno); + + /* glibc ptsname(3) stats the formatted path before returning it. + * Until the path.c stat allowlist included /dev/pts/N, the stat went + * to the host (which has no /dev/pts at all) and ptsname returned + * ENOENT, leaving every caller without a usable slave path. + */ + TEST("stat(/dev/pts/N) succeeds and reports a char device"); + struct stat st; + int statrc = stat(pts_path, &st); + EXPECT_TRUE(statrc == 0 && S_ISCHR(st.st_mode), "stat /dev/pts/N failed"); + + TEST("stat(/dev/pts/N) major is Linux pts (136)"); + EXPECT_EQ(major(st.st_rdev), 136, "wrong pts major"); + + TEST("open(/dev/pts/N) returns a usable slave fd"); + int slave = open(pts_path, O_RDWR | O_NOCTTY); + EXPECT_TRUE(slave >= 0, "open /dev/pts/N failed"); + + if (slave >= 0) { + TEST("TIOCGWINSZ on the slave reflects the master-side update"); + struct winsize ws_slave = {0}; + int slave_ok = ioctl(slave, TIOCGWINSZ, &ws_slave) == 0 && + ws_slave.ws_row == 40 && ws_slave.ws_col == 132; + EXPECT_TRUE(slave_ok, "slave winsize mismatch"); + + TEST("TIOCSPTLCK(1) rejects the pty slave"); + EXPECT_ERRNO(ioctl(slave, TIOCSPTLCK, &lock), ENOTTY, + "slave fd accepted TIOCSPTLCK(1)"); + + TEST("TIOCSWINSZ on the slave propagates back to the master"); + struct winsize ws_resize = { + .ws_row = 24, + .ws_col = 80, + .ws_xpixel = 0, + .ws_ypixel = 0, + }; + if (ioctl(slave, TIOCSWINSZ, &ws_resize) != 0) { + FAIL("slave TIOCSWINSZ failed"); + } else { + struct winsize ws_after = {0}; + int after_ok = ioctl(ptmx, TIOCGWINSZ, &ws_after) == 0 && + ws_after.ws_row == 24 && ws_after.ws_col == 80; + EXPECT_TRUE(after_ok, "master did not see slave resize"); + } + + close(slave); + } + + /* TIOCGPTPEER short-circuits the ptsname/stat/open dance. Recent foot + * and util-linux prefer it; older kernels return ENOTTY and the caller + * falls back to /dev/pts. Accept either an fd or ENOTTY (some hosts + * legitimately do not implement it), but never silent corruption. */ + TEST("TIOCGPTPEER returns a slave fd or ENOTTY"); + int peer = ioctl(ptmx, TIOCGPTPEER, O_RDWR | O_NOCTTY); + int peer_ok = peer >= 0 || (peer == -1 && errno == ENOTTY); + EXPECT_TRUE(peer_ok, "TIOCGPTPEER returned unexpected status"); + if (peer >= 0) + close(peer); + + /* dup of the master must keep both aliases functional even after the + * original is closed. The keepalive slave needs to be mirrored across + * the dup so the surviving alias still observes master-side tty ioctls. + */ + TEST("dup(ptmx) followed by close(orig) leaves alias usable"); + int alias = dup(ptmx); + if (alias < 0) { + FAIL("dup(ptmx) failed"); + } else { + close(ptmx); + struct winsize ws_alias = {.ws_row = 50, .ws_col = 100}; + if (ioctl(alias, TIOCSWINSZ, &ws_alias) != 0) { + FAIL("TIOCSWINSZ on alias after closing original"); + } else { + struct winsize ws_check = {0}; + int alias_ok = ioctl(alias, TIOCGWINSZ, &ws_check) == 0 && + ws_check.ws_row == 50 && ws_check.ws_col == 100; + EXPECT_TRUE(alias_ok, "alias winsize did not stick"); + } + ptmx = alias; /* the alias is the live master from here on */ + } + + /* Fork must propagate the master's keepalive across the IPC handoff so + * the child can do master-side tty ioctls without the macOS ENOTTY + * fallback. The parent's keepalive slave fd is independent (each side + * holds its own slot) so closing one side does not affect the other. + */ + TEST("child fork inherits master keepalive (TIOCSWINSZ works)"); + int sync_pipe[2]; + if (pipe(sync_pipe) != 0) { + FAIL("pipe(sync_pipe) failed"); + } else { + pid_t pid = fork(); + if (pid < 0) { + FAIL("fork failed"); + close(sync_pipe[0]); + close(sync_pipe[1]); + } else if (pid == 0) { + /* Child: do master-side TIOCSWINSZ and report rc via pipe. */ + close(sync_pipe[0]); + struct winsize ws_child = { + .ws_row = 30, + .ws_col = 90, + }; + int rc = ioctl(ptmx, TIOCSWINSZ, &ws_child); + char status = (rc == 0) ? 'Y' : 'N'; + (void) !write(sync_pipe[1], &status, 1); + close(sync_pipe[1]); + _exit(rc == 0 ? 0 : 1); + } else { + close(sync_pipe[1]); + char status = '?'; + ssize_t n = read(sync_pipe[0], &status, 1); + close(sync_pipe[0]); + int wstatus = 0; + waitpid(pid, &wstatus, 0); + int child_ok = (n == 1) && (status == 'Y') && WIFEXITED(wstatus) && + WEXITSTATUS(wstatus) == 0; + EXPECT_TRUE(child_ok, "child TIOCSWINSZ on master failed"); + /* Parent should still see the child's update because the slave + * keepalive in the parent is still alive. */ + struct winsize ws_parent = {0}; + int parent_ok = ioctl(ptmx, TIOCGWINSZ, &ws_parent) == 0 && + ws_parent.ws_row == 30 && ws_parent.ws_col == 90; + TEST("parent observes the child's master-side resize"); + EXPECT_TRUE(parent_ok, "parent winsize mismatch after child"); + } + } + + close(ptmx); + + /* Re-open and immediately close should not leak the keepalive slave. + * Without the proc_pty_close_keepalive call in sys_close's fast path, + * single-thread close goes through fd_close_regular_relaxed and + * bypasses fd_cleanup_entry, leaving the hidden slave fd open until + * elfuse exits. Loop enough times to expose any per-close leak. + */ + TEST("repeated open/close does not exhaust the keepalive table"); + int leak_loop_ok = 1; + for (int i = 0; i < 300; i++) { + int f = open("/dev/ptmx", O_RDWR | O_NOCTTY); + if (f < 0) { + leak_loop_ok = 0; + break; + } + close(f); + } + EXPECT_TRUE(leak_loop_ok, + "repeated /dev/ptmx open/close exhausted the keepalive table"); + + /* foot/sshd/openssh sftp-server pattern: the child closes its inherited + * master fd before opening the slave (the child has no use for the + * master). Earlier, this dropped the keepalive table entry that held the + * macOS slave_path mapping, and the subsequent open("/dev/pts/N") in the + * child failed with ENOENT even though the parent still held the master + * and the macOS slave node was openable. The retained-path semantics in + * proc_pty_close_keepalive must let the child still translate the path. + */ + TEST("child can open /dev/pts/N after closing its master"); + int spawn_master = open("/dev/ptmx", O_RDWR | O_NOCTTY); + if (spawn_master < 0) { + FAIL("open(/dev/ptmx) for spawn scenario"); + } else { + unsigned int spawn_ptyno = (unsigned int) -1; + int unlock_zero = 0; + if (ioctl(spawn_master, TIOCGPTN, &spawn_ptyno) != 0 || + ioctl(spawn_master, TIOCSPTLCK, &unlock_zero) != 0) { + FAIL("TIOCGPTN/TIOCSPTLCK on spawn master"); + close(spawn_master); + } else { + char spawn_pts[32]; + snprintf(spawn_pts, sizeof(spawn_pts), "/dev/pts/%u", spawn_ptyno); + int spawn_pipe[2]; + if (pipe(spawn_pipe) != 0) { + FAIL("pipe for spawn scenario"); + close(spawn_master); + } else { + pid_t spawn_pid = fork(); + if (spawn_pid < 0) { + FAIL("fork for spawn scenario"); + close(spawn_pipe[0]); + close(spawn_pipe[1]); + close(spawn_master); + } else if (spawn_pid == 0) { + /* Child: foot's slave_exec sequence -- close the master, + * then open(pts_name) for the controlling terminal. + */ + close(spawn_pipe[0]); + if (setsid() < 0) + _exit(11); + close(spawn_master); + int slave_fd = open(spawn_pts, O_RDWR); + char status = (slave_fd >= 0) ? 'Y' : 'N'; + (void) !write(spawn_pipe[1], &status, 1); + if (slave_fd >= 0) { + (void) !write(slave_fd, "ok\n", 3); + close(slave_fd); + } + close(spawn_pipe[1]); + _exit(slave_fd >= 0 ? 0 : 12); + } else { + close(spawn_pipe[1]); + char status = '?'; + ssize_t n = read(spawn_pipe[0], &status, 1); + close(spawn_pipe[0]); + int wstatus = 0; + waitpid(spawn_pid, &wstatus, 0); + int spawn_ok = (n == 1) && (status == 'Y') && + WIFEXITED(wstatus) && + WEXITSTATUS(wstatus) == 0; + EXPECT_TRUE(spawn_ok, + "child open(/dev/pts/N) after close(master)"); + if (spawn_ok) { + char drain[16] = {0}; + int flags = fcntl(spawn_master, F_GETFL); + if (flags >= 0 && fcntl(spawn_master, F_SETFL, + flags | O_NONBLOCK) == 0) { + (void) !read(spawn_master, drain, + sizeof(drain) - 1); + (void) fcntl(spawn_master, F_SETFL, flags); + } + } + close(spawn_master); + + TEST("stale /dev/pts/N expires after master teardown"); + int stale_fd = open(spawn_pts, O_RDWR); + /* Both ENOENT (devfs node gone) and ENXIO (devfs node + * lingers but the pty pair has been torn down) are valid + * macOS responses depending on kernel version. The + * invariant the test guards is "the stale cached path + * does not silently hand back an unrelated tty"; any + * open failure satisfies that. + */ + int stale_ok = + stale_fd < 0 && (errno == ENOENT || errno == ENXIO); + if (stale_fd >= 0) + close(stale_fd); + EXPECT_TRUE(stale_ok, "stale /dev/pts/N stayed openable"); + } + } + } + } + + /* /dev/pts/N for a never-allocated minor must surface ENOENT (the cached + * paths in the keepalive table cannot satisfy an arbitrary number). + */ + TEST("/dev/pts/ returns ENOENT"); + int unknown = open("/dev/pts/999999", O_RDWR); + int unknown_ok = unknown < 0 && errno == ENOENT; + if (unknown >= 0) + close(unknown); + EXPECT_TRUE(unknown_ok, "open(/dev/pts/999999) did not return ENOENT"); + + /* A pty master received via SCM_RIGHTS bypasses the /dev/ptmx open + * intercept, so the receiver process has no keepalive entry for it. + * proc_pty_master_adopt must lazily register one before master-side tty + * ioctls, even when TIOCSWINSZ runs before the first TIOCGPTN. Two + * checks live inside this block: TIOCSWINSZ-first and the post-adopt + * stat(/dev/pts/N). Each has its own TEST() label below. + */ + int sp[2]; + if (socketpair(AF_UNIX, SOCK_STREAM, 0, sp) != 0) { + TEST("socketpair for SCM_RIGHTS adoption setup"); + FAIL("socketpair failed"); + } else { + int donor = open("/dev/ptmx", O_RDWR | O_NOCTTY); + if (donor < 0) { + FAIL("donor open(/dev/ptmx) failed"); + close(sp[0]); + close(sp[1]); + } else { + char iobuf = 'x'; + struct iovec iov = {.iov_base = &iobuf, .iov_len = 1}; + char ctrl[CMSG_SPACE(sizeof(int))] = {0}; + struct msghdr msg = {.msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = ctrl, + .msg_controllen = sizeof(ctrl)}; + struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); + cm->cmsg_level = SOL_SOCKET; + cm->cmsg_type = SCM_RIGHTS; + cm->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cm), &donor, sizeof(int)); + ssize_t sent = sendmsg(sp[0], &msg, 0); + close(donor); + if (sent < 0) { + FAIL("sendmsg(SCM_RIGHTS) failed"); + } else { + char rbuf; + struct iovec riov = {.iov_base = &rbuf, .iov_len = 1}; + char rctrl[CMSG_SPACE(sizeof(int))] = {0}; + struct msghdr rmsg = {.msg_iov = &riov, + .msg_iovlen = 1, + .msg_control = rctrl, + .msg_controllen = sizeof(rctrl)}; + ssize_t got = recvmsg(sp[1], &rmsg, 0); + if (got < 0) { + FAIL("recvmsg(SCM_RIGHTS) failed"); + } else { + struct cmsghdr *rcm = CMSG_FIRSTHDR(&rmsg); + int recv_master = -1; + if (rcm && rcm->cmsg_level == SOL_SOCKET && + rcm->cmsg_type == SCM_RIGHTS) + memcpy(&recv_master, CMSG_DATA(rcm), sizeof(int)); + if (recv_master < 0) { + TEST("SCM_RIGHTS recv yields a master fd"); + FAIL("recv_master fd not received"); + } else { + TEST("TIOCSWINSZ first on SCM_RIGHTS-received master"); + struct winsize ws_recv = { + .ws_row = 33, + .ws_col = 101, + .ws_xpixel = 808, + .ws_ypixel = 528, + }; + EXPECT_TRUE( + ioctl(recv_master, TIOCSWINSZ, &ws_recv) == 0, + "TIOCSWINSZ before TIOCGPTN on " + "SCM_RIGHTS-received master"); + TEST("TIOCGPTN on SCM_RIGHTS-received master"); + unsigned int recv_ptyno = (unsigned int) -1; + int gpn_rc = ioctl(recv_master, TIOCGPTN, &recv_ptyno); + EXPECT_TRUE(gpn_rc == 0 && recv_ptyno < 100000u, + "TIOCGPTN on SCM_RIGHTS-received master"); + char recv_pts_path[32]; + snprintf(recv_pts_path, sizeof(recv_pts_path), + "/dev/pts/%u", recv_ptyno); + TEST("stat(/dev/pts/N) after SCM_RIGHTS adoption"); + struct stat recv_st; + EXPECT_TRUE(stat(recv_pts_path, &recv_st) == 0 && + S_ISCHR(recv_st.st_mode), + "stat after SCM_RIGHTS adoption failed"); + close(recv_master); + } + } + } + close(sp[0]); + close(sp[1]); + } + } + + SUMMARY("test-pty"); + return fails > 0 ? 1 : 0; +} diff --git a/tests/test-xattr.c b/tests/test-xattr.c new file mode 100644 index 0000000..9ee055e --- /dev/null +++ b/tests/test-xattr.c @@ -0,0 +1,177 @@ +/* lgetxattr / getxattr / setxattr semantics tests + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Pins three properties of the elfuse xattr surface that the host + * shim has to translate: + * + * 1. lgetxattr returns the stored value on a regular file. + * 2. lgetxattr on a symlink does not follow the link, so requesting + * an attr stored on the target reports ENODATA. getxattr on the + * same symlink follows and returns the target's value. + * 3. A missing attribute reports ENODATA, not EINVAL. macOS returns + * ENOATTR(93) or its synonym ENODATA(96); both must translate to + * Linux ENODATA(61). + * + * Regression: an earlier revision lacked the ENOATTR translation + * entry, so the default in linux_errno() fell through to EINVAL, + * which masked real "attribute not present" outcomes and broke + * fontconfig / glibc xattr probes. + * + * Syscalls exercised: setxattr(5), lsetxattr(6), getxattr(8), + * lgetxattr(9) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +int passes = 0, fails = 0; + +#define NR_setxattr 5 +#define NR_lsetxattr 6 +#define NR_getxattr 8 +#define NR_lgetxattr 9 + +static long do_setxattr(const char *path, + const char *name, + const void *val, + size_t sz, + int flags) +{ + return syscall(NR_setxattr, path, name, val, sz, flags); +} + +static long do_lsetxattr(const char *path, + const char *name, + const void *val, + size_t sz, + int flags) +{ + return syscall(NR_lsetxattr, path, name, val, sz, flags); +} + +static long do_getxattr(const char *path, + const char *name, + void *out, + size_t cap) +{ + return syscall(NR_getxattr, path, name, out, cap); +} + +static long do_lgetxattr(const char *path, + const char *name, + void *out, + size_t cap) +{ + return syscall(NR_lgetxattr, path, name, out, cap); +} + +static const char tmp_file[] = "/tmp/elfuse-xattr-target"; +static const char tmp_link[] = "/tmp/elfuse-xattr-link"; + +static void setup(void) +{ + unlink(tmp_link); + unlink(tmp_file); + int fd = open(tmp_file, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd < 0) + return; + (void) !write(fd, "hello\n", 6); + close(fd); + symlink(tmp_file, tmp_link); +} + +static void teardown(void) +{ + unlink(tmp_link); + unlink(tmp_file); +} + +static void test_lgetxattr_regular_file(void) +{ + TEST("lgetxattr on regular file returns value"); + const char *attr = "user.elfuse_probe"; + const char *val = "wired"; + if (do_setxattr(tmp_file, attr, val, strlen(val), 0) != 0) { + FAIL("setxattr seed failed"); + return; + } + char buf[64] = {0}; + long r = do_lgetxattr(tmp_file, attr, buf, sizeof(buf)); + EXPECT_TRUE(r == (long) strlen(val) && memcmp(buf, val, strlen(val)) == 0, + "lgetxattr value mismatch"); +} + +static void test_lgetxattr_symlink_no_follow(void) +{ + TEST("lgetxattr on symlink reports ENODATA, not EINVAL"); + char buf[64] = {0}; + errno = 0; + long r = do_lgetxattr(tmp_link, "user.elfuse_probe", buf, sizeof(buf)); + EXPECT_TRUE(r == -1 && errno == ENODATA, + "expected ENODATA from lgetxattr on bare symlink"); +} + +static void test_getxattr_symlink_follows(void) +{ + TEST("getxattr on symlink follows to target"); + char buf[64] = {0}; + long r = do_getxattr(tmp_link, "user.elfuse_probe", buf, sizeof(buf)); + const char *val = "wired"; + EXPECT_TRUE(r == (long) strlen(val) && memcmp(buf, val, strlen(val)) == 0, + "getxattr value mismatch"); +} + +static void test_lgetxattr_symlink_with_attr(void) +{ + TEST("lgetxattr returns symlink-owned attr after lsetxattr"); + const char *attr = "user.elfuse_probe"; + const char *lval = "link-val"; + if (do_lsetxattr(tmp_link, attr, lval, strlen(lval), 0) != 0) { + printf("SKIP (lsetxattr on symlink unsupported: errno=%d)\n", errno); + passes++; + return; + } + char buf[64] = {0}; + long r = do_lgetxattr(tmp_link, attr, buf, sizeof(buf)); + EXPECT_TRUE( + r == (long) strlen(lval) && memcmp(buf, lval, strlen(lval)) == 0, + "lgetxattr did not return symlink-owned value"); +} + +static void test_lgetxattr_missing(void) +{ + TEST("lgetxattr on missing attr reports ENODATA"); + char buf[64] = {0}; + errno = 0; + long r = do_lgetxattr(tmp_file, "user.no_such_attr_xyz", buf, sizeof(buf)); + EXPECT_TRUE(r == -1 && errno == ENODATA, + "expected ENODATA from lgetxattr on missing attr"); +} + +int main(void) +{ + printf("test-xattr: lgetxattr / getxattr / setxattr semantics\n"); + + setup(); + + test_lgetxattr_regular_file(); + test_lgetxattr_symlink_no_follow(); + test_getxattr_symlink_follows(); + test_lgetxattr_symlink_with_attr(); + test_lgetxattr_missing(); + + teardown(); + + SUMMARY("test-xattr"); + return fails == 0 ? 0 : 1; +}