diff --git a/src/core/guest.c b/src/core/guest.c
index fca75b3..b5afcbf 100644
--- a/src/core/guest.c
+++ b/src/core/guest.c
@@ -42,7 +42,10 @@
 #include "core/startup-trace.h"
 #include "debug/log.h"
 #include "utils.h"
+#include "runtime/futex.h"  /* futex_interrupt_request */
 #include "runtime/thread.h" /* thread_destroy_all_vcpus */
+#include "syscall/poll.h"   /* wakeup_pipe_signal */
+#include "syscall/proc.h"   /* proc_request_exit_group */
 
 /* Per-vCPU pending TLBI request. Zero-initialized in every host pthread
  * by virtue of TLS default-zeroing, which maps to TLBI_NONE.
@@ -575,9 +578,34 @@ static void release_extra_mappings(guest_t *g)
 
 void guest_destroy(guest_t *g)
 {
-    /* Destroy all worker vCPUs (thread table) before tearing down the VM.
-     * This prevents hv_vm_destroy from racing with active vCPUs that may still
-     * be running if thread join timed out during exit_group.
+    /* Quiesce worker vCPUs before unmapping stage-2. thread_destroy_all_vcpus
+     * only releases vCPU handles; it does not wait for the owning pthread to
+     * leave hv_vcpu_run. A worker still inside the guest at unmap time takes
+     * a stage-2 translation fault on its next instruction fetch and surfaces
+     * as "unexpected exception EC=0x20" in the crash report. PR #89's foot
+     * reproduction tripped exactly that race. The exit_group syscall handler
+     * already runs request, interrupt, and join before its own teardown; the
+     * destroy path needs the same prefix because forkipc.c:vcpu_run_loop
+     * returns straight into guest_destroy without going through the guest
+     * exit_group handler. The request is guarded on the prior state so a
+     * process that already chose its exit code keeps it intact.
+     *
+     * The wake signals cover workers blocked outside hv_vcpu_run: futex
+     * waiters poll futex_interrupt_requested, and any thread parked in
+     * epoll or poll wakes off the shared pipe. Without them, host-blocked
+     * workers miss the hv_vcpus_exit kick (which only affects threads
+     * inside hv_vcpu_run) and the 100ms join cap in thread_join_workers
+     * detaches them, leaving live pthreads to crash on the imminent munmap.
+     */
+    if (!proc_exit_group_requested())
+        proc_request_exit_group(0);
+    futex_interrupt_request();
+    wakeup_pipe_signal();
+    thread_interrupt_all();
+    thread_join_workers();
+    /* Destroy all remaining worker vCPUs (thread table) before tearing down
+     * the VM. This prevents hv_vm_destroy from racing with active vCPUs that
+     * may still be running if thread join timed out during exit_group.
      */
     thread_destroy_all_vcpus();
     if (g->vcpu) {
diff --git a/src/debug/crashreport.c b/src/debug/crashreport.c
index 2c4b789..a5d5ef5 100644
--- a/src/debug/crashreport.c
+++ b/src/debug/crashreport.c
@@ -19,6 +19,19 @@
 
 #include "syscall/proc.h"
 
+/* Page-table descriptor bit definitions used by the diagnostic walker below.
+ * The full set lives in src/core/guest.c next to the helpers that build
+ * descriptors; the crash walker needs only the fields it prints, so keep the
+ * duplication minimal rather than promoting a private header.
+ */
+#define CR_PT_VALID 1ULL
+#define CR_PT_TABLE 2ULL
+#define CR_PT_ADDR_MASK 0xFFFFFFFFF000ULL
+#define CR_L2_BLOCK_ADDR_MASK 0xFFFFFFE00000ULL
+#define CR_BLOCK_1GIB (1024ULL * 1024 * 1024)
+#define CR_BLOCK_2MIB (2ULL * 1024 * 1024)
+#define CR_PAGE_SIZE 4096ULL
+
 /* Read a sysctl string into buf (NUL-terminated). Returns 0 on success. */
 static int sysctl_str(const char *name, char *buf, size_t bufsz)
 {
@@ -111,6 +124,201 @@ static const char *esr_ec_name(uint64_t esr)
     }
 }
 
+static bool esr_is_data_abort(uint64_t esr)
+{
+    unsigned ec = (unsigned) ((esr >> 26) & 0x3f);
+    return ec == 0x24 || ec == 0x25;
+}
+
+/* Walk the guest stage-1 page tables for `va` and print L0/L1/L2/L3 entries
+ * in raw form so the crash report localises an "unmapped" claim either to the
+ * guest PT (entry is 0 or PT_VALID clear) or to a downstream stage-2 hole.
+ * The walker mirrors gva_translate_perm in src/core/guest.c but accepts any
+ * PT_VALID descriptor instead of enforcing requested permissions, so a
+ * non-executable or EL1-only page still prints with its actual contents.
+ * Silently skips when g, host_base, or ttbr0 are missing -- the data needed
+ * for the dump is gone before the walker can stage anything useful.
+ */
+static bool dump_pt_walk_for_va(const guest_t *g,
+                                uint64_t va,
+                                uint64_t *ipa_out)
+{
+    if (!g || !g->host_base || !g->ttbr0)
+        return false;
+
+    uint64_t base = g->ipa_base;
+    if (g->ttbr0 < base || g->ttbr0 - base >= g->guest_size) {
+        fprintf(stderr, "  PT walk: TTBR0 0x%llx out of slab range\n",
+                (unsigned long long) g->ttbr0);
+        return false;
+    }
+    uint64_t l0_off = g->ttbr0 - base;
+    const uint64_t *l0 =
+        (const uint64_t *) ((const uint8_t *) g->host_base + l0_off);
+    unsigned l0_idx = (unsigned) (va / (512ULL * CR_BLOCK_1GIB));
+    if (l0_idx >= 512) {
+        fprintf(stderr, "  PT walk: L0 index %u out of range\n", l0_idx);
+        return false;
+    }
+    uint64_t l0_entry = l0[l0_idx];
+    fprintf(stderr, "  L0[%u]=0x%llx", l0_idx, (unsigned long long) l0_entry);
+    if (!(l0_entry & CR_PT_VALID)) {
+        fprintf(stderr, " INVALID\n");
+        return false;
+    }
+
+    uint64_t l1_ipa = l0_entry & CR_PT_ADDR_MASK;
+    if (l1_ipa < base || l1_ipa - base >= g->guest_size) {
+        fprintf(stderr, " L1@0x%llx out-of-slab\n",
+                (unsigned long long) l1_ipa);
+        return false;
+    }
+    const uint64_t *l1 =
+        (const uint64_t *) ((const uint8_t *) g->host_base + (l1_ipa - base));
+    unsigned l1_idx = (unsigned) ((va / CR_BLOCK_1GIB) % 512);
+    uint64_t l1_entry = l1[l1_idx];
+    fprintf(stderr, " L1[%u]=0x%llx", l1_idx, (unsigned long long) l1_entry);
+    if (!(l1_entry & CR_PT_VALID)) {
+        fprintf(stderr, " INVALID\n");
+        return false;
+    }
+
+    uint64_t l2_ipa = l1_entry & CR_PT_ADDR_MASK;
+    if (l2_ipa < base || l2_ipa - base >= g->guest_size) {
+        fprintf(stderr, " L2@0x%llx out-of-slab\n",
+                (unsigned long long) l2_ipa);
+        return false;
+    }
+    const uint64_t *l2 =
+        (const uint64_t *) ((const uint8_t *) g->host_base + (l2_ipa - base));
+    unsigned l2_idx = (unsigned) ((va / CR_BLOCK_2MIB) % 512);
+    uint64_t l2_entry = l2[l2_idx];
+    fprintf(stderr, " L2[%u]=0x%llx", l2_idx, (unsigned long long) l2_entry);
+    if (!(l2_entry & CR_PT_VALID)) {
+        fprintf(stderr, " INVALID\n");
+        return false;
+    }
+
+    /* A valid L2 entry is either a 2MiB block (bit1=0) or a table descriptor
+     * pointing at a 4KiB L3 page. Only the table case requires another walk.
+     */
+    if (!(l2_entry & CR_PT_TABLE)) {
+        uint64_t block_ipa = l2_entry & CR_L2_BLOCK_ADDR_MASK;
+        uint64_t translated_ipa = block_ipa + (va & (CR_BLOCK_2MIB - 1));
+        fprintf(stderr, " (2MiB block -> IPA 0x%llx)\n",
+                (unsigned long long) translated_ipa);
+        if (ipa_out)
+            *ipa_out = translated_ipa;
+        return true;
+    }
+
+    uint64_t l3_ipa = l2_entry & CR_PT_ADDR_MASK;
+    if (l3_ipa < base || l3_ipa - base >= g->guest_size) {
+        fprintf(stderr, " L3@0x%llx out-of-slab\n",
+                (unsigned long long) l3_ipa);
+        return false;
+    }
+    const uint64_t *l3 =
+        (const uint64_t *) ((const uint8_t *) g->host_base + (l3_ipa - base));
+    unsigned l3_idx = (unsigned) ((va / CR_PAGE_SIZE) % 512);
+    uint64_t l3_entry = l3[l3_idx];
+    if (!(l3_entry & CR_PT_VALID)) {
+        fprintf(stderr, " L3[%u]=0x%llx INVALID\n", l3_idx,
+                (unsigned long long) l3_entry);
+        return false;
+    }
+
+    uint64_t page_ipa = l3_entry & CR_PT_ADDR_MASK;
+    uint64_t translated_ipa = page_ipa + (va & (CR_PAGE_SIZE - 1));
+    fprintf(stderr, " L3[%u]=0x%llx -> IPA 0x%llx\n", l3_idx,
+            (unsigned long long) l3_entry, (unsigned long long) translated_ipa);
+    if (ipa_out)
+        *ipa_out = translated_ipa;
+    return true;
+}
+
+/* Print the HVF stage-2 backing range covering `ipa` plus the region-tracker
+ * entry whose VA range includes the same address. A missing stage-2 backing is
+ * the downstream analogue of an INVALID L3 entry above; a missing region
+ * tracker is normal for non-tracked ranges (vDSO, shim) but useful for "is
+ * this VA in a known ELF segment" reasoning.
+ */
+static void dump_segment_and_region_for(const guest_t *g,
+                                        uint64_t va,
+                                        bool have_ipa,
+                                        uint64_t ipa)
+{
+    if (!g)
+        return;
+
+    if (have_ipa) {
+        bool found_seg = false;
+        for (int i = 0; i < g->n_segments; i++) {
+            const hvf_segment_t *s = &g->segments[i];
+            if (ipa >= s->ipa && ipa < s->ipa + s->len) {
+                fprintf(stderr,
+                        "  HVF segment[%d]: ipa=0x%llx len=0x%llx "
+                        "(covers translated IPA 0x%llx)\n",
+                        i, (unsigned long long) s->ipa,
+                        (unsigned long long) s->len, (unsigned long long) ipa);
+                found_seg = true;
+                break;
+            }
+        }
+        if (!found_seg) {
+            const guest_mapping_t *m = guest_find_mapping(g, ipa);
+            if (m) {
+                fprintf(stderr,
+                        "  HVF mapping: gpa=0x%llx size=0x%zx "
+                        "(covers translated IPA 0x%llx)\n",
+                        (unsigned long long) m->gpa, m->size,
+                        (unsigned long long) ipa);
+                found_seg = true;
+            }
+        }
+        if (!found_seg) {
+            const guest_overflow_t *o = guest_find_overflow(g, ipa);
+            if (o) {
+                fprintf(stderr,
+                        "  HVF overflow: ipa=0x%llx size=0x%llx "
+                        "(covers translated IPA 0x%llx)\n",
+                        (unsigned long long) o->ipa_start,
+                        (unsigned long long) o->size, (unsigned long long) ipa);
+                found_seg = true;
+            }
+        }
+        if (!found_seg)
+            fprintf(stderr,
+                    "  HVF backing: NONE (stage-2 hole for IPA 0x%llx)\n",
+                    (unsigned long long) ipa);
+    } else {
+        fprintf(stderr,
+                "  HVF backing: not checked (no valid stage-1 translation)\n");
+    }
+
+    const guest_region_t *r = guest_region_find(g, va);
+    if (r) {
+        fprintf(stderr, "  region: [0x%llx, 0x%llx) prot=%c%c%c name=\"%s\"\n",
+                (unsigned long long) r->start, (unsigned long long) r->end,
+                (r->prot & 1) ? 'r' : '-', (r->prot & 2) ? 'w' : '-',
+                (r->prot & 4) ? 'x' : '-', r->name[0] ? r->name : "(unnamed)");
+    } else {
+        fprintf(stderr, "  region: NONE\n");
+    }
+}
+
+static void dump_translation_diagnostics_for(const guest_t *g,
+                                             const char *label,
+                                             uint64_t va)
+{
+    fprintf(stderr, "## Translation diagnostics (%s=0x%llx)\n", label,
+            (unsigned long long) va);
+    uint64_t ipa = 0;
+    bool have_ipa = dump_pt_walk_for_va(g, va, &ipa);
+    dump_segment_and_region_for(g, va, have_ipa, ipa);
+    fprintf(stderr, "\n");
+}
+
 void crash_report(hv_vcpu_t vcpu,
                   const guest_t *g,
                   crash_type_t type,
@@ -247,7 +455,28 @@ void crash_report(hv_vcpu_t vcpu,
         fprintf(stderr, "interp_base = 0x%llx  mmap_limit = 0x%llx\n",
                 (unsigned long long) g->interp_base,
                 (unsigned long long) g->mmap_limit);
-        fprintf(stderr, "nregions    = %d\n\n", g->nregions);
+        fprintf(stderr, "nregions    = %d  hvf_segments = %d\n", g->nregions,
+                g->n_segments);
+        fprintf(stderr, "ttbr0       = 0x%llx\n\n",
+                (unsigned long long) g->ttbr0);
+
+        /* Translation diagnostics for the faulting PC. Helps decide whether
+         * an inst-abort or data-abort came from a stage-1 PT entry that went
+         * away (L3 INVALID after an unrelated mprotect / munmap) or from a
+         * stage-2 backing hole (for example, an hvf_segment_split that left no
+         * covering entry). Data aborts also dump FAR when it differs from PC
+         * because FAR is the actual load/store fault address.
+         */
+        if (vcpu) {
+            uint64_t pc = 0, esr = 0, far_reg = 0;
+            hv_vcpu_get_reg(vcpu, HV_REG_PC, &pc);
+            hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_ESR_EL1, &esr);
+            hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_FAR_EL1, &far_reg);
+            if (pc)
+                dump_translation_diagnostics_for(g, "PC", pc);
+            if (esr_is_data_abort(esr) && far_reg && far_reg != pc)
+                dump_translation_diagnostics_for(g, "FAR", far_reg);
+        }
     }
 
     fprintf(stderr,
diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c
index 1c31bc0..3b2847a 100644
--- a/src/runtime/fork-state.c
+++ b/src/runtime/fork-state.c
@@ -17,6 +17,7 @@
 #include "utils.h"
 
 #include "runtime/fork-state.h"
+#include "runtime/procemu.h"
 
 #include "debug/log.h"
 #include "syscall/abi.h"
@@ -447,6 +448,156 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g)
     return 0;
 }
 
+/* Wire payload for one pty keepalive. The slave fd travels separately via
+ * SCM_RIGHTS; the parent's master_host_fd is intentionally omitted because the
+ * child's number will differ. The child re-derives it from fd_table[gfd].
+ */
+typedef struct {
+    int32_t guest_fd;
+    uint32_t linux_pts_num;
+    char slave_path[64];
+} ipc_pty_keepalive_t;
+
+int fork_ipc_send_pty_keepalives(int ipc_sock)
+{
+    /* PTY_KEEPALIVE_MAX upper bound on entries; allocate to that. */
+    proc_pty_ipc_entry_t snapshot[256];
+    int snapshot_slave_fds[256];
+    int num_snap = proc_pty_snapshot_keepalive(snapshot, snapshot_slave_fds,
+                                               ARRAY_SIZE(snapshot));
+
+    /* Match each keepalive's master_host_fd against a live fd_table entry to
+     * recover the guest_fd, which is the stable identifier across fork.
+     */
+    ipc_pty_keepalive_t payload[256];
+    int payload_slave_fds[256];
+    uint32_t num_send = 0;
+
+    pthread_mutex_lock(&fd_lock);
+    for (int i = 0; i < num_snap; i++) {
+        int matched_gfd = -1;
+        for (int gfd = 0; gfd < FD_TABLE_SIZE; gfd++) {
+            if (fd_table[gfd].type == FD_CLOSED)
+                continue;
+            if (fd_table[gfd].host_fd != snapshot[i].master_host_fd)
+                continue;
+            if (fd_type_is_synthetic(fd_table[gfd].type))
+                continue;
+            matched_gfd = gfd;
+            break;
+        }
+
+        if (matched_gfd < 0) {
+            /* Master was closed between snapshot and lookup, or never tracked
+             * in the guest fd table (defensive). Drop the duped slave; nothing
+             * else holds it.
+             */
+            close(snapshot_slave_fds[i]);
+            continue;
+        }
+
+        payload[num_send].guest_fd = matched_gfd;
+        payload[num_send].linux_pts_num = snapshot[i].linux_pts_num;
+        _Static_assert(
+            sizeof(payload[0].slave_path) == sizeof(snapshot[0].slave_path),
+            "keepalive slave_path size must match payload");
+        memcpy(payload[num_send].slave_path, snapshot[i].slave_path,
+               sizeof(payload[0].slave_path));
+        payload_slave_fds[num_send] = snapshot_slave_fds[i];
+        num_send++;
+    }
+    pthread_mutex_unlock(&fd_lock);
+
+    int rc = 0;
+    if (fork_ipc_write_all(ipc_sock, &num_send, sizeof(num_send)) < 0) {
+        rc = -1;
+    } else if (num_send > 0) {
+        if (fork_ipc_write_all(ipc_sock, payload,
+                               num_send * sizeof(payload[0])) < 0) {
+            rc = -1;
+        } else if (fork_ipc_send_fds(ipc_sock, payload_slave_fds,
+                                     (int) num_send) < 0) {
+            log_error("clone: failed to send pty keepalive fds");
+            rc = -1;
+        }
+    }
+    for (uint32_t i = 0; i < num_send; i++)
+        close(payload_slave_fds[i]);
+    return rc;
+}
+
+int fork_ipc_recv_pty_keepalives(int ipc_fd)
+{
+    uint32_t num;
+    if (fork_ipc_read_all(ipc_fd, &num, sizeof(num)) < 0) {
+        log_error("fork-child: failed to read pty keepalive count");
+        return -1;
+    }
+    if (num == 0)
+        return 0;
+    if (num > FD_TABLE_SIZE) {
+        log_error("fork-child: pty keepalive count %u exceeds FD_TABLE_SIZE",
+                  num);
+        return -1;
+    }
+
+    ipc_pty_keepalive_t *payload = calloc(num, sizeof(*payload));
+    if (!payload)
+        return -1;
+
+    if (fork_ipc_read_all(ipc_fd, payload, num * sizeof(*payload)) < 0) {
+        free(payload);
+        return -1;
+    }
+
+    int *slave_fds = calloc(num, sizeof(int));
+    if (!slave_fds) {
+        free(payload);
+        return -1;
+    }
+    int got = 0;
+    if (fork_ipc_recv_fds(ipc_fd, slave_fds, (int) num, &got) < 0 ||
+        got != (int) num) {
+        log_error("fork-child: pty keepalive recv mismatch: got %d expected %u",
+                  got, num);
+        for (int i = 0; i < got; i++)
+            close(slave_fds[i]);
+        free(slave_fds);
+        free(payload);
+        return -1;
+    }
+
+    for (uint32_t i = 0; i < num; i++) {
+        int gfd = payload[i].guest_fd;
+        int child_master = -1;
+        if (RANGE_CHECK(gfd, 0, FD_TABLE_SIZE)) {
+            pthread_mutex_lock(&fd_lock);
+            if (fd_table[gfd].type != FD_CLOSED)
+                child_master = fd_table[gfd].host_fd;
+            pthread_mutex_unlock(&fd_lock);
+        }
+        if (child_master < 0) {
+            /* Master fd did not survive the fd_table batch (synthetic-type
+             * filter, or the slot was rejected). Drop the keepalive cleanly.
+             */
+            close(slave_fds[i]);
+            continue;
+        }
+
+        /* Force-NUL the path before passing it on so a malformed sender cannot
+         * trick the child into reading past the buffer.
+         */
+        payload[i].slave_path[sizeof(payload[i].slave_path) - 1] = '\0';
+        proc_pty_restore_keepalive(child_master, slave_fds[i],
+                                   payload[i].linux_pts_num,
+                                   payload[i].slave_path);
+    }
+
+    free(slave_fds);
+    free(payload);
+    return 0;
+}
+
 static int fork_ipc_send_backing_fds(int ipc_sock,
                                      const guest_region_t *regions_snapshot,
                                      uint32_t num_guest_regions)
@@ -714,12 +865,14 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig)
         log_error("fork-child: failed to read region count");
         return -1;
     }
+
     uint8_t regions_tracker_stale = 0;
     if (fork_ipc_read_all(ipc_fd, &regions_tracker_stale,
                           sizeof(regions_tracker_stale)) < 0) {
         log_error("fork-child: failed to read region tracker state");
         return -1;
     }
+
     uint32_t recv_regions = num_guest_regions;
     if (recv_regions > GUEST_MAX_REGIONS)
         recv_regions = GUEST_MAX_REGIONS;
@@ -729,15 +882,17 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig)
         log_error("fork-child: failed to read regions");
         return -1;
     }
+
     /* Drain any excess records the parent serialized beyond the local cap.
      * Without this drain, the next read (num_preannounced) consumes stale
-     * region bytes and desynchronizes the rest of the IPC payload. Mirrors
-     * the preannounced-region drain below.
+     * region bytes and desynchronizes the rest of the IPC payload. Mirrors the
+     * preannounced-region drain below.
      */
     if (num_guest_regions > recv_regions &&
         fork_ipc_drain_bytes(ipc_fd, (num_guest_regions - recv_regions) *
                                          sizeof(guest_region_t)) < 0)
         return -1;
+
     g->nregions = (int) recv_regions;
     g->regions_tracker_stale =
         (regions_tracker_stale != 0) || (num_guest_regions > recv_regions);
diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h
index e9de0bc..27c2080 100644
--- a/src/runtime/fork-state.h
+++ b/src/runtime/fork-state.h
@@ -99,6 +99,17 @@ int fork_ipc_recv_memory_regions(int ipc_fd, guest_t *g);
 int fork_ipc_send_fd_table(int ipc_sock);
 int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g);
 
+/* Carry the /dev/ptmx keepalive slave fds across the fork boundary. The fd
+ * table batch sends master fds without their hidden keepalive companions, so
+ * a child that inherits a master would otherwise hit the macOS ENOTTY /
+ * winsize-reset cliff that proc_pty_close_keepalive papers over in the
+ * parent. send/recv must run AFTER fork_ipc_send_fd_table / _recv_fd_table
+ * so the child can look up its new master host fd from the just-installed
+ * fd_table entry.
+ */
+int fork_ipc_send_pty_keepalives(int ipc_sock);
+int fork_ipc_recv_pty_keepalives(int ipc_fd);
+
 int fork_ipc_send_process_state(int ipc_sock,
                                 const guest_region_t *regions_snapshot,
                                 uint32_t num_guest_regions,
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index aef1d24..7b89dcc 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -261,6 +261,15 @@ int fork_child_main(int ipc_fd,
         return 1;
     }
 
+    /* Must follow fork_ipc_recv_fd_table: the keepalive recv resolves each
+     * payload guest_fd to its (now installed) child-side host master fd.
+     */
+    if (fork_ipc_recv_pty_keepalives(ipc_fd) < 0) {
+        log_error("fork-child: failed to receive pty keepalives");
+        guest_destroy(&g);
+        return 1;
+    }
+
     signal_state_t sig;
     if (fork_ipc_recv_process_state(ipc_fd, &g, &sig) < 0) {
         log_error("fork-child: failed to receive process state");
@@ -1598,6 +1607,15 @@ int64_t sys_clone(hv_vcpu_t vcpu,
         goto fail_snapshot;
     }
 
+    /* Must follow fork_ipc_send_fd_table because the keepalive payload
+     * carries a guest_fd that the child resolves through its just-installed
+     * fd_table to recover the child-side master host fd.
+     */
+    if (fork_ipc_send_pty_keepalives(ipc_sock) < 0) {
+        log_error("clone: failed to send pty keepalives");
+        goto fail_snapshot;
+    }
+
     uint32_t num_guest_regions = (uint32_t) nregions_snapshot;
     uint32_t num_preannounced = (uint32_t) npreannounced_snapshot;
     if (fork_ipc_send_process_state(
diff --git a/src/runtime/futex.c b/src/runtime/futex.c
index 683d95c..4351cb2 100644
--- a/src/runtime/futex.c
+++ b/src/runtime/futex.c
@@ -34,6 +34,7 @@
 
 #include "syscall/abi.h"
 #include "syscall/proc.h"
+#include "syscall/signal.h"
 
 #include "debug/log.h"
 
@@ -103,10 +104,14 @@ static _Atomic int futex_interrupt_requested = 0;
  *
  * The wait quantum is capped at 100 ms so proc_exit_group_requested() and
  * futex_interrupt_pending() get noticed promptly without a process-wide
- * broadcast channel. The 1-second EINTR simulation that the bucket path uses
- * for shutdown-stalled multi-threaded runtimes is preserved here, but only
- * once more than one guest thread is active. Single-threaded guests should not
- * see synthetic EINTR churn on indefinite waits.
+ * broadcast channel. EINTR is only returned when an actual deliverable
+ * signal is queued for this thread (confirmed under sig_lock via
+ * signal_pending(), not the atomic hint, so that rt_sigprocmask masking the
+ * queued signal cannot leave a stale-true edge behind), or when a guest
+ * itimer expires under the poll loop's signal_check_timer poke. Earlier
+ * revisions returned -EINTR after one unconditional second of waiting to
+ * unblock shutdown-stalled multi-threaded runtimes, but that broke POSIX
+ * sem_wait callers that do not retry on EINTR (e.g. foot's render worker).
  */
 #if ELFUSE_HAVE_OS_SYNC_WAIT_ON_ADDRESS
 static bool os_sync_available;
@@ -114,12 +119,6 @@ static bool os_sync_wait_enabled;
 #endif
 
 #define FUTEX_OS_SYNC_POLL_CAP_NS (100ULL * 1000 * 1000)
-#define FUTEX_OS_SYNC_EINTR_SIM_MS 1000
-
-static inline bool futex_should_simulate_periodic_eintr(void)
-{
-    return !thread_is_single_active();
-}
 
 /* Hash table */
 
@@ -222,6 +221,22 @@ int futex_interrupt_pending(void)
     return atomic_load(&futex_interrupt_requested);
 }
 
+/* Test-and-clear: returns 1 if the interrupt request was pending and atomically
+ * clears it, 0 otherwise. The interrupt is a one-shot edge: forkipc.c sets it
+ * when the last clone-thread exits so the main thread observes EINTR in its
+ * next blocking wait, mirroring how real Linux delivers SIGCHLD. Without the
+ * clear, the flag stays set and every subsequent epoll_pwait, ppoll, futex
+ * wait, etc. spins on EINTR until execve clears it -- in foot's case it never
+ * does, and the spinning main thread eventually faults in a code path the
+ * guest never expects to reach.
+ */
+int futex_interrupt_consume(void)
+{
+    int expected = 1;
+    return atomic_compare_exchange_strong(&futex_interrupt_requested, &expected,
+                                          0);
+}
+
 /* Cap on guest-supplied tv_sec. The cap exists purely so the int64_t / time_t
  * arithmetic in the deadline conversion (now.tv_sec + delta_sec, where
  * delta_sec = lts.tv_sec - mono.tv_sec) cannot overflow even for adversarial
@@ -392,19 +407,13 @@ static int64_t futex_os_sync_wait(guest_t *g,
     if (current != expected)
         return -LINUX_EAGAIN;
 
-    struct timeval wait_start;
-    bool simulate_periodic_eintr =
-        !has_timeout && futex_should_simulate_periodic_eintr();
-    if (simulate_periodic_eintr)
-        gettimeofday(&wait_start, NULL);
-
     /* Bound consecutive EFAULT retries. Apple documents EFAULT as transient
      * (kernel copyin failure under memory pressure), so a few retries are fine;
      * but a genuinely bad page would otherwise cause the loop to spin with no
      * real sleep -- timeout_ns is supplied to syscall that returns immediately
-     * -- until either the user deadline or the 1-second EINTR simulation
-     *  finally bails out. Surface EFAULT to the guest after this many
-     *  back-to-back failures so the host CPU does not burn for ~1 s.
+     * -- until the user deadline finally bails out. Surface EFAULT to the
+     * guest after this many back-to-back failures so the host CPU does not
+     * burn for ~1 s.
      */
     int efault_retries = 0;
 
@@ -436,17 +445,24 @@ static int64_t futex_os_sync_wait(guest_t *g,
             efault_retries = 0;
         }
 
-        if (proc_exit_group_requested() || futex_interrupt_pending())
+        if (proc_exit_group_requested() || futex_interrupt_consume())
             return -LINUX_EINTR;
 
-        if (simulate_periodic_eintr) {
-            struct timeval now;
-            gettimeofday(&now, NULL);
-            long elapsed_ms = (now.tv_sec - wait_start.tv_sec) * 1000 +
-                              (now.tv_usec - wait_start.tv_usec) / 1000;
-            if (elapsed_ms >= FUTEX_OS_SYNC_EINTR_SIM_MS)
-                return -LINUX_EINTR;
-        }
+        /* Drain any expired guest itimer so its SIGALRM / SIGVTALRM / SIGPROF
+         * queues into sig_state.pending; without this poke, a guest with all
+         * threads parked in futex_wait would never advance the timers.
+         */
+        signal_check_timer();
+
+        /* Return EINTR only when a real deliverable signal is queued for
+         * this thread. POSIX callers (e.g. glibc sem_wait, foot's render
+         * worker) often do not retry on EINTR, so synthetic spurious
+         * wakeups cannot be issued here. signal_pending() confirms under
+         * sig_lock so the atomic hint cannot produce a stale-true edge
+         * after rt_sigprocmask masked the queued signal.
+         */
+        if (signal_pending())
+            return -LINUX_EINTR;
         /* For has_timeout: futex_remaining_ns returns 0 next iteration once
          * the user deadline elapses, so the loop exits with -ETIMEDOUT.
          */
@@ -528,22 +544,6 @@ static int64_t futex_wait(guest_t *g,
     /* Wait until woken or timeout */
     int ret = 0;
 
-    /* Record start time for the no-timeout path. On real Linux, any pending
-     * signal interrupts futex_wait with -EINTR. Without a timer signal
-     * (SIGVTALRM from timer_create/setitimer), some multi-threaded runtimes
-     * can deadlock when a thread blocks in futex_wait and no wakeup arrives
-     * (e.g., a shutdown signal delivered to the wrong I/O manager).
-     * FUTEX_WAIT returns -EINTR after 1 second of blocking to simulate
-     * periodic signal delivery. All real futex callers (musl, glibc, and
-     * other managed runtimes) handle -EINTR correctly by re-checking their
-     * condition and retrying.
-     */
-    struct timeval wait_start;
-    bool simulate_periodic_eintr =
-        !has_timeout && futex_should_simulate_periodic_eintr();
-    if (simulate_periodic_eintr)
-        gettimeofday(&wait_start, NULL);
-
     while (!__atomic_load_n(&waiter.woken, __ATOMIC_ACQUIRE)) {
         if (has_timeout) {
             int rc = pthread_cond_timedwait(&waiter.cond, &b->lock, &deadline);
@@ -552,35 +552,45 @@ static int64_t futex_wait(guest_t *g,
                 ret = -LINUX_ETIMEDOUT;
                 break;
             }
-        } else {
-            /* No timeout specified: poll every 100ms to check for exit_group,
-             * futex_interrupt (simulated SIGCHLD), or excessive wait time
-             * (simulated signal interruption).
-             */
-            struct timespec poll_ts;
-            timespec_deadline_in_ms(&poll_ts, 100);
-            pthread_cond_timedwait(&waiter.cond, &b->lock, &poll_ts);
+            continue;
+        }
 
-            if (proc_exit_group_requested() || futex_interrupt_pending()) {
-                ret = -LINUX_EINTR;
-                break;
-            }
+        /* No timeout specified: poll every 100 ms to check for exit_group,
+         * futex_interrupt, expired guest itimers, and queued signals.
+         */
+        struct timespec poll_ts;
+        timespec_deadline_in_ms(&poll_ts, 100);
+        pthread_cond_timedwait(&waiter.cond, &b->lock, &poll_ts);
 
-            /* Simulate periodic signal delivery only for multi-threaded
-             * guests. Single-threaded glibc startup paths can legitimately
-             * park in FUTEX_WAIT forever until a real wake arrives, and
-             * synthetic EINTR here breaks that contract.
-             */
-            if (simulate_periodic_eintr) {
-                struct timeval now;
-                gettimeofday(&now, NULL);
-                long elapsed_ms = (now.tv_sec - wait_start.tv_sec) * 1000 +
-                                  (now.tv_usec - wait_start.tv_usec) / 1000;
-                if (elapsed_ms >= FUTEX_OS_SYNC_EINTR_SIM_MS) {
-                    ret = -LINUX_EINTR;
-                    break;
-                }
-            }
+        if (proc_exit_group_requested() || futex_interrupt_consume()) {
+            ret = -LINUX_EINTR;
+            break;
+        }
+
+        /* Lock-order: bucket lock(7) outranks sig_lock(4), so signal_pending()
+         * and signal_check_timer() may only be called once the bucket lock has
+         * been released. Drop it, poke the itimers, observe queued signals
+         * under sig_lock (the slow-path confirm avoids the stale-true edge
+         * that the atomic hint can carry after rt_sigprocmask masks the
+         * queued signal), then re-acquire and re-check waiter.woken in case
+         * a wake landed in the window.
+         */
+        pthread_mutex_unlock(&b->lock);
+        signal_check_timer();
+        bool sig_ready = signal_pending() != 0;
+        pthread_mutex_lock(&b->lock);
+
+        if (__atomic_load_n(&waiter.woken, __ATOMIC_ACQUIRE))
+            break;
+
+        /* Return EINTR only when a real deliverable signal is queued for
+         * this thread. POSIX callers (e.g. glibc sem_wait, foot's render
+         * worker) often do not retry on EINTR, so synthetic spurious
+         * wakeups cannot be issued here.
+         */
+        if (sig_ready) {
+            ret = -LINUX_EINTR;
+            break;
         }
     }
 
diff --git a/src/runtime/futex.h b/src/runtime/futex.h
index 79f60eb..186b6a8 100644
--- a/src/runtime/futex.h
+++ b/src/runtime/futex.h
@@ -27,6 +27,7 @@ void futex_init(void);
 void futex_interrupt_request(void);
 void futex_interrupt_clear(void);
 int futex_interrupt_pending(void);
+int futex_interrupt_consume(void);
 
 /* Main futex syscall entry point.
  * op:    futex operation (FUTEX_WAIT, FUTEX_WAKE, etc.)
diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c
index c1b2e31..297d102 100644
--- a/src/runtime/procemu.c
+++ b/src/runtime/procemu.c
@@ -19,6 +19,8 @@
  */
 #define MAPS_NAME_COLUMN 73
 
+#include <ctype.h>
+#include <stdbool.h>
 #include <stdarg.h>
 #include <stdatomic.h>
 #include <stdio.h>
@@ -40,6 +42,8 @@
 #include <netinet/in.h>
 #include <libproc.h>
 #include <mach/mach.h>
+#include <sys/ioctl.h>
+#include <termios.h>
 
 #include "utils.h"
 
@@ -1479,11 +1483,805 @@ static void proc_task_collect_cb(thread_entry_t *t, void *arg)
         c->tids[c->ntids++] = t->guest_tid;
 }
 
+/* Pseudoterminal master side-table.
+ *
+ * Bridges two host vs guest mismatches in one place:
+ *
+ * 1. The macOS /dev/ptmx master is not itself a tty. TIOCSWINSZ / TIOCGWINSZ
+ *    on the bare master return ENOTTY until something has opened the
+ *    corresponding slave once, and the stored winsize gets cleared whenever
+ *    the slave refcount drops to zero (verified empirically on macOS 15).
+ *    Linux ptmx masters are tty fds in their own right, so guests assume those
+ *    ioctls work without an open slave. To bridge the gap, every /dev/ptmx
+ *    open eagerly opens one slave host fd that elfuse holds for the lifetime
+ *    of the master and never exposes to the guest.
+ *
+ * 2. macOS slaves live at /dev/ttysNNN; Linux glibc looks for /dev/pts/N where
+ *    N comes from TIOCGPTN. Guest opens of /dev/pts/N route back to the
+ *    macOS path captured from ptsname(3) at /dev/ptmx open time, not a
+ *    re-formatted guess, so format changes in macOS (or unusual minor
+ *    encodings) cannot strand the guest with the wrong slave.
+ *
+ * Entries are keyed by the host master fd because that is what fd_cleanup_entry
+ * has when the guest closes a master. Capacity matches the macOS default UNIX98
+ * slave count; overflow leaves the entry empty and the guest gets the pre-fix
+ * degraded behavior for that one pair instead of an open failure.
+ *
+ * Fork-restored entries may outlive their master for one /dev/pts/N open. A
+ * foot / sshd / posix-compliant child closes the master fd after fork before
+ * opening the slave (the child has no use for the master); without retaining
+ * the path mapping past close, the subsequent /dev/pts/N open in the child
+ * loses its translation and fails with ENOENT even though the parent still
+ * holds the master and the macOS slave node is openable. Those stale entries
+ * keep the received slave fd until the first translated open attempt, then
+ * expire before the minor can be reused for an unrelated host tty. Ordinary
+ * local master closes clear the mapping immediately.
+ */
+#define PTY_KEEPALIVE_MAX 256
+#define PTY_KEEPALIVE_FREE (-1)
+/* PTY_SLAVE_PATH_MAX lives in procemu.h so this table and the fork-IPC
+ * payload (proc_pty_ipc_entry_t) cannot drift apart.
+ */
+static struct {
+    int master_host_fd;
+    int slave_host_fd;
+    uint32_t linux_pts_num;
+    bool stale_open_once;
+    char slave_path[PTY_SLAVE_PATH_MAX];
+} pty_keepalive_table[PTY_KEEPALIVE_MAX];
+static pthread_mutex_t pty_keepalive_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_once_t pty_keepalive_once = PTHREAD_ONCE_INIT;
+
+/* Sentinel-init. Other fields stay BSS-zero; without sentinels a host fd 0
+ * close would match slot 0 and close the wrong fd inside elfuse.
+ */
+static void pty_keepalive_init(void)
+{
+    for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) {
+        pty_keepalive_table[i].master_host_fd = PTY_KEEPALIVE_FREE;
+        pty_keepalive_table[i].slave_host_fd = PTY_KEEPALIVE_FREE;
+    }
+}
+
+static void pty_keepalive_lock_acquire(void)
+{
+    pthread_once(&pty_keepalive_once, pty_keepalive_init);
+    pthread_mutex_lock(&pty_keepalive_lock);
+}
+
+/* Find a slot by master_host_fd; -1 if none. Caller holds the lock. */
+static int pty_keepalive_find_master_locked(int master_host_fd)
+{
+    for (int i = 0; i < PTY_KEEPALIVE_MAX; i++)
+        if (pty_keepalive_table[i].master_host_fd == master_host_fd)
+            return i;
+    return -1;
+}
+
+static int pty_keepalive_clear_slot_locked(int slot)
+{
+    int slave = pty_keepalive_table[slot].slave_host_fd;
+    pty_keepalive_table[slot].master_host_fd = PTY_KEEPALIVE_FREE;
+    pty_keepalive_table[slot].slave_host_fd = PTY_KEEPALIVE_FREE;
+    pty_keepalive_table[slot].linux_pts_num = 0;
+    pty_keepalive_table[slot].stale_open_once = false;
+    pty_keepalive_table[slot].slave_path[0] = '\0';
+    return slave;
+}
+
+static uint32_t pty_extract_pts_num(const char *slave_path)
+{
+    /* macOS canonical slave paths are /dev/ttysNNN with a decimal tail. Read
+     * the longest decimal suffix and return it as the Linux pts number used
+     * by guest /dev/pts/N. Returns UINT32_MAX on parse failure so callers
+     * can reject ambiguous names rather than silently aliasing.
+     */
+    if (!slave_path)
+        return UINT32_MAX;
+    const char *p = slave_path + strlen(slave_path);
+    while (p > slave_path && isdigit((unsigned char) p[-1]))
+        p--;
+    if (!*p || !isdigit((unsigned char) *p))
+        return UINT32_MAX;
+    char *endp;
+    unsigned long n = strtoul(p, &endp, 10);
+    if (endp == p || *endp != '\0' || n > UINT32_MAX)
+        return UINT32_MAX;
+    return (uint32_t) n;
+}
+
+/* Result codes for the locked register helper. */
+#define PTY_REG_INSERTED 0 /* new entry installed */
+#define PTY_REG_EXISTS 1   /* a matching entry already existed */
+#define PTY_REG_FULL (-1)  /* table out of free slots */
+
+/* Caller-holds-lock variant. Returns one of PTY_REG_* and, on PTY_REG_EXISTS,
+ * writes the existing entry's pts number to *existing_pts_num. The lock-held
+ * variant exists so proc_pty_master_adopt can atomically pair fd-table slot
+ * validation with keepalive insertion under fd_lock + pty_keepalive_lock,
+ * eliminating the race window where a sibling close+recycle between validate
+ * and register would attach the keepalive to the wrong file.
+ */
+static int pty_keepalive_register_locked(int master_host_fd,
+                                         int slave_host_fd,
+                                         uint32_t linux_pts_num,
+                                         const char *slave_path,
+                                         bool stale_open_once,
+                                         uint32_t *existing_pts_num)
+{
+    int empty_slot = -1;
+    int stale_path_slot = -1;
+    for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) {
+        if (pty_keepalive_table[i].master_host_fd == master_host_fd) {
+            if (existing_pts_num)
+                *existing_pts_num = pty_keepalive_table[i].linux_pts_num;
+            return PTY_REG_EXISTS;
+        }
+        if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE)
+            continue;
+        /* Prefer a stale-path slot with the same pts number: the macOS minor
+         * deterministically maps to the same slave_path string, so reusing
+         * keeps lookups path-correct and bounds the table at one slot per
+         * live minor instead of accumulating a new entry on every reopen.
+         */
+        if (pty_keepalive_table[i].slave_path[0] != '\0' &&
+            pty_keepalive_table[i].linux_pts_num == linux_pts_num) {
+            stale_path_slot = i;
+        } else if (empty_slot < 0 &&
+                   pty_keepalive_table[i].slave_path[0] == '\0') {
+            empty_slot = i;
+        }
+    }
+    int slot = (stale_path_slot >= 0) ? stale_path_slot : empty_slot;
+    if (slot < 0) {
+        /* Out of empty slots and no stale-path match: evict the lowest-index
+         * stale-path entry so the live registration cannot starve. Live entries
+         * are never evicted. The eviction policy is approximately LRU: empty
+         * slots fill from low indices, so the lowest-index stale slot tends to
+         * be the oldest closed. A theoretical race exists with the
+         * close-before-open child pattern (a child stales slot K under
+         * pty_keepalive_lock and races into open("/dev/pts/N") just as another
+         * thread evicts slot K to register a different minor) but needs the
+         * keepalive table to be full -- live and stale entries both count --
+         * with the staling thread's slot being the lowest-index stale. Well
+         * outside the foot / sshd workload that motivated this code.
+         */
+        for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) {
+            if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE &&
+                pty_keepalive_table[i].slave_path[0] != '\0') {
+                slot = i;
+                break;
+            }
+        }
+        if (slot < 0)
+            return PTY_REG_FULL;
+    }
+    pty_keepalive_table[slot].master_host_fd = master_host_fd;
+    if (pty_keepalive_table[slot].slave_host_fd >= 0 &&
+        pty_keepalive_table[slot].slave_host_fd != slave_host_fd)
+        close(pty_keepalive_table[slot].slave_host_fd);
+    pty_keepalive_table[slot].slave_host_fd = slave_host_fd;
+    pty_keepalive_table[slot].linux_pts_num = linux_pts_num;
+    pty_keepalive_table[slot].stale_open_once = stale_open_once;
+    if (slave_path)
+        str_copy_trunc(pty_keepalive_table[slot].slave_path, slave_path,
+                       PTY_SLAVE_PATH_MAX);
+    else
+        pty_keepalive_table[slot].slave_path[0] = '\0';
+    return PTY_REG_INSERTED;
+}
+
+/* Lock-acquiring convenience wrapper used by the open-time and fork-restore
+ * paths where atomicity with fd_table is not required. Returns 0 on success
+ * (including PTY_REG_EXISTS, in which case the caller should close its own
+ * redundant slave_host_fd), -1 with errno set on table-full (ENOSPC).
+ */
+static int pty_keepalive_register(int master_host_fd,
+                                  int slave_host_fd,
+                                  uint32_t linux_pts_num,
+                                  const char *slave_path,
+                                  bool stale_open_once)
+{
+    pty_keepalive_lock_acquire();
+    int rc = pty_keepalive_register_locked(master_host_fd, slave_host_fd,
+                                           linux_pts_num, slave_path,
+                                           stale_open_once, NULL);
+    pthread_mutex_unlock(&pty_keepalive_lock);
+    if (rc == PTY_REG_FULL) {
+        errno = ENOSPC;
+        return -1;
+    }
+    if (rc == PTY_REG_EXISTS)
+        errno = EEXIST;
+    return 0;
+}
+
+uint32_t proc_pty_master_pts_num(int master_host_fd)
+{
+    if (master_host_fd < 0)
+        return UINT32_MAX;
+    pty_keepalive_lock_acquire();
+    int slot = pty_keepalive_find_master_locked(master_host_fd);
+    uint32_t pts_num =
+        (slot < 0) ? UINT32_MAX : pty_keepalive_table[slot].linux_pts_num;
+    pthread_mutex_unlock(&pty_keepalive_lock);
+    return pts_num;
+}
+
+/* Re-validate that fd_table[guest_fd] still refers to (host_fd, generation).
+ * Returns true when both match the snapshot, false otherwise (slot closed or
+ * recycled). Used by proc_pty_master_adopt to bracket every host-fd-number
+ * access against the closing-and-reuse race.
+ */
+static bool pty_fd_still_canonical(int guest_fd,
+                                   int canonical_host_fd,
+                                   uint64_t canonical_gen)
+{
+    fd_entry_t snap;
+    if (!fd_snapshot(guest_fd, &snap))
+        return false;
+    return snap.host_fd == canonical_host_fd &&
+           snap.generation == canonical_gen;
+}
+
+uint32_t proc_pty_master_adopt(int guest_fd)
+{
+    /* Step 1: atomically snapshot (host_fd, generation) and dup the
+     * canonical fd in a single fd_lock window. fd_snapshot_and_dup pins
+     * the file object behind the canonical host fd, so even if a sibling
+     * closes the guest fd and the host fd number is recycled by an
+     * unrelated open, host syscalls against the probe still operate on
+     * the right tty. The generation captured here is the witness for the
+     * subsequent table lookup and register validations.
+     */
+    fd_entry_t snap;
+    int probe = fd_snapshot_and_dup(guest_fd, &snap);
+    if (probe < 0)
+        return UINT32_MAX;
+    int canonical_host_fd = snap.host_fd;
+    uint64_t canonical_gen = snap.generation;
+
+    /* Fast path: a keepalive was already registered for this canonical fd
+     * (typical case for /dev/ptmx opens that went through pty_open_master).
+     * The keepalive table is keyed by host fd number, so re-validate the
+     * slot identity before trusting the returned pts_num. If the fd has
+     * been recycled to a different file (generation mismatch), the
+     * existing entry belongs to that file, not ours, and the slow path
+     * below must register a fresh entry for our pinned probe.
+     */
+    uint32_t existing = proc_pty_master_pts_num(canonical_host_fd);
+    if (existing != UINT32_MAX &&
+        pty_fd_still_canonical(guest_fd, canonical_host_fd, canonical_gen)) {
+        close(probe);
+        return existing;
+    }
+
+    /* Step 2: confirm the file really is a /dev/ptmx master. ptsname(3)
+     * returns NULL/ENOTTY on non-pty descriptors, so a stray TIOCGPTN
+     * against a regular file is rejected without any side effect.
+     */
+    char slave_path[PTY_SLAVE_PATH_MAX];
+    uint32_t pts_num = UINT32_MAX;
+    int slave = -1;
+    if (ptsname_r(probe, slave_path, sizeof(slave_path)) != 0)
+        goto out;
+    pts_num = pty_extract_pts_num(slave_path);
+    if (pts_num == UINT32_MAX)
+        goto out;
+
+    /* unlockpt(3) is harmless if the sender already unlocked. EINVAL means
+     * already unlocked; anything else means the slave will not open and
+     * we give up cleanly.
+     */
+    if (unlockpt(probe) < 0 && errno != EINVAL) {
+        pts_num = UINT32_MAX;
+        goto out;
+    }
+    slave = open(slave_path, O_RDWR | O_NOCTTY | O_CLOEXEC);
+    if (slave < 0) {
+        pts_num = UINT32_MAX;
+        goto out;
+    }
+
+    /* Step 3: re-validate AND publish under the joint pty_keepalive_lock +
+     * fd_lock window. Lock order is pty_keepalive_lock first;
+     * duplicate_guest_fd uses the same order when bracketing
+     * fd_snapshot_and_dup + proc_pty_dup_keepalive_locked, so the two paths
+     * cannot deadlock. With both held, no sibling can flip the fd_table slot
+     * between the validation read and the keepalive insert, so the keepalive
+     * cannot attach to a recycled canonical host fd.
+     */
+    pty_keepalive_lock_acquire();
+    pthread_mutex_lock(&fd_lock);
+    if (fd_table[guest_fd].type == FD_CLOSED ||
+        fd_table[guest_fd].host_fd != canonical_host_fd ||
+        fd_table[guest_fd].generation != canonical_gen) {
+        pthread_mutex_unlock(&fd_lock);
+        pthread_mutex_unlock(&pty_keepalive_lock);
+        close(slave);
+        pts_num = UINT32_MAX;
+        goto out;
+    }
+    uint32_t existing_pts = UINT32_MAX;
+    int rc = pty_keepalive_register_locked(canonical_host_fd, slave, pts_num,
+                                           slave_path, false, &existing_pts);
+    pthread_mutex_unlock(&fd_lock);
+    pthread_mutex_unlock(&pty_keepalive_lock);
+    if (rc == PTY_REG_FULL) {
+        close(slave);
+        pts_num = UINT32_MAX;
+    } else if (rc == PTY_REG_EXISTS) {
+        /* Another adopter registered first; their slave keeps the tty alive.
+         * The pts_num came from the locked scan above, so it is the value the
+         * winning entry holds and is not subject to a lookup-after-recycle
+         * race.
+         */
+        close(slave);
+        pts_num = existing_pts;
+    }
+
+out:
+    close(probe);
+    return pts_num;
+}
+
+/* Look up the captured macOS slave path for a Linux pts number. Returns 0 and
+ * writes the path on hit, -1 with errno=ENOENT on miss. Used by the /dev/pts/N
+ * open and stat intercepts so they hit the exact path returned by ptsname(3)
+ * rather than a guessed /dev/ttys%03lu reformat that breaks if macOS changes
+ * its naming scheme or uses an unexpected minor encoding.
+ */
+static int pty_lookup_slave_path(uint32_t linux_pts_num,
+                                 char *out,
+                                 size_t out_sz)
+{
+    if (!out || out_sz == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    int hit = -1;
+    pty_keepalive_lock_acquire();
+    /* Prefer a live entry (master still open in this process) over a stale
+     * path entry. Both encode the same slave_path for a given minor on macOS,
+     * so the preference only matters if a future change ever lets the two
+     * diverge - live wins by breaking out of the scan on first match.
+     */
+    for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) {
+        if (pty_keepalive_table[i].linux_pts_num != linux_pts_num)
+            continue;
+        if (pty_keepalive_table[i].slave_path[0] == '\0')
+            continue;
+        if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) {
+            hit = i;
+            break;
+        }
+        if (!pty_keepalive_table[i].stale_open_once ||
+            pty_keepalive_table[i].slave_host_fd < 0)
+            continue;
+        if (hit < 0)
+            hit = i;
+    }
+    if (hit < 0) {
+        pthread_mutex_unlock(&pty_keepalive_lock);
+        errno = ENOENT;
+        return -1;
+    }
+    size_t len = strlen(pty_keepalive_table[hit].slave_path);
+    if (len >= out_sz) {
+        pthread_mutex_unlock(&pty_keepalive_lock);
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    memcpy(out, pty_keepalive_table[hit].slave_path, len + 1);
+    pthread_mutex_unlock(&pty_keepalive_lock);
+    return 0;
+}
+
+static int pty_open_slave(uint32_t linux_pts_num, int linux_flags)
+{
+    int oflags = translate_open_flags(linux_flags) &
+                 (O_ACCMODE | O_NONBLOCK | O_CLOEXEC | O_NOCTTY);
+    char host_path[PTY_SLAVE_PATH_MAX];
+    int stale_hit = -1;
+    int retained_slaves[PTY_KEEPALIVE_MAX];
+    int nretained = 0;
+    int fd = -1;
+
+    pty_keepalive_lock_acquire();
+    for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) {
+        if (pty_keepalive_table[i].linux_pts_num != linux_pts_num)
+            continue;
+        if (pty_keepalive_table[i].slave_path[0] == '\0')
+            continue;
+        if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE) {
+            size_t len = strlen(pty_keepalive_table[i].slave_path);
+            if (len >= sizeof(host_path)) {
+                pthread_mutex_unlock(&pty_keepalive_lock);
+                errno = ENAMETOOLONG;
+                return -1;
+            }
+            memcpy(host_path, pty_keepalive_table[i].slave_path, len + 1);
+            pthread_mutex_unlock(&pty_keepalive_lock);
+            return open(host_path, oflags);
+        }
+        if (stale_hit < 0 && pty_keepalive_table[i].stale_open_once &&
+            pty_keepalive_table[i].slave_host_fd >= 0)
+            stale_hit = i;
+    }
+
+    if (stale_hit < 0) {
+        pthread_mutex_unlock(&pty_keepalive_lock);
+        errno = ENOENT;
+        return -1;
+    }
+
+    /* Stale fork-child entries are one-shot. The retained slave fd pins the
+     * macOS tty while we translate the close-before-open sequence, preventing
+     * the cached path from resolving to a reused unrelated minor. Regardless
+     * of open success, consume the stale mapping before returning.
+     */
+    size_t len = strlen(pty_keepalive_table[stale_hit].slave_path);
+    if (len >= sizeof(host_path)) {
+        int retained_slave = pty_keepalive_clear_slot_locked(stale_hit);
+        pthread_mutex_unlock(&pty_keepalive_lock);
+        if (retained_slave >= 0)
+            close(retained_slave);
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    memcpy(host_path, pty_keepalive_table[stale_hit].slave_path, len + 1);
+    fd = open(host_path, oflags);
+    int saved = errno;
+    for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) {
+        if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE)
+            continue;
+        if (!pty_keepalive_table[i].stale_open_once)
+            continue;
+        if (strncmp(pty_keepalive_table[i].slave_path, host_path,
+                    PTY_SLAVE_PATH_MAX) != 0)
+            continue;
+        int retained_slave = pty_keepalive_clear_slot_locked(i);
+        if (retained_slave >= 0 && nretained < PTY_KEEPALIVE_MAX)
+            retained_slaves[nretained++] = retained_slave;
+    }
+    pthread_mutex_unlock(&pty_keepalive_lock);
+    for (int i = 0; i < nretained; i++)
+        close(retained_slaves[i]);
+    errno = saved;
+    return fd;
+}
+
+static int pty_open_pts_dir(int linux_flags)
+{
+    char dir[80];
+    uint32_t pts_nums[PTY_KEEPALIVE_MAX];
+    int pts_count = 0;
+    int n = snprintf(dir, sizeof(dir), "/tmp/elfuse-pts-XXXXXX");
+    if (n < 0 || (size_t) n >= sizeof(dir)) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    if (!mkdtemp(dir))
+        return -1;
+
+    pty_keepalive_lock_acquire();
+    /* Enumerate live masters and fork-child one-shot stale entries. The stale
+     * entries retain a slave fd until the first open attempt consumes them, so
+     * they cannot name a reused unrelated tty while they appear in readdir.
+     */
+    for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) {
+        if (pty_keepalive_table[i].slave_path[0] == '\0')
+            continue;
+        if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE &&
+            (!pty_keepalive_table[i].stale_open_once ||
+             pty_keepalive_table[i].slave_host_fd < 0))
+            continue;
+        /* The recycle/reuse-by-pts_num invariant in
+         * pty_keepalive_register_locked keeps at most one entry per minor, so
+         * no de-duplication pass is needed here.
+         */
+        pts_nums[pts_count++] = pty_keepalive_table[i].linux_pts_num;
+    }
+    pthread_mutex_unlock(&pty_keepalive_lock);
+
+    for (int i = 0; i < pts_count; i++) {
+        char entry[160];
+        int en = snprintf(entry, sizeof(entry), "%s/%u", dir, pts_nums[i]);
+        if (en <= 0 || (size_t) en >= sizeof(entry))
+            continue;
+        int tfd = open(entry, O_CREAT | O_WRONLY, 0444);
+        if (tfd >= 0)
+            close(tfd);
+    }
+
+    pthread_once(&proc_scratch_atexit_once, proc_scratch_register_atexit);
+
+    pthread_mutex_lock(&proc_scratch_lock);
+    if (proc_scratch_dirs_count < PROC_SCRATCH_DIRS_MAX) {
+        str_copy_trunc(proc_scratch_dirs[proc_scratch_dirs_count++], dir,
+                       sizeof(proc_scratch_dirs[0]));
+    }
+    pthread_mutex_unlock(&proc_scratch_lock);
+
+    int fd = proc_open_dir_fd(dir, linux_flags);
+    if (fd < 0) {
+        int saved = errno;
+        proc_scratch_remove_one(dir);
+        errno = saved;
+    }
+    return fd;
+}
+
+void proc_pty_lock_for_dup(void)
+{
+    pty_keepalive_lock_acquire();
+}
+
+void proc_pty_unlock_for_dup(void)
+{
+    pthread_mutex_unlock(&pty_keepalive_lock);
+}
+
+void proc_pty_dup_keepalive_locked(int src_master_host_fd,
+                                   int dst_master_host_fd)
+{
+    /* Caller-holds-lock variant; see header for the dup race this guards. */
+    if (src_master_host_fd < 0 || dst_master_host_fd < 0)
+        return;
+
+    int slot = pty_keepalive_find_master_locked(src_master_host_fd);
+    if (slot < 0)
+        return;
+    int dst_slave = dup(pty_keepalive_table[slot].slave_host_fd);
+    if (dst_slave < 0)
+        return;
+    uint32_t src_pts_num = pty_keepalive_table[slot].linux_pts_num;
+    char src_slave_path[PTY_SLAVE_PATH_MAX];
+    memcpy(src_slave_path, pty_keepalive_table[slot].slave_path,
+           PTY_SLAVE_PATH_MAX);
+
+    /* dup(2) clears FD_CLOEXEC; the keepalive must not survive exec into
+     * a guest child that has no map back to it.
+     */
+    int fdflags = fcntl(dst_slave, F_GETFD);
+    if (fdflags < 0 || fcntl(dst_slave, F_SETFD, fdflags | FD_CLOEXEC) < 0) {
+        close(dst_slave);
+        return;
+    }
+    int rc =
+        pty_keepalive_register_locked(dst_master_host_fd, dst_slave,
+                                      src_pts_num, src_slave_path, false, NULL);
+    if (rc != PTY_REG_INSERTED) {
+        /* Table full or duplicate entry for dst_master_host_fd; drop the
+         * redundant slave. Duplicate is unexpected: dst is a freshly-duped
+         * host fd that should not already be in the table unless a prior
+         * close skipped proc_pty_close_keepalive.
+         */
+        close(dst_slave);
+    }
+}
+
+void proc_pty_close_keepalive(int master_host_fd)
+{
+    /* fd_cleanup_entry calls this for every guest fd close, not just pty
+     * masters; pty_keepalive_lock_acquire guarantees sentinel-init first.
+     */
+    if (master_host_fd < 0)
+        return;
+
+    int slave = -1;
+    pty_keepalive_lock_acquire();
+    int slot = pty_keepalive_find_master_locked(master_host_fd);
+    if (slot >= 0) {
+        if (pty_keepalive_table[slot].stale_open_once) {
+            /* Fork-restored child entry: retain the slave fd and path for one
+             * /dev/pts/N open after close(master). pty_open_slave consumes and
+             * closes it on the first translated open attempt.
+             */
+            pty_keepalive_table[slot].master_host_fd = PTY_KEEPALIVE_FREE;
+        } else {
+            slave = pty_keepalive_clear_slot_locked(slot);
+        }
+    }
+    pthread_mutex_unlock(&pty_keepalive_lock);
+    if (slave >= 0)
+        close(slave);
+}
+
+static void proc_pty_expire_stale_by_path(const char *slave_path)
+{
+    if (!slave_path || slave_path[0] == '\0')
+        return;
+
+    int stale_slaves[PTY_KEEPALIVE_MAX];
+    int nslaves = 0;
+    pty_keepalive_lock_acquire();
+    for (int i = 0; i < PTY_KEEPALIVE_MAX; i++) {
+        if (pty_keepalive_table[i].master_host_fd != PTY_KEEPALIVE_FREE)
+            continue;
+        if (!pty_keepalive_table[i].stale_open_once)
+            continue;
+        if (strncmp(pty_keepalive_table[i].slave_path, slave_path,
+                    PTY_SLAVE_PATH_MAX) != 0)
+            continue;
+        int slave = pty_keepalive_clear_slot_locked(i);
+        if (slave >= 0 && nslaves < PTY_KEEPALIVE_MAX)
+            stale_slaves[nslaves++] = slave;
+    }
+    pthread_mutex_unlock(&pty_keepalive_lock);
+    for (int i = 0; i < nslaves; i++)
+        close(stale_slaves[i]);
+}
+
+static int pty_keepalive_register_recycled(int master_host_fd,
+                                           int slave_host_fd,
+                                           uint32_t linux_pts_num,
+                                           const char *slave_path,
+                                           bool stale_open_once)
+{
+    proc_pty_expire_stale_by_path(slave_path);
+    return pty_keepalive_register(master_host_fd, slave_host_fd, linux_pts_num,
+                                  slave_path, stale_open_once);
+}
+
+int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries,
+                                int *out_slave_fds,
+                                int max_entries)
+{
+    if (!out_entries || !out_slave_fds || max_entries <= 0)
+        return 0;
+
+    int n = 0;
+    pty_keepalive_lock_acquire();
+    for (int i = 0; i < PTY_KEEPALIVE_MAX && n < max_entries; i++) {
+        if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE)
+            continue;
+
+        /* dup under the lock so the slave fd cannot be closed and the host fd
+         * number recycled before SCM_RIGHTS reads it. The caller closes the dup
+         * after the send completes.
+         */
+        int duped = dup(pty_keepalive_table[i].slave_host_fd);
+        if (duped < 0)
+            continue;
+
+        out_entries[n].master_host_fd = pty_keepalive_table[i].master_host_fd;
+        out_entries[n].linux_pts_num = pty_keepalive_table[i].linux_pts_num;
+        _Static_assert(sizeof(out_entries[n].slave_path) == PTY_SLAVE_PATH_MAX,
+                       "ipc slave_path size must match keepalive table");
+        memcpy(out_entries[n].slave_path, pty_keepalive_table[i].slave_path,
+               PTY_SLAVE_PATH_MAX);
+        out_slave_fds[n] = duped;
+        n++;
+    }
+    pthread_mutex_unlock(&pty_keepalive_lock);
+    return n;
+}
+
+void proc_pty_restore_keepalive(int master_host_fd,
+                                int slave_host_fd,
+                                uint32_t linux_pts_num,
+                                const char *slave_path)
+{
+    /* fork-IPC hand-off. SCM_RIGHTS drops FD_CLOEXEC; set it here so the
+     * keepalive does not survive exec. Any failure drops the slave fd.
+     */
+    if (master_host_fd < 0)
+        goto drop;
+
+    if (slave_host_fd >= 0) {
+        int fdflags = fcntl(slave_host_fd, F_GETFD);
+        if (fdflags < 0 ||
+            fcntl(slave_host_fd, F_SETFD, fdflags | FD_CLOEXEC) < 0)
+            goto drop;
+    }
+
+    /* Trust the parent's linux_pts_num verbatim instead of re-parsing
+     * slave_path. The wire-format string is bounded to PTY_SLAVE_PATH_MAX - 1
+     * bytes; if a future macOS canonical form ever exceeded that, the parent
+     * would have truncated and reparsing here would yield the wrong number. On
+     * EEXIST the child's fd_table-restore path replayed master_host_fd over a
+     * prior recv-keepalive entry; drop the redundant slave so it does not leak.
+     */
+    errno = 0;
+    if (pty_keepalive_register_recycled(master_host_fd, slave_host_fd,
+                                        linux_pts_num, slave_path, true) < 0 ||
+        errno == EEXIST)
+        goto drop;
+    return;
+
+drop:
+    if (slave_host_fd >= 0)
+        close(slave_host_fd);
+}
+
+/* Open /dev/ptmx, unlock the slave, and instantiate a keepalive slave fd so the
+ * master's tty ioctls work before the guest opens the slave itself.
+ * Returns the master host fd on success, -1 with errno set on failure.
+ */
+static int pty_open_master(int linux_flags)
+{
+    /* /dev/ptmx is a character device; O_CREAT / O_TRUNC / O_EXCL make no sense
+     * here. Strip them and only honor accmode + descriptor flags so the host
+     * open(2) never sees a variadic-mode-required combination without a mode
+     * arg.
+     */
+    int oflags = translate_open_flags(linux_flags) &
+                 (O_ACCMODE | O_NONBLOCK | O_CLOEXEC | O_NOCTTY);
+    int master = open("/dev/ptmx", oflags);
+    if (master < 0)
+        return -1;
+
+    /* grantpt(3) is a no-op on a unix98 pty mount, but call it for clarity
+     * and to match what posix_openpt(3)'s callers expect to have happened.
+     */
+    char slave_path[PTY_SLAVE_PATH_MAX];
+    if (grantpt(master) < 0 || unlockpt(master) < 0 ||
+        ptsname_r(master, slave_path, sizeof(slave_path)) != 0) {
+        close_keep_errno(master);
+        return -1;
+    }
+
+    /* Establish the (linux_pts_num, slave_path) mapping that /dev/pts/N opens
+     * and stats resolve through. If table or slave-fd registration fails after
+     * the master is open, report EMFILE rather than silently returning a master
+     * fd whose pts number cannot be opened back through /dev/pts/N. The caller
+     * can close other pty pairs and retry instead of dealing with a half-broken
+     * descriptor.
+     */
+    uint32_t linux_pts_num = pty_extract_pts_num(slave_path);
+    if (linux_pts_num == UINT32_MAX) {
+        close(master);
+        errno = ENOTTY;
+        return -1;
+    }
+    int slave = open(slave_path, O_RDWR | O_NOCTTY | O_CLOEXEC);
+    if (slave < 0) {
+        close_keep_errno(master);
+        return -1;
+    }
+    errno = 0;
+    if (pty_keepalive_register_recycled(master, slave, linux_pts_num,
+                                        slave_path, false) < 0) {
+        close(slave);
+        close(master);
+        errno = EMFILE;
+        return -1;
+    }
+    /* Defense-in-depth: the freshly-opened master fd should not already have
+     * a keepalive (would indicate a stale entry from a prior close that did
+     * not run proc_pty_close_keepalive). Drop the redundant slave so it does
+     * not leak.
+     */
+    if (errno == EEXIST)
+        close(slave);
+    return master;
+}
+
 int proc_intercept_open(const guest_t *g,
                         const char *path,
                         int linux_flags,
                         int mode)
 {
+    /* /dev/ptmx -> host /dev/ptmx + keepalive slave (see pty_open_master).
+     * O_PATH is path-only on Linux: it must not run the device open hook or
+     * allocate a pty pair. Use a harmless backing fd; FD_PATH gates I/O and
+     * ioctl, while proc_intercept_stat supplies the visible device metadata.
+     */
+    if (!strcmp(path, "/dev/ptmx")) {
+        if (linux_flags & LINUX_O_PATH) {
+            if (linux_flags & LINUX_O_DIRECTORY) {
+                errno = ENOTDIR;
+                return -1;
+            }
+            int oflags = O_RDONLY;
+            if (linux_flags & LINUX_O_CLOEXEC)
+                oflags |= O_CLOEXEC;
+            return open("/dev/null", oflags);
+        }
+        return pty_open_master(linux_flags);
+    }
+
     /* /dev/null, /dev/zero, /dev/(u)random, /dev/tty */
     const char *host_dev = NULL;
     int host_accmode = translate_open_flags(linux_flags) & O_ACCMODE;
@@ -1525,6 +2323,7 @@ int proc_intercept_open(const guest_t *g,
         const char *shm = shm_dir_path();
         return shm ? proc_open_dir_fd(shm, linux_flags) : -1;
     }
+
     if (!strncmp(path, "/dev/shm/", 9)) {
         char host_path[512];
         if (dev_shm_resolve_path(path + 9, host_path, sizeof(host_path)) < 0)
@@ -1548,6 +2347,40 @@ int proc_intercept_open(const guest_t *g,
     if (!strncmp(path, "/dev/fd/", 8))
         return dev_fd_dup(path, 8);
 
+    /* /dev/pts -> synthetic devpts directory. stat/access advertise this
+     * directory even on macOS hosts without /dev/pts, so open must be
+     * intercepted too or callers that probe then enumerate see inconsistent
+     * Linux-visible behavior.
+     */
+    if (!strcmp(path, "/dev/pts") || !strcmp(path, "/dev/pts/"))
+        return pty_open_pts_dir(linux_flags);
+
+    /* /dev/pts/N -> the macOS slave path captured at /dev/ptmx open time.
+     * Looking up the exact ptsname(3) string (rather than reformatting
+     * /dev/ttys%03lu) keeps the guest correct against any future macOS format
+     * change and against tty minor encodings that do not round-trip through
+     * plain zero-padding. ENOENT until the owning master is opened matches
+     * Linux devpts behavior for an unallocated slave number.
+     */
+    if (!strncmp(path, "/dev/pts/", 9)) {
+        const char *digits = path + 9;
+        if (!*digits) {
+            errno = ENOENT;
+            return -1;
+        }
+        char *endp;
+        unsigned long n = strtoul(digits, &endp, 10);
+        if (endp == digits || *endp != '\0' || n > UINT32_MAX) {
+            errno = ENOENT;
+            return -1;
+        }
+        /* /dev/pts/N is a character device; strip O_CREAT and friends so
+         * the two-argument open(2) never sees a creation-mode-required
+         * combination without a mode arg.
+         */
+        return pty_open_slave((uint32_t) n, linux_flags);
+    }
+
     /* /proc -> synthetic directory with PID entries for busybox ps, top, etc.
      * Creates a temp dir once (cached for the process lifetime) with entries
      * matching the current single-process model: the current PID directory +
@@ -1573,9 +2406,8 @@ int proc_intercept_open(const guest_t *g,
      * Each open gets its own scratch dir so concurrent enumerations cannot
      * mutate one another (see proc_open_fd_scratch).
      */
-    if (!strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/")) {
+    if (!strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/"))
         return proc_open_fd_scratch("elfuse-fd", linux_flags);
-    }
 
     if (!strcmp(path, "/proc/net") || !strcmp(path, "/proc/net/")) {
         const char *dir = ensure_proc_tmpdir(g);
@@ -1590,9 +2422,9 @@ int proc_intercept_open(const guest_t *g,
         return proc_open_dir_fd(netdir, linux_flags);
     }
 
-    /* /proc/<our_pid>[/...] -> /proc/self[...]. Returns -1 on
-     * ENAMETOOLONG so the guest sees the same error a real Linux kernel
-     * would produce instead of falling through to a host syscall.
+    /* /proc/<our_pid>[/...] -> /proc/self[...].
+     * Returns -1 on ENAMETOOLONG so the guest sees the same error a real Linux
+     * kernel would produce instead of falling through to a host syscall.
      */
     {
         char alias[LINUX_PATH_MAX];
@@ -1621,9 +2453,9 @@ int proc_intercept_open(const guest_t *g,
      * return an actual file descriptor to the binary.
      * Under rosetta, the binfmt_misc convention treats rosetta as the
      * interpreter visible to the guest: rosetta opens /proc/self/fd/X
-     * via /proc/self/exe to identify itself and then issues the VZ
-     * ioctls on that descriptor. Return ROSETTA_PATH so the VZ ioctl
-     * gate (rosetta_ioctl_target_fd) recognises the fd.
+     * via /proc/self/exe to identify itself and then issues the VZ ioctls on
+     * that descriptor. Return ROSETTA_PATH so the VZ ioctl gate
+     * (rosetta_ioctl_target_fd) recognises the fd.
      */
     if (!strcmp(path, "/proc/self/exe")) {
         if (g && g->is_rosetta)
@@ -1637,8 +2469,8 @@ int proc_intercept_open(const guest_t *g,
     }
 
     /* /proc/cpuinfo -> synthetic file with CPU count.
-     * Buffer sized dynamically from ncpu (~200 bytes/entry) to avoid
-     * silent truncation on hosts with >16 CPUs.
+     * Buffer sized dynamically from ncpu (~200 bytes/entry) to avoid silent
+     * truncation on hosts with >16 CPUs.
      */
     if (!strcmp(path, "/proc/cpuinfo")) {
         int ncpu = (int) sysconf(_SC_NPROCESSORS_ONLN);
@@ -1746,8 +2578,8 @@ int proc_intercept_open(const guest_t *g,
     }
 
     /* /proc/self/task -> directory with per-thread TID entries.
-     * Debuggers and runtimes (GDB, LLDB, JVM, Go runtime) probe this at
-     * startup to discover thread count and per-thread state.
+     * Debuggers and runtimes (GDB, LLDB, JVM, Go runtime) probe this at startup
+     * to discover thread count and per-thread state.
      *
      * Rebuilds a temp directory on each open (thread set is dynamic).
      * Cannot rmdir before returning the fd because macOS getdents on unlinked
@@ -1815,8 +2647,8 @@ int proc_intercept_open(const guest_t *g,
         }
 
         /* /proc/self/task/<tid> directory itself: synthesize a dir with
-         * stat/status placeholder entries. Persistent so getdents sees
-         * the entries on macOS (which cannot enumerate unlinked dirs).
+         * stat/status placeholder entries. Persistent so getdents sees the
+         * entries on macOS (which cannot enumerate unlinked dirs).
          */
         if (*endp == '\0' || !strcmp(endp, "/")) {
             static proc_persistent_dir_t tiddir =
@@ -1881,9 +2713,9 @@ int proc_intercept_open(const guest_t *g,
 
         /* Add preannounced entries only while they still have an uncovered
          * tail. Once the union of live regions covers the full advertised
-         * interval, suppress the shadow entry so /proc/self/maps shows only
-         * the realized split VMAs. A partial union must stay visible because
-         * some reserved-but-not-realized span remains to advertise.
+         * interval, suppress the shadow entry so /proc/self/maps shows only the
+         * realized split VMAs. A partial union must stay visible because some
+         * reserved-but-not-realized span remains to advertise.
          */
         for (int i = 0; i < g->npreannounced && nentries < MAPS_ENTRY_MAX;
              i++) {
@@ -2185,8 +3017,9 @@ int proc_intercept_open(const guest_t *g,
             uint64_t mask;
             /* fs/signalfd.c uses a tab after the colon (matching the
              * pos:/flags:/mnt_id: convention in fs/proc/fd.c, not the
-             * single-space style of eventfd/timerfd). Verified against a
-             * real Linux 6.x /proc/self/fdinfo dump. */
+             * single-space style of eventfd/timerfd). Verified against a real
+             * Linux 6.x /proc/self/fdinfo dump.
+             */
             if (signalfd_fdinfo_snapshot(n, &mask))
                 snprintf(extra, sizeof(extra), "sigmask:\t%016llx\n",
                          (unsigned long long) mask);
@@ -2196,11 +3029,12 @@ int proc_intercept_open(const guest_t *g,
             int64_t value_ns, interval_ns;
             if (timerfd_fdinfo_snapshot(n, &clockid, &ticks, &value_ns,
                                         &interval_ns)) {
-                /* Linux fs/timerfd.c emits these fields with single
-                 * spaces after the colon, not tabs (unlike pos:/flags:/
-                 * mnt_id: in fs/proc/fd.c, which do use tabs). Match the
-                 * upstream format so guest readers parsing fdinfo via a
-                 * "it_value: (" prefix find the field. */
+                /* Linux fs/timerfd.c emits these fields with single spaces
+                 * after the colon, not tabs (unlike pos:/flags:/mnt_id: in
+                 * fs/proc/fd.c, which do use tabs). Match the upstream format
+                 * so guest readers parsing fdinfo via a "it_value: (" prefix
+                 * find the field.
+                 */
                 snprintf(extra, sizeof(extra),
                          "clockid: %d\n"
                          "ticks: %llu\n"
@@ -2227,9 +3061,9 @@ int proc_intercept_open(const guest_t *g,
     }
 
     /* /proc/self/fdinfo -> directory listing. Each open gets its own scratch
-     * dir so concurrent getdents on independent dirfds cannot interfere
-     * (the previous shared-dir design unlinked entries under a sibling
-     * enumerator). The dirs are tracked for atexit cleanup.
+     * dir so concurrent getdents on independent dirfds cannot interfere (the
+     * previous shared-dir design unlinked entries under a sibling enumerator).
+     * The dirs are tracked for atexit cleanup.
      */
     if (!strcmp(path, "/proc/self/fdinfo") ||
         !strcmp(path, "/proc/self/fdinfo/")) {
@@ -2276,6 +3110,7 @@ int proc_intercept_open(const guest_t *g,
             buffers_kb = total_kb / 20;
             cached_kb = total_kb / 4;
         }
+
         return proc_emit_fmt(
             "MemTotal:       %llu kB\n"
             "MemFree:        %llu kB\n"
@@ -2313,9 +3148,9 @@ int proc_intercept_open(const guest_t *g,
     }
 
     /* /proc/self/io -> synthetic I/O counters.
-     * Some node-style observability runtimes read this for resource
-     * monitoring metrics. procfs emulation returns zeroed counters because
-     * it does not track per-guest I/O.
+     * Some node-style observability runtimes read this for resource monitoring
+     * metrics. procfs emulation returns zeroed counters because it does not
+     * track per-guest I/O.
      */
     if (!strcmp(path, "/proc/self/io")) {
         return proc_emit_literal(
@@ -2474,6 +3309,23 @@ int proc_intercept_stat(const char *path, struct stat *st)
     if (!strcmp(path, "/dev/fuse"))
         return fuse_proc_stat(st);
 
+    /* Linux /dev/ptmx is the Unix98 pty multiplexer character device (5:2).
+     * Keep this synthetic so O_PATH probes can fstat the path fd without
+     * forcing a real host /dev/ptmx open, which would allocate a pty.
+     */
+    if (!strcmp(path, "/dev/ptmx")) {
+        memset(st, 0, sizeof(*st));
+        st->st_mode = S_IFCHR | 0666;
+        st->st_nlink = 1;
+        st->st_dev = PROC_SYNTH_DEV;
+        st->st_ino = proc_synth_ino(path);
+        st->st_uid = proc_get_uid();
+        st->st_gid = proc_get_gid();
+        st->st_rdev = ((dev_t) 5u << 24) | (dev_t) 2u;
+        st->st_blksize = 1024;
+        return 0;
+    }
+
     /* /dev/shm is a directory */
     if (!strcmp(path, "/dev/shm") || !strcmp(path, "/dev/shm/")) {
         stat_fill_proc_dir(st, 01777, 2,
@@ -2488,6 +3340,63 @@ int proc_intercept_stat(const char *path, struct stat *st)
         return stat(host_path, st);
     }
 
+    /* /dev/pts directory and /dev/pts/N slave entries. glibc ptsname(3)
+     * stats /dev/pts/N after TIOCGPTN and rejects with ENOENT if absent.
+     * Synthesize a minimal char-device stat whose st_rdev decodes to Linux's
+     * standard pts major (136) so glibc's major(rdev) == UNIX98_PTY_SLAVE_MAJOR
+     * check passes. The numeric tail must round-trip with /dev/ttysN via the
+     * open intercept (see proc_intercept_open).
+     */
+    if (!strcmp(path, "/dev/pts") || !strcmp(path, "/dev/pts/")) {
+        stat_fill_proc_dir(st, 0755, 2, path);
+        return 0;
+    }
+    if (!strncmp(path, "/dev/pts/", 9)) {
+        const char *digits = path + 9;
+        if (!*digits) {
+            errno = ENOENT;
+            return -1;
+        }
+        char *endp;
+        unsigned long n = strtoul(digits, &endp, 10);
+        if (endp == digits || *endp != '\0' || n > UINT32_MAX) {
+            errno = ENOENT;
+            return -1;
+        }
+        /* Resolve through the captured-path table: ENOENT unless the
+         * corresponding master is currently open. This avoids the host
+         * stat false-positive where /dev/ttysNNN happens to exist for an
+         * unrelated tty allocated outside elfuse.
+         */
+        char host_path[PTY_SLAVE_PATH_MAX];
+        if (pty_lookup_slave_path((uint32_t) n, host_path, sizeof(host_path)) <
+            0)
+            return -1;
+        struct stat host_st;
+        if (stat(host_path, &host_st) < 0) {
+            errno = ENOENT;
+            return -1;
+        }
+        memset(st, 0, sizeof(*st));
+        st->st_mode = S_IFCHR | 0620;
+        st->st_nlink = 1;
+        st->st_uid = host_st.st_uid;
+        st->st_gid = host_st.st_gid;
+        /* macOS dev_t = (major << 24) | minor; the fs-stat translation layer
+         * (mac_to_linux_dev) re-encodes that into Linux's split major/minor
+         * layout, so storing 136 in the macOS-major slot makes glibc's
+         * major(rdev) yield UNIX98_PTY_SLAVE_MAJOR.
+         */
+        st->st_rdev = ((dev_t) 136u << 24) | (dev_t) (n & 0xFFFFFFu);
+        st->st_size = 0;
+        st->st_blksize = 1024;
+        st->st_blocks = 0;
+        st->st_atime = host_st.st_atime;
+        st->st_mtime = host_st.st_mtime;
+        st->st_ctime = host_st.st_ctime;
+        return 0;
+    }
+
     /* /proc and /proc/<our_pid> are directories */
     if (!strcmp(path, "/proc") || !strcmp(path, "/proc/")) {
         stat_fill_proc_dir(st, 0555, 3, path);
@@ -2645,10 +3554,11 @@ int proc_intercept_stat(const char *path, struct stat *st)
             struct stat host_st;
             if (lstat(host_path, &host_st) < 0)
                 return -1;
+
             /* Replace host inode/dev with the synthetic-procfs convention so
-             * the guest sees a stable identity that does not collide with
-             * real host files (and so st_size reads as 0 for cpumask files,
-             * matching real sysfs).
+             * the guest sees a stable identity that does not collide with real
+             * host files (and so st_size reads as 0 for cpumask files, matching
+             * real sysfs).
              */
             if (S_ISDIR(host_st.st_mode))
                 stat_fill_proc_dir(st, 0555, 2, path);
@@ -2699,9 +3609,9 @@ int proc_intercept_readlink(const char *path, char *buf, size_t bufsiz)
         char sysroot_snap[LINUX_PATH_MAX];
         if (proc_sysroot_snapshot(sysroot_snap, sizeof(sysroot_snap))) {
             /* proc_set_sysroot stores a realpath()-canonicalized form, so
-             * canonicalize exe before the prefix check or the strip fails
-             * when /var -> /private/var (and similar macOS symlinks) make
-             * the two strings diverge.
+             * canonicalize exe before the prefix check or the strip fails when
+             * /var -> /private/var (and similar macOS symlinks) make the two
+             * strings diverge.
              */
             const char *exe_cmp = exe;
             if (realpath(exe, exe_real))
@@ -2843,10 +3753,11 @@ int proc_intercept_write(int guest_fd,
         return 0;
     int kind = proc_oom_path_kind(snap.proc_path);
     if (kind == OOM_PATH_SCORE) {
-        /* Linux: oom_score has no write handler. proc_reg_write returns
-         * -EIO when the underlying proc_dir_entry exposes no write op,
-         * not -EINVAL. Match that so guests probing the error code see
-         * the same value as on a real kernel. */
+        /* Linux: oom_score has no write handler. proc_reg_write returns -EIO
+         * when the underlying proc_dir_entry exposes no write op, not -EINVAL.
+         * Match that so guests probing the error code see the same value as on
+         * a real kernel.
+         */
         errno = EIO;
         return -1;
     }
@@ -2854,8 +3765,8 @@ int proc_intercept_write(int guest_fd,
         return 0;
 
     /* Linux: zero-byte writes to proc nodes succeed without side effects.
-     * Without this short-circuit, sys_writev would funnel a zero-length
-     * vector through proc_parse_int_write and get -EINVAL.
+     * Without this short-circuit, sys_writev would funnel a zero-length vector
+     * through proc_parse_int_write and get -EINVAL.
      */
     if (count == 0) {
         *written_out = 0;
@@ -2902,6 +3813,7 @@ int proc_intercept_write(int guest_fd,
         goto unlock;
     if (!use_pwrite && lseek(host_fd, offset + (int64_t) count, SEEK_SET) < 0)
         goto unlock;
+
     atomic_store(&oom_score_adj_value, score_adj);
     proc_oom_refresh_live_fds_locked();
     *written_out = (ssize_t) count;
diff --git a/src/runtime/procemu.h b/src/runtime/procemu.h
index 58f4f30..78227ae 100644
--- a/src/runtime/procemu.h
+++ b/src/runtime/procemu.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <stddef.h>
+#include <stdint.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include "core/guest.h"
@@ -86,3 +87,80 @@ const char *proc_get_shm_dir(void);
 int proc_dev_shm_resolve(const char *guest_suffix,
                          char *host_path,
                          size_t host_path_sz);
+
+/* Drop the keepalive slave fd paired with a /dev/ptmx master host fd. Called
+ * from fd_cleanup_entry when the guest closes the master so the host kernel
+ * eventually tears the tty down. Idempotent and safe to call on any host fd
+ * (no-op when no keepalive is registered).
+ */
+void proc_pty_close_keepalive(int master_host_fd);
+
+/* Caller-locked variant of pty keepalive duplication. Brackets the host
+ * fd_snapshot_and_dup and the keepalive mirror under one pty_keepalive_lock
+ * window so a sibling close cannot run proc_pty_close_keepalive in the
+ * gap and leave the alias without a keepalive entry. Caller must wrap the
+ * sequence with proc_pty_lock_for_dup / proc_pty_unlock_for_dup.
+ */
+void proc_pty_lock_for_dup(void);
+void proc_pty_unlock_for_dup(void);
+void proc_pty_dup_keepalive_locked(int src_master_host_fd,
+                                   int dst_master_host_fd);
+
+/* Return the captured Linux pts number for a host master fd, or UINT32_MAX
+ * when no keepalive is registered. Lets sys_ioctl TIOCGPTN report the value
+ * /dev/pts/N opens / stats round-trip through, instead of independently
+ * parsing the macOS slave path and risking divergence with the open table.
+ */
+uint32_t proc_pty_master_pts_num(int master_host_fd);
+
+/* Best-effort lazy registration for a /dev/ptmx master that elfuse did not
+ * open itself (e.g. one received from a peer via SCM_RIGHTS). Takes a guest
+ * fd because the canonical host fd would race with sibling close+reuse if
+ * passed in directly: this helper snapshots fd_table[guest_fd].host_fd and
+ * its generation, performs the slave open against a private dup, and only
+ * registers the keepalive after re-verifying the slot still holds the
+ * original (host_fd, generation). Returns the pts number on success, or
+ * UINT32_MAX if the fd is not a pty master, the slot got closed/recycled
+ * mid-adoption, or the keepalive table is full. Idempotent: if the master
+ * already has a keepalive entry, returns its stored linux_pts_num.
+ */
+uint32_t proc_pty_master_adopt(int guest_fd);
+
+/* Max bytes for a captured macOS slave path (e.g. "/dev/ttys004"). Lives in
+ * the header so proc_pty_ipc_entry_t below and the in-memory keepalive table
+ * in procemu.c share one source of truth; a divergence would silently corrupt
+ * the fork-IPC wire format.
+ */
+#define PTY_SLAVE_PATH_MAX 64
+
+/* Serialized form of a pty keepalive entry, used by fork-IPC. The slave host
+ * fd travels separately via SCM_RIGHTS; this struct only carries the
+ * lifetime-independent metadata that the child needs to re-register.
+ */
+typedef struct {
+    int32_t master_host_fd;
+    uint32_t linux_pts_num;
+    char slave_path[PTY_SLAVE_PATH_MAX];
+} proc_pty_ipc_entry_t;
+
+/* Snapshot the current pty keepalive table into out_entries / out_slave_fds.
+ * dup()s every slave fd under the keepalive lock so the snapshot stays valid
+ * across the SCM_RIGHTS send even if the original master gets closed before
+ * the IPC drains. Returns the number of live entries written (always
+ * <= max_entries); on success the caller owns the duplicated slave fds and
+ * must close them after the IPC send completes.
+ */
+int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries,
+                                int *out_slave_fds,
+                                int max_entries);
+
+/* Re-register a single keepalive in the child after a fork-IPC. master_host_fd
+ * is the child-side host fd that just landed in fd_table (its number is
+ * different from the parent's). Takes ownership of slave_host_fd. The slave
+ * path is recorded so /dev/pts/N opens in the child resolve to the same macOS
+ * tty the parent has been talking to.
+ */
+void proc_pty_restore_keepalive(int master_host_fd,
+                                int slave_host_fd,
+                                uint32_t linux_pts_num,
+                                const char *slave_path);
diff --git a/src/syscall/abi.h b/src/syscall/abi.h
index ac58574..3aa50ed 100644
--- a/src/syscall/abi.h
+++ b/src/syscall/abi.h
@@ -333,6 +333,7 @@ typedef struct {
 #define LINUX_EMULTIHOP 72  /* Multihop attempted */
 #define LINUX_EILSEQ 84     /* Illegal byte sequence */
 #define LINUX_EHOSTDOWN 112 /* Host is down */
+#define LINUX_ENODATA 61    /* No data available (xattr missing, stream) */
 
 /* Linux FD flags. */
 #define LINUX_FD_CLOEXEC 1
@@ -351,6 +352,7 @@ typedef struct {
 #define LINUX_TIOCSPGRP 0x5410  /* -> macOS TIOCSPGRP (same semantics) */
 #define LINUX_TIOCSCTTY 0x540E  /* -> macOS TIOCSCTTY (same semantics) */
 #define LINUX_TIOCGWINSZ 0x5413 /* -> macOS TIOCGWINSZ (same struct) */
+#define LINUX_TIOCSWINSZ 0x5414 /* -> macOS TIOCSWINSZ (same struct) */
 #define LINUX_FIONREAD 0x541B   /* -> macOS FIONREAD (same semantics) */
 #define LINUX_FIONBIO 0x5421    /* set/clear O_NONBLOCK (arg: int *) */
 #define LINUX_FIONCLEX 0x5450   /* clear close-on-exec on fd */
@@ -362,6 +364,13 @@ typedef struct {
 #define LINUX_TCSETS2 0x402c542b  /* termios2 set (TCSANOW) */
 #define LINUX_TCSETSW2 0x402c542c /* termios2 set (TCSADRAIN) */
 #define LINUX_TCSETSF2 0x402c542d /* termios2 set (TCSAFLUSH) */
+/* Pseudoterminal multiplexer ioctls. The numeric encodings match Linux
+ * include/uapi/asm-generic/ioctls.h regardless of architecture. macOS exposes
+ * an equivalent /dev/ptmx and unlockpt(3); ptsname(3) returns /dev/ttysNNN.
+ */
+#define LINUX_TIOCGPTN 0x80045430   /* _IOR('T', 0x30, unsigned int) */
+#define LINUX_TIOCSPTLCK 0x40045431 /* _IOW('T', 0x31, int) */
+#define LINUX_TIOCGPTPEER 0x5441    /* _IO('T', 0x41); arg is open flags */
 
 /* Linux open flags. */
 #define LINUX_O_RDONLY 0x0000
@@ -390,6 +399,13 @@ typedef struct {
 #define LINUX_O_CLOEXEC 0x80000   /* 02000000 octal */
 #define LINUX_O_PATH 0x200000     /* 010000000 octal */
 
+/* Linux fallocate(2) mode bits (linux/falloc.h). PUNCH_HOLE requires the
+ * caller to also set KEEP_SIZE per the manpage; collapse/insert/zero/unshare
+ * range modes are recognised numerically but elsewhere unsupported.
+ */
+#define LINUX_FALLOC_FL_KEEP_SIZE 0x01
+#define LINUX_FALLOC_FL_PUNCH_HOLE 0x02
+
 /* Linux AT_* constants. */
 #define LINUX_AT_FDCWD (-100)
 #define LINUX_AT_SYMLINK_NOFOLLOW 0x100
diff --git a/src/syscall/fdtable.c b/src/syscall/fdtable.c
index a072cfb..13e95bd 100644
--- a/src/syscall/fdtable.c
+++ b/src/syscall/fdtable.c
@@ -21,6 +21,7 @@
 #include "utils.h"
 
 #include "core/shim-globals.h"
+#include "runtime/procemu.h"
 #include "syscall/abi.h"
 #include "syscall/internal.h"
 
@@ -477,6 +478,12 @@ void fd_cleanup_entry(int guest_fd, const fd_entry_t *snap)
     if (snap->cleanup)
         snap->cleanup(guest_fd);
 
+    /* Drop any /dev/ptmx keepalive slave fd paired with this host fd. Must
+     * happen before close(snap->host_fd) because the side table is keyed by
+     * the still-live host master fd. No-op for non-pty fds.
+     */
+    proc_pty_close_keepalive(snap->host_fd);
+
     /* Keep stdin/stdout/stderr open on the host */
     if (snap->type != FD_STDIO)
         close(snap->host_fd);
diff --git a/src/syscall/fs-stat.c b/src/syscall/fs-stat.c
index eb584b7..d246016 100644
--- a/src/syscall/fs-stat.c
+++ b/src/syscall/fs-stat.c
@@ -212,29 +212,45 @@ static int64_t stat_at_path(guest_t *g,
         return 0;
     }
 
-    host_fd_ref_t dir_ref;
-    if (host_dirfd_ref_open(dirfd, &dir_ref) < 0)
-        return -LINUX_EBADF;
-
     int64_t rc = 0;
+    host_fd_ref_t dir_ref = {.fd = -1, .owned = false};
     if ((flags & LINUX_AT_EMPTY_PATH) && pathp[0] == '\0') {
         /* Linux: AT_EMPTY_PATH with dirfd == AT_FDCWD operates on the
          * current working directory.
          */
-        if (dir_ref.fd == AT_FDCWD) {
+        if (dirfd == LINUX_AT_FDCWD) {
+            dir_ref.fd = AT_FDCWD;
             int mac_flags = translate_at_flags(flags);
             if (fstatat(AT_FDCWD, ".", mac_st, mac_flags) < 0) {
                 rc = linux_errno();
                 goto done;
             }
-        } else if (dir_ref.fd < 0) {
-            rc = -LINUX_EBADF;
-            goto done;
-        } else if (fstat(dir_ref.fd, mac_st) < 0) {
-            rc = linux_errno();
-            goto done;
+        } else {
+            fd_entry_t snap;
+            dir_ref.fd = fd_snapshot_and_dup(dirfd, &snap);
+            dir_ref.owned = true;
+            if (dir_ref.fd < 0) {
+                rc = -LINUX_EBADF;
+                goto done;
+            }
+            if (snap.type == FD_PATH && snap.proc_path[0] != '\0') {
+                int intercepted = proc_intercept_stat(snap.proc_path, mac_st);
+                if (intercepted == 0)
+                    goto done;
+                if (intercepted == -1) {
+                    rc = linux_errno();
+                    goto done;
+                }
+            }
+            if (fstat(dir_ref.fd, mac_st) < 0) {
+                rc = linux_errno();
+                goto done;
+            }
         }
     } else {
+        if (host_dirfd_ref_open(dirfd, &dir_ref) < 0)
+            return -LINUX_EBADF;
+
         int intercepted = PROC_NOT_INTERCEPTED;
         if (path_might_use_stat_intercept(tx.intercept_path)) {
             intercepted = proc_intercept_stat(tx.intercept_path, mac_st);
@@ -275,6 +291,19 @@ int64_t sys_fstat(guest_t *g, int fd, uint64_t stat_gva)
     if (frc != -LINUX_EBADF)
         return frc;
 
+    fd_entry_t snap;
+    if (fd_snapshot(fd, &snap) && snap.type == FD_PATH &&
+        snap.proc_path[0] != '\0') {
+        int intercepted = proc_intercept_stat(snap.proc_path, &mac_st);
+        if (intercepted == 0) {
+            if (write_linux_stat(g, stat_gva, &mac_st) < 0)
+                return -LINUX_EFAULT;
+            return 0;
+        }
+        if (intercepted == -1)
+            return linux_errno();
+    }
+
     host_fd_ref_t host_ref;
     if (host_fd_ref_open(fd, &host_ref) < 0) {
         log_debug("fstat(%d): invalid guest fd", fd);
diff --git a/src/syscall/fs.c b/src/syscall/fs.c
index 004f3b2..627f29b 100644
--- a/src/syscall/fs.c
+++ b/src/syscall/fs.c
@@ -105,19 +105,33 @@ static const char *proc_stateful_file_path(const char *path)
     return NULL;
 }
 
-static void fd_note_proc_path(int guest_fd, const char *path)
+/* Resolve the proc_path the fd table should record for an intercepted path.
+ * Returns true and fills *out when a mapping exists; false otherwise so the
+ * caller can skip the install entirely. Pure string work; safe to call before
+ * any lock acquisition.
+ */
+static bool resolve_virtual_path(const char *path, char *out, size_t out_size)
 {
-    if (!path || strncmp(path, "/proc", 5) != 0)
-        return;
+    if (!path || out_size == 0)
+        return false;
+
+    if (!strcmp(path, "/dev/ptmx")) {
+        str_copy_trunc(out, path, out_size);
+        return true;
+    }
+
+    if (strncmp(path, "/proc", 5) != 0)
+        return false;
 
     char virt_buf[64];
     const char *virt = proc_virtual_dir_path(path, virt_buf, sizeof(virt_buf));
     if (!virt)
         virt = proc_stateful_file_path(path);
+    if (!virt)
+        return false;
 
-    if (virt)
-        str_copy_trunc(fd_table[guest_fd].proc_path, virt,
-                       sizeof(fd_table[guest_fd].proc_path));
+    str_copy_trunc(out, virt, out_size);
+    return true;
 }
 
 static const char *proc_virtual_dir_path(const char *path,
@@ -185,7 +199,8 @@ static int fd_alloc_opened_host(int host_fd,
                                 int type,
                                 int linux_flags,
                                 int min_guest_fd,
-                                void (*cleanup)(int))
+                                void (*cleanup)(int),
+                                const char *virtual_path)
 {
     DIR *dir = NULL;
 
@@ -213,16 +228,23 @@ static int fd_alloc_opened_host(int host_fd,
         return -1;
     }
 
-    /* Publish linux_flags, dir, and the urandom bitmap bit atomically
-     * with respect to the slot's identity. fd_alloc_*_relaxed drops
+    /* Resolve the virtual-path stamp before taking fd_lock; the helper is pure
+     * string work and must not run inside the critical section.
+     */
+    char proc_path_buf[FD_VIRTUAL_PATH_MAX];
+    bool have_proc_path = resolve_virtual_path(virtual_path, proc_path_buf,
+                                               sizeof(proc_path_buf));
+
+    /* Publish linux_flags, dir, proc_path, and the urandom bitmap bit
+     * atomically with respect to the slot's identity. fd_alloc_*_relaxed drops
      * fd_lock before returning, so a sibling vCPU's pathological
-     * close(guest_fd) + open() could reuse the slot between alloc and
-     * the metadata install below. Re-acquire fd_lock and verify the
-     * (type, host_fd) tuple still matches what just got allocated;
-     * if it does not, the slot belongs to a different file now and
-     * any install would clobber the sibling's entry. The sibling's
-     * close path already cleaned up our host_fd via fd_cleanup_entry,
-     * so this side only owns dir, which gets closed below.
+     * close(guest_fd) + open() could reuse the slot between alloc and the
+     * metadata install below. Re-acquire fd_lock and verify the
+     * (type, host_fd) tuple still matches what just got allocated; if it does
+     * not, the slot belongs to a different file now and any install would
+     * clobber the sibling's entry. The sibling's close path already cleaned up
+     * the host_fd of this side via fd_cleanup_entry, so this side only owns
+     * dir, which gets closed below.
      */
     bool installed = false;
     pthread_mutex_lock(&fd_lock);
@@ -231,6 +253,9 @@ static int fd_alloc_opened_host(int host_fd,
         fd_table[guest_fd].linux_flags = linux_flags;
         if (dir)
             fd_table[guest_fd].dir = dir;
+        if (have_proc_path)
+            memcpy(fd_table[guest_fd].proc_path, proc_path_buf,
+                   sizeof(proc_path_buf));
         bool readable_urandom =
             type == FD_URANDOM &&
             (linux_flags & LINUX_O_ACCMODE) != LINUX_O_WRONLY;
@@ -283,8 +308,8 @@ int64_t sys_openat_path(guest_t *g,
                 close_keep_errno(sidecar_fd);
                 return linux_errno();
             }
-            int guest_fd =
-                fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1, NULL);
+            int guest_fd = fd_alloc_opened_host(sidecar_fd, type, linux_flags,
+                                                -1, NULL, NULL);
             if (guest_fd < 0) {
                 close_keep_errno(sidecar_fd);
                 return linux_errno();
@@ -314,7 +339,7 @@ int64_t sys_openat_path(guest_t *g,
             return linux_errno();
         }
         int guest_fd =
-            fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL);
+            fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL, NULL);
         if (guest_fd < 0) {
             close_keep_errno(host_fd);
             return linux_errno();
@@ -342,19 +367,25 @@ int64_t sys_openat_path(guest_t *g,
             int type = intercepted_fd_type(tx.intercept_path, intercepted,
                                            linux_flags);
             if (type < 0) {
+                /* /dev/ptmx registers a keepalive slave under intercepted
+                 * before this point; without dropping it here the slave fd
+                 * leaks because nothing else has the master in fd_table.
+                 * proc_pty_close_keepalive is a no-op for other paths.
+                 */
+                proc_pty_close_keepalive(intercepted);
                 close_keep_errno(intercepted);
                 return linux_errno();
             }
             int min_guest_fd =
                 (!strncmp(tx.intercept_path, "/dev/", 5)) ? -1 : 128;
-            int guest_fd =
-                fd_alloc_opened_host(intercepted, type, linux_flags,
-                                     min_guest_fd, fd_cleanup_for_type(type));
+            int guest_fd = fd_alloc_opened_host(
+                intercepted, type, linux_flags, min_guest_fd,
+                fd_cleanup_for_type(type), tx.intercept_path);
             if (guest_fd < 0) {
+                proc_pty_close_keepalive(intercepted);
                 close_keep_errno(intercepted);
                 return linux_errno();
             }
-            fd_note_proc_path(guest_fd, tx.intercept_path);
             return guest_fd;
         }
         if (intercepted == -1) {
@@ -375,7 +406,7 @@ int64_t sys_openat_path(guest_t *g,
             return linux_errno();
         }
         int guest_fd =
-            fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL);
+            fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL, NULL);
         if (guest_fd < 0) {
             close_keep_errno(host_fd);
             return linux_errno();
@@ -397,7 +428,8 @@ int64_t sys_openat_path(guest_t *g,
         close_keep_errno(host_fd);
         return linux_errno();
     }
-    int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL);
+    int guest_fd =
+        fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL, NULL);
     if (guest_fd < 0) {
         close_keep_errno(host_fd);
         return linux_errno();
@@ -430,6 +462,13 @@ int64_t sys_close(int fd)
 
     int host_fd = -1;
     if (fd_close_regular_relaxed(fd, &host_fd)) {
+        /* The fast path bypasses fd_cleanup_entry, so any side tables
+         * keyed by host_fd that the slow path drops must be drained here
+         * too. proc_pty_close_keepalive is a cheap no-op for non-pty fds
+         * and prevents the keepalive slave from leaking past a /dev/ptmx
+         * close when no per-type cleanup is registered.
+         */
+        proc_pty_close_keepalive(host_fd);
         if (close(host_fd) < 0)
             return linux_errno();
         return 0;
@@ -542,18 +581,27 @@ static int duplicate_guest_fd(int src_fd,
                               bool fixed_slot,
                               int linux_flags)
 {
-    /* Snapshot the source entry and dup its host fd in a single fd_lock
-     * critical section so the type, host fd, and metadata captured here
-     * cannot drift apart under a racing close + reopen.
+    /* Hold pty_keepalive_lock across the source snapshot, host dup, and
+     * keepalive mirror so a concurrent sys_close on src_fd cannot remove
+     * the source's keepalive entry between fd_snapshot_and_dup and
+     * proc_pty_dup_keepalive_locked. Without this bracket the alias would
+     * land in fd_table with no keepalive of its own.
+     *
+     * Lock order is pty_keepalive_lock -> fd_lock (fd_snapshot_and_dup
+     * takes fd_lock internally); proc_pty_master_adopt's joint-locked
+     * publish uses the same order so the two paths do not deadlock.
      */
+    proc_pty_lock_for_dup();
     fd_entry_t src_snap;
     int new_host_fd = fd_snapshot_and_dup(src_fd, &src_snap);
     if (new_host_fd < 0 && src_snap.type == FD_CLOSED) {
+        proc_pty_unlock_for_dup();
         errno = EBADF;
         return -1;
     }
     if (src_snap.type == FD_FUSE_DEV || src_snap.type == FD_FUSE_FILE ||
         src_snap.type == FD_FUSE_DIR) {
+        proc_pty_unlock_for_dup();
         if (new_host_fd >= 0)
             close_keep_errno(new_host_fd);
         return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot,
@@ -566,13 +614,27 @@ static int duplicate_guest_fd(int src_fd,
      * bind there.
      */
     if (src_snap.type == FD_EVENTFD) {
+        proc_pty_unlock_for_dup();
         if (new_host_fd >= 0)
             close_keep_errno(new_host_fd);
         return eventfd_dup_fd(src_fd, src_snap.host_fd, min_guest_fd,
                               fixed_guest_fd, fixed_slot, linux_flags);
     }
-    if (new_host_fd < 0)
+    if (new_host_fd < 0) {
+        proc_pty_unlock_for_dup();
         return -1;
+    }
+
+    /* Mirror any /dev/ptmx keepalive BEFORE fd_alloc publishes guest_fd.
+     * Once the guest fd exists, a sibling thread can close it; that runs
+     * fd_cleanup_entry which calls proc_pty_close_keepalive(new_host_fd).
+     * For that cleanup to drop the freshly-duped keepalive, the keepalive
+     * entry must already be in the table; registering after fd_alloc would
+     * lose the race and leak the slave fd. No-op when the source has no
+     * keepalive.
+     */
+    proc_pty_dup_keepalive_locked(src_snap.host_fd, new_host_fd);
+    proc_pty_unlock_for_dup();
 
     int new_type = (src_snap.type == FD_STDIO) ? FD_REGULAR : src_snap.type;
     void (*cleanup)(int) = fd_cleanup_for_type(new_type);
@@ -583,6 +645,10 @@ static int duplicate_guest_fd(int src_fd,
     if (guest_fd < 0) {
         if (fixed_slot)
             errno = EBADF;
+        /* fd_cleanup_entry never ran on new_host_fd (no guest fd was
+         * registered), so the keepalive must be dropped explicitly here.
+         */
+        proc_pty_close_keepalive(new_host_fd);
         close_keep_errno(new_host_fd);
         return -1;
     }
diff --git a/src/syscall/io.c b/src/syscall/io.c
index 67f02cd..879db75 100644
--- a/src/syscall/io.c
+++ b/src/syscall/io.c
@@ -20,6 +20,7 @@
 #include <stdbool.h>
 #include <limits.h>
 #include <pthread.h>
+#include <ctype.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include <sys/ioctl.h>
@@ -1551,6 +1552,7 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg)
     }
     case LINUX_TIOCGWINSZ: {
         /* Get terminal window size */
+        (void) proc_pty_master_adopt(fd);
         struct winsize ws;
         if (ioctl(host_fd, TIOCGWINSZ, &ws) < 0) {
             host_fd_ref_close(&host_ref);
@@ -1569,6 +1571,33 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg)
         host_fd_ref_close(&host_ref);
         return 0;
     }
+    case LINUX_TIOCSWINSZ: {
+        /* Set terminal window size. Same struct as TIOCGWINSZ; foot, sshd,
+         * tmux, and any libvte-derived emulator call this on the PTY master
+         * after spawning the slave child. Without it, terminal startup fails
+         * with -ENOTTY from the default arm below.
+         *
+         * A master received through SCM_RIGHTS bypasses /dev/ptmx open
+         * interception, so lazily create its keepalive before the host ioctl.
+         * The helper is a no-op for non-pty fds; the real ioctl below still
+         * supplies the final errno.
+         */
+        linux_winsize_t lws;
+        if (guest_read_small(g, arg, &lws, sizeof(lws)) < 0) {
+            host_fd_ref_close(&host_ref);
+            return -LINUX_EFAULT;
+        }
+        struct winsize ws = {
+            .ws_row = lws.ws_row,
+            .ws_col = lws.ws_col,
+            .ws_xpixel = lws.ws_xpixel,
+            .ws_ypixel = lws.ws_ypixel,
+        };
+        (void) proc_pty_master_adopt(fd);
+        int rc = ioctl(host_fd, TIOCSWINSZ, &ws);
+        host_fd_ref_close(&host_ref);
+        return rc < 0 ? linux_errno() : 0;
+    }
 
     case LINUX_TCGETS: {
         /* Get terminal attributes. c_cc index mapping is in file-scope
@@ -1715,6 +1744,96 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg)
         return 0;
     }
 
+    case LINUX_TIOCGPTN: {
+        /* Get the slave pty number associated with a /dev/ptmx master fd.
+         * Pass the guest fd: proc_pty_master_adopt snapshots the canonical
+         * (host_fd, generation) under fd_lock, performs the slave open on a
+         * private dup, then re-validates the slot before publishing the
+         * keepalive. Passing the per-syscall host_fd_ref dup or a raw host
+         * fd would race with sibling close+reuse.
+         */
+        uint32_t val = proc_pty_master_adopt(fd);
+        if (val == UINT32_MAX) {
+            host_fd_ref_close(&host_ref);
+            return -LINUX_ENOTTY;
+        }
+        if (guest_write_small(g, arg, &val, sizeof(val)) < 0) {
+            host_fd_ref_close(&host_ref);
+            return -LINUX_EFAULT;
+        }
+        host_fd_ref_close(&host_ref);
+        return 0;
+    }
+    case LINUX_TIOCSPTLCK: {
+        /* Lock/unlock the slave side of a pty. glibc unlockpt() always passes
+         * 0 (unlock); util-linux's setlock(1) passes 1 to lock. macOS exposes
+         * unlockpt(3) but no re-lock primitive, so the lock branch is accepted
+         * as a best-effort no-op for real ptmx masters rather than surfacing
+         * as -EINVAL: an application probing the result would otherwise
+         * misread the failure as "this kernel has no devpts".
+         */
+        int32_t lock = 0;
+        if (guest_read_small(g, arg, &lock, sizeof(lock)) < 0) {
+            host_fd_ref_close(&host_ref);
+            return -LINUX_EFAULT;
+        }
+        int rc = 0;
+        if (lock == 0) {
+            rc = unlockpt(host_fd);
+        } else {
+            char slave[64];
+            if (ptsname_r(host_fd, slave, sizeof(slave)) != 0) {
+                host_fd_ref_close(&host_ref);
+                return -LINUX_ENOTTY;
+            }
+        }
+        host_fd_ref_close(&host_ref);
+        return rc < 0 ? linux_errno() : 0;
+    }
+    case LINUX_TIOCGPTPEER: {
+        /* Return a fresh fd referring to the slave side of a /dev/ptmx master.
+         * Linux added this in 4.13 so callers can avoid the ptsname(3) round
+         * trip and any /dev/pts visibility races. The arg holds open(2)-style
+         * flags. Restrict to the bits Linux's pty driver actually honors
+         * (accmode + O_NOCTTY + O_NONBLOCK + O_CLOEXEC); any other bit, in
+         * particular O_CREAT / O_TRUNC / O_EXCL / O_PATH, would be silently
+         * ignored on Linux and is rejected with EINVAL here so misuse does
+         * not leak nonsense flags into the guest fd table.
+         */
+        int linux_flags = (int) arg;
+        const int allowed = LINUX_O_ACCMODE | LINUX_O_NOCTTY |
+                            LINUX_O_NONBLOCK | LINUX_O_CLOEXEC;
+        if (linux_flags & ~allowed) {
+            host_fd_ref_close(&host_ref);
+            return -LINUX_EINVAL;
+        }
+        char slave[64];
+        if (ptsname_r(host_fd, slave, sizeof(slave)) != 0) {
+            host_fd_ref_close(&host_ref);
+            return -LINUX_ENOTTY;
+        }
+        int oflags = translate_open_flags(linux_flags);
+        int host_slave_fd = open(slave, oflags);
+        if (host_slave_fd < 0) {
+            int saved_errno = errno;
+            host_fd_ref_close(&host_ref);
+            errno = saved_errno;
+            return linux_errno();
+        }
+        host_fd_ref_close(&host_ref);
+        int guest_fd = fd_alloc(FD_REGULAR, host_slave_fd, NULL);
+        if (guest_fd < 0) {
+            close(host_slave_fd);
+            return -LINUX_EMFILE;
+        }
+        /* Track CLOEXEC + accmode in the guest table so exec honors them; the
+         * host fd's own FD_CLOEXEC is per-descriptor and would be lost on the
+         * dup that host_fd_ref hands multi-threaded callers.
+         */
+        fd_publish_linux_flags(guest_fd, linux_flags);
+        return guest_fd;
+    }
+
     case LINUX_FIONBIO: {
         /* Set/clear O_NONBLOCK on the fd. Linux FIONBIO takes an int* arg:
          * nonzero enables non-blocking, zero disables it. libuv's
@@ -1759,8 +1878,83 @@ int64_t sys_fallocate(int fd, int mode, int64_t offset, int64_t len)
         return -LINUX_EINVAL;
     }
 
-    /* mode 0 = basic allocation -> ftruncate fallback.
-     * Other modes (FALLOC_FL_PUNCH_HOLE etc.) not supported.
+    /* FALLOC_FL_PUNCH_HOLE always requires FALLOC_FL_KEEP_SIZE on Linux;
+     * map both to macOS F_PUNCHHOLE on the host fd, with a pwrite-zeros
+     * fallback for misalignment.
+     *
+     * The Linux semantic is "reads in [offset, offset+len) return zero;
+     * file size unchanged". macOS F_PUNCHHOLE enforces filesystem block
+     * alignment on both ends and rejects sub-block requests with EINVAL --
+     * that one-byte probe (offset=0 len=1) foot's wl_shm pool issues
+     * surfaces as "fallocate(FALLOC_FL_PUNCH_HOLE) not supported (Invalid
+     * argument)" otherwise, and foot disables punch-hole for the whole
+     * session.
+     *
+     * Writing zeros over the region produces the same observable result:
+     * reads return zero, file size unchanged. The disk-space deallocation
+     * optimisation is lost on the pwrite path, but the probe succeeds, so
+     * foot keeps punch-hole enabled and the later, properly aligned calls
+     * (page-sized buffers) still take the F_PUNCHHOLE fast path.
+     */
+    const int kPunchHole =
+        LINUX_FALLOC_FL_PUNCH_HOLE | LINUX_FALLOC_FL_KEEP_SIZE;
+    if (mode == kPunchHole) {
+        struct fpunchhole hole = {
+            .fp_flags = 0,
+            .reserved = 0,
+            .fp_offset = (off_t) offset,
+            .fp_length = (off_t) len,
+        };
+        if (fcntl(host_ref.fd, F_PUNCHHOLE, &hole) == 0) {
+            host_fd_ref_close(&host_ref);
+            return 0;
+        }
+        /* EINVAL: misaligned, sub-block, or non-regular file. pwrite zeros
+         * only through the current EOF so KEEP_SIZE remains guest-visible.
+         * Any other host errno propagates verbatim.
+         */
+        if (errno != EINVAL) {
+            host_fd_ref_close(&host_ref);
+            return linux_errno();
+        }
+        struct stat st;
+        if (fstat(host_ref.fd, &st) < 0) {
+            host_fd_ref_close(&host_ref);
+            return linux_errno();
+        }
+        if (offset >= st.st_size) {
+            host_fd_ref_close(&host_ref);
+            return 0;
+        }
+        int64_t remaining = st.st_size - offset;
+        if (remaining > len)
+            remaining = len;
+
+        static const char zeros[4096];
+        off_t cur = (off_t) offset;
+        while (remaining > 0) {
+            size_t chunk = remaining > (int64_t) sizeof(zeros)
+                               ? sizeof(zeros)
+                               : (size_t) remaining;
+            ssize_t nw = pwrite(host_ref.fd, zeros, chunk, cur);
+            if (nw < 0) {
+                if (errno == EINTR)
+                    continue;
+                host_fd_ref_close(&host_ref);
+                return linux_errno();
+            }
+            if (nw == 0)
+                break; /* defensive; pwrite on a regular file should not 0 */
+            cur += nw;
+            remaining -= nw;
+        }
+        host_fd_ref_close(&host_ref);
+        return 0;
+    }
+
+    /* mode 0 = basic allocation -> ftruncate fallback. Anything else
+     * (collapse range, zero range, insert range, unshare range) stays
+     * unsupported and surfaces as -EOPNOTSUPP for the guest to handle.
      */
     if (mode != 0) {
         host_fd_ref_close(&host_ref);
diff --git a/src/syscall/mem.c b/src/syscall/mem.c
index 415fa2e..b344816 100644
--- a/src/syscall/mem.c
+++ b/src/syscall/mem.c
@@ -1478,6 +1478,26 @@ static int hvf_apply_file_overlay_quiesced(guest_t *g,
     return 0;
 }
 
+/* True when the backing fd allows writes through. The overlay path replaces
+ * the slab's RW host VA with MAP_SHARED|MAP_FIXED of this fd, and Apple HVF
+ * refuses hv_vm_map of any permission onto a host VA whose write capability
+ * does not cover the requested stage-2 perms. A read-only fd lands there
+ * with the kernel rejecting either PROT_WRITE on the host mmap or, after a
+ * PROT_READ downgrade, the post-overlay hv_vm_map with HV_DENIED.
+ * Centralises that decision: both the overlay entry (hvf_apply_file_overlay)
+ * and the sys_mmap fast-path skip share this gate so read-only backers are
+ * routed straight to the snapshot pread path. Returns true on the optimistic
+ * path when fcntl itself fails: the subsequent mmap / hv_vm_map will surface
+ * the real error rather than this helper synthesising one.
+ */
+static bool overlay_fd_writable(int fd)
+{
+    int fl = fcntl(fd, F_GETFL);
+    if (fl < 0)
+        return true;
+    return (fl & O_ACCMODE) != O_RDONLY;
+}
+
 /* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd,
  * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB
  * segment is split out first if it is not already isolated. Caller
@@ -1491,6 +1511,8 @@ static int hvf_apply_file_overlay(guest_t *g,
                                   int fd,
                                   off_t file_off)
 {
+    if (!overlay_fd_writable(fd))
+        return -LINUX_EACCES;
     thread_quiesce_siblings();
     int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off);
     thread_resume_siblings();
@@ -2267,7 +2289,14 @@ int64_t sys_mmap(guest_t *g,
          * gap-finder advances the hint to the next host-page boundary
          * after each allocation.
          */
+        /* overlay_fd_writable rejects read-only backing fds inside
+         * hvf_apply_file_overlay; mirror the check here so a read-only
+         * mmap takes the snapshot pread path directly, skipping the
+         * thread_quiesce / segment_split cycle the overlay would
+         * otherwise perform before returning EACCES.
+         */
         bool overlay_aligned = (flags & LINUX_MAP_SHARED) &&
+                               overlay_fd_writable(host_backing_fd) &&
                                (result_off % hps == 0) &&
                                ((uint64_t) offset % hps == 0);
         if (overlay_aligned) {
@@ -3977,9 +4006,25 @@ int mmap_fork_restore_overlays(guest_t *g,
         int err = hvf_apply_file_overlay(g, ovl_s, ovl_e - ovl_s, r->backing_fd,
                                          (off_t) file_off);
         if (err < 0) {
-            log_warn(
-                "fork-child: overlay re-install [0x%llx, 0x%llx) failed: %d",
-                (unsigned long long) ovl_s, (unsigned long long) ovl_e, err);
+            /* -LINUX_EACCES is the writable-fd gate in hvf_apply_file_overlay
+             * rejecting a read-only backing fd (foot's fontconfig caches,
+             * shared library file-backed regions, etc.). The fallback to
+             * snapshot semantics is correct for those: the child reads the
+             * pre-fork bytes and never writes back, which is what the parent
+             * already did. Log at debug level so the success path stays
+             * quiet. Any other failure is unexpected and stays at warn.
+             */
+            if (err == -LINUX_EACCES)
+                log_debug(
+                    "fork-child: read-only backing fd, skipping overlay "
+                    "[0x%llx, 0x%llx) (snapshot semantics)",
+                    (unsigned long long) ovl_s, (unsigned long long) ovl_e);
+            else
+                log_warn(
+                    "fork-child: overlay re-install [0x%llx, 0x%llx) failed: "
+                    "%d",
+                    (unsigned long long) ovl_s, (unsigned long long) ovl_e,
+                    err);
             rc = err;
             continue;
         }
diff --git a/src/syscall/path.c b/src/syscall/path.c
index b8599ed..d2bf6d8 100644
--- a/src/syscall/path.c
+++ b/src/syscall/path.c
@@ -75,6 +75,13 @@ bool path_might_use_stat_intercept(const char *path)
         return true;
     if (!strcmp(path, "/dev/fuse"))
         return true;
+    /* glibc ptsname(3) stats /dev/pts/N after TIOCGPTN to confirm the slave
+     * exists and is a char device; without this the stat falls through to the
+     * host where /dev/pts is absent and ptsname returns ENOENT.
+     */
+    if (!strncmp(path, "/dev/pts/", 9) || !strcmp(path, "/dev/pts") ||
+        !strcmp(path, "/dev/pts/"))
+        return true;
     if (fuse_path_matches_mount(path))
         return true;
     if (path_prefix_match(path, SYSFS_CPU_PREFIX, sizeof(SYSFS_CPU_PREFIX) - 1))
diff --git a/src/syscall/poll.c b/src/syscall/poll.c
index 705a902..aa65d0b 100644
--- a/src/syscall/poll.c
+++ b/src/syscall/poll.c
@@ -210,7 +210,7 @@ int64_t sys_ppoll(guest_t *g,
                    poll_timeout_ms < 0 ? 200 : poll_timeout_ms);
 
         /* Check for exit_group / futex_interrupt after waking */
-        if (proc_exit_group_requested() || futex_interrupt_pending()) {
+        if (proc_exit_group_requested() || futex_interrupt_consume()) {
             ret = -1;
             errno = EINTR;
             break;
@@ -549,7 +549,7 @@ int64_t sys_pselect6(guest_t *g,
                           has_timeout ? &ts : &poll_ts, NULL);
         }
 
-        if (proc_exit_group_requested() || futex_interrupt_pending()) {
+        if (proc_exit_group_requested() || futex_interrupt_consume()) {
             ret = -1;
             errno = EINTR;
             break;
@@ -1030,7 +1030,7 @@ int64_t sys_epoll_pwait(guest_t *g,
         nready = kevent(epoll_ref.fd, NULL, 0, kevents, cap,
                         has_timeout ? &ts : &poll_ts);
 
-        if (proc_exit_group_requested() || futex_interrupt_pending()) {
+        if (proc_exit_group_requested() || futex_interrupt_consume()) {
             nready = -1;
             errno = EINTR;
             break;
diff --git a/src/syscall/translate.c b/src/syscall/translate.c
index 018690d..7cf2981 100644
--- a/src/syscall/translate.c
+++ b/src/syscall/translate.c
@@ -93,6 +93,22 @@ int64_t linux_errno(void)
 #if ENOTSUP != EOPNOTSUPP
     case ENOTSUP:
         return -LINUX_EOPNOTSUPP;
+#endif
+        /* macOS xattr "attribute not found" lives at 93 (ENOATTR) on modern
+         * SDKs; on some versions ENOATTR is a synonym for ENODATA(96). Map
+         * both to Linux ENODATA(61) so getxattr/lgetxattr/fgetxattr report
+         * missing attrs correctly. Guarded by #if to avoid duplicate cases
+         * when the headers alias the two macros.
+         */
+#ifdef ENOATTR
+#if !defined(ENODATA) || ENOATTR != ENODATA
+    case ENOATTR:
+        return -LINUX_ENODATA;
+#endif
+#endif
+#ifdef ENODATA
+    case ENODATA:
+        return -LINUX_ENODATA;
 #endif
 #ifdef ENOTRECOVERABLE
     case ENOTRECOVERABLE:
diff --git a/tests/manifest.txt b/tests/manifest.txt
index 8d836eb..b8e779f 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -68,6 +68,7 @@ test-epoll-aba
 test-timerfd
 test-large-io-boundary
 test-ioctl-cloexec
+test-pty
 
 [section] /proc and /dev emulation tests
 test-proc
@@ -107,6 +108,9 @@ test-cloexec
 [section] O_PATH semantics tests
 test-opath
 
+[section] xattr semantics tests
+test-xattr
+
 [section] Guard page / mmap edge cases
 test-guard-page
 test-mmap-hint
diff --git a/tests/test-futex-pi.c b/tests/test-futex-pi.c
index 66b84b7..ad3ebbf 100644
--- a/tests/test-futex-pi.c
+++ b/tests/test-futex-pi.c
@@ -12,9 +12,12 @@
  *   2. FUTEX_LOCK_PI + FUTEX_UNLOCK_PI round-trip: acquire and
  *      release a PI lock from the same thread.
  *
- *   3. futex_wait EINTR injection (commit 18bdd0f): futex_wait with
- *      no timeout returns -EINTR within ~1-2 seconds when no waker
- *      exists (simulated periodic signal delivery).
+ *   3. futex_wait blocks indefinitely without a signal: an earlier
+ *      revision returned synthetic -EINTR after ~1 s of unconditional
+ *      blocking, which violated POSIX sem_wait callers that do not
+ *      retry on EINTR (e.g. foot's render_worker_thread). The wait
+ *      must only return when a real wake arrives or a signal is
+ *      genuinely queued for the thread.
  *
  * Syscalls exercised: futex(98), clone(220), gettid(178), exit(93)
  */
@@ -198,92 +201,77 @@ static void test_pi_dead_owner(void)
     PASS();
 }
 
-/* Test 3: EINTR injection after ~1s */
+/* Test 3: futex_wait without a signal blocks until woken */
 
-/* Sibling that keeps the guest in a multi-threaded state for the duration of
- * the EINTR probe. The synthetic EINTR injection in futex_wait only fires
- * while thread_is_single_active() is false; a single-threaded guest must be
- * allowed to park in FUTEX_WAIT indefinitely so it does not break glibc
- * startup paths. The probe therefore has to run with at least one other guest
- * thread alive.
- *
- * The sibling sleeps on a timed futex_wait against keepalive_word with a
- * 5-second timeout. The timeout dodges the EINTR injection ('!has_timeout' is
- * what gates the sim), and 5 s is long enough to outlast the worst-case parent
- * EINTR window (1 s with up to 100 ms poll jitter, plus a safety margin). After
- * the parent's probe returns, the parent flips keepalive_word and wakes the
- * sibling.
+/* Sibling that waits ~1.2 s, flips the futex word, and issues FUTEX_WAKE on
+ * the parent's address. Used to drive the parent out of an indefinite
+ * futex_wait via a real wake (not synthetic EINTR).
  */
-static volatile int sibling_keepalive __attribute__((aligned(4))) = 1;
-static char sibling_stack_buf[8192] __attribute__((aligned(16)));
+static volatile int waker_word __attribute__((aligned(4))) = 0;
+static char waker_stack_buf[8192] __attribute__((aligned(16)));
 
-static void sibling_alive_thread(void)
+static void waker_thread(void)
 {
-    struct timespec ts = {5, 0};
-    while (__atomic_load_n(&sibling_keepalive, __ATOMIC_SEQ_CST) == 1) {
-        raw_syscall6(__NR_futex, (long) &sibling_keepalive,
-                     FUTEX_WAIT | FUTEX_PRIVATE, 1, (long) &ts, 0, 0);
-    }
+    struct timespec ts = {1, 200 * 1000 * 1000};
+    raw_syscall6(__NR_nanosleep, (long) &ts, 0, 0, 0, 0, 0);
+
+    __atomic_store_n(&waker_word, 1, __ATOMIC_SEQ_CST);
+    raw_futex_wake((int *) &waker_word, 1);
     raw_exit(0);
 }
 
 static void test_futex_eintr(void)
 {
-    TEST("futex_wait EINTR after ~1s");
+    TEST("futex_wait blocks until real wake, no synthetic EINTR");
 
-    /* Spawn the sibling so thread_is_single_active() is false during the wait.
-     * CLONE flags match test_pi_dead_owner.
-     */
-    sibling_keepalive = 1;
-    void *sibling_top = sibling_stack_buf + sizeof(sibling_stack_buf);
-    int sibling_tid_val = 0;
-    long sret = raw_clone(0x7d0f00, sibling_top, &sibling_tid_val, 0,
-                          (int *) &sibling_tid_val);
+    waker_word = 0;
+    void *waker_top = waker_stack_buf + sizeof(waker_stack_buf);
+    int waker_tid_val = 0;
+    long sret = raw_clone(0x7d0f00, waker_top, &waker_tid_val, 0,
+                          (int *) &waker_tid_val);
     if (sret < 0) {
-        FAIL("sibling clone failed");
+        FAIL("waker clone failed");
         return;
     }
     if (sret == 0) {
-        sibling_alive_thread();
+        waker_thread();
         raw_exit(1); /* unreachable */
     }
 
-    /* Create a futex word that no one will wake.
-     * futex_wait with no timeout should return -EINTR after ~1 second
-     * (elfuse's simulated periodic signal delivery).
-     */
-    volatile int unwoken = 42;
-
     struct timeval t0, t1;
     gettimeofday(&t0, NULL);
 
-    long r = raw_futex_wait((int *) &unwoken, 42);
+    /* No timeout. With no signal queued, the wait must NOT return until the
+     * waker thread issues FUTEX_WAKE; earlier revisions returned synthetic
+     * -EINTR after ~1 s, which broke glibc sem_wait callers.
+     */
+    long r = raw_futex_wait((int *) &waker_word, 0);
 
     gettimeofday(&t1, NULL);
     long elapsed_ms =
         (t1.tv_sec - t0.tv_sec) * 1000 + (t1.tv_usec - t0.tv_usec) / 1000;
 
-    /* Tear down the sibling now that the EINTR check is done. */
-    __atomic_store_n(&sibling_keepalive, 0, __ATOMIC_SEQ_CST);
-    raw_futex_wake((int *) &sibling_keepalive, 1);
+    /* Reap the waker. */
     for (int i = 0; i < 100; i++) {
-        if (__atomic_load_n(&sibling_tid_val, __ATOMIC_SEQ_CST) == 0)
+        if (__atomic_load_n(&waker_tid_val, __ATOMIC_SEQ_CST) == 0)
             break;
         usleep(10000);
     }
 
-    /* Expect -EINTR (Linux errno 4) after 800ms-3000ms.
-     * The 1s timeout has jitter from 100ms polling intervals.
+    /* The waker sleeps ~1200 ms before waking the parent. Accept either rc==0
+     * (woken by FUTEX_WAKE after the wait observed waker_word change) or
+     * -EAGAIN (woken between the waker_word store and the FUTEX_WAKE; the
+     * value-mismatch path is the documented race for FUTEX_WAIT). Either
+     * outcome proves the parent did not bail out on synthetic EINTR.
      */
-    if (r == -4 /* -EINTR */ && elapsed_ms >= 800 && elapsed_ms <= 3000) {
+    if ((r == 0 || r == -11 /* -EAGAIN */) && elapsed_ms >= 1000 &&
+        elapsed_ms <= 4000) {
         PASS();
-    } else if (r == -4) {
-        /* Got EINTR but timing seems off; still passing, but note it */
-        printf("OK (EINTR after %ldms)\n", elapsed_ms);
-        passes++;
     } else {
-        printf("FAIL: expected -EINTR(-4) got %ld, elapsed %ldms\n", r,
-               elapsed_ms);
+        printf(
+            "FAIL: expected rc=0 or -EAGAIN after ~1200ms, got %ld at "
+            "%ldms\n",
+            r, elapsed_ms);
         fails++;
     }
 }
diff --git a/tests/test-io-opt.c b/tests/test-io-opt.c
index e1691c1..b139a60 100644
--- a/tests/test-io-opt.c
+++ b/tests/test-io-opt.c
@@ -165,6 +165,37 @@ int main(void)
             FAIL("open failed");
     }
 
+    TEST("fallocate punch hole keeps size past EOF");
+    {
+        const char *punch_path = "/tmp/elfuse-test-punch.bin";
+        unlink(punch_path);
+        int fd = open(punch_path, O_CREAT | O_RDWR, 0644);
+        if (fd >= 0) {
+            const char data[] = "abc";
+            const off_t data_size = (off_t) sizeof(data) - 1;
+            if (write(fd, data, sizeof(data) - 1) == (ssize_t) data_size &&
+                fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1,
+                          4096) == 0) {
+                struct stat st;
+                char buf[sizeof(data) - 1];
+                int stat_ok = fstat(fd, &st) == 0;
+                ssize_t nr = -1;
+                if (stat_ok)
+                    nr = pread(fd, buf, sizeof(buf), 0);
+                if (stat_ok && st.st_size == data_size &&
+                    nr == (ssize_t) sizeof(buf) && buf[0] == 'a' &&
+                    buf[1] == '\0' && buf[2] == '\0')
+                    PASS();
+                else
+                    FAIL("punch hole did not preserve size and zero bytes");
+            } else
+                FAIL("punch hole setup failed");
+            close(fd);
+            unlink(punch_path);
+        } else
+            FAIL("open failed");
+    }
+
     /* Test copy_file_range (via off_t-based API) */
     TEST("copy_file_range");
     {
diff --git a/tests/test-pty.c b/tests/test-pty.c
new file mode 100644
index 0000000..ce6344f
--- /dev/null
+++ b/tests/test-pty.c
@@ -0,0 +1,584 @@
+/* PTY ioctl regression test
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Foot, sshd, tmux, and any libvte-derived terminal need the multiplexer
+ * primitives glibc's posix_openpt(3) / ptsname(3) / openpty(3) stack rests
+ * on. Exercise the pieces glibc and posix-compliant pty children depend on:
+ *
+ *   1. TIOCSWINSZ on the /dev/ptmx master fd (the direct failure foot saw)
+ *   2. TIOCGPTN -> /dev/pts/N path round trip
+ *   3. TIOCSPTLCK(0) for unlockpt(), plus non-zero lock request fd typing
+ *   4. /dev/pts/N open + stat intercept and the slave fd's window size
+ *   5. /dev/pts/N open in a forked child after the child has already closed
+ *      its master (foot/sshd/openssh sftp-server pattern)
+ *
+ * The test stays self-contained on the syscall surface (no libutil/openpty),
+ * so it runs the same way under elfuse-aarch64, qemu-aarch64, and any future
+ * elfuse-x86_64 reuse.
+ */
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/syscall.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <termios.h>
+#include <unistd.h>
+
+#include "test-harness.h"
+
+#ifndef TIOCSWINSZ
+#define TIOCSWINSZ 0x5414
+#endif
+#ifndef TIOCGPTN
+#define TIOCGPTN 0x80045430
+#endif
+#ifndef TIOCSPTLCK
+#define TIOCSPTLCK 0x40045431
+#endif
+#ifndef TIOCGPTPEER
+#define TIOCGPTPEER 0x5441
+#endif
+#ifndef O_PATH
+#define O_PATH 010000000
+#endif
+#ifndef AT_EMPTY_PATH
+#define AT_EMPTY_PATH 0x1000
+#endif
+#ifndef SYS_statx
+#define SYS_statx 291
+#endif
+
+int passes = 0, fails = 0;
+
+static int count_pts_entries(void)
+{
+    DIR *dir = opendir("/dev/pts");
+    if (!dir)
+        return -1;
+
+    int count = 0;
+    struct dirent *ent;
+    while ((ent = readdir(dir))) {
+        if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
+            continue;
+        int numeric = 1;
+        for (const char *p = ent->d_name; *p; p++) {
+            if (!isdigit((unsigned char) *p)) {
+                numeric = 0;
+                break;
+            }
+        }
+        if (numeric)
+            count++;
+    }
+    closedir(dir);
+    return count;
+}
+
+int main(void)
+{
+    printf("test-pty: PTY ioctl + /dev/pts/N path support\n");
+
+    /* Regression guard for the pty_keepalive_table BSS-zero collision: any
+     * close that runs before the very first /dev/ptmx open would walk the
+     * still-zero-initialized table and match a master_host_fd of zero,
+     * silently closing the wrong slave_host_fd (also zero). Closing stdin
+     * here forces fd_cleanup_entry to invoke proc_pty_close_keepalive in
+     * that vulnerable window. If the fix regresses, every subsequent test
+     * still passes locally but a future stdio fd in another vCPU may go
+     * missing. The cheap sentinel makes sure stdin's close itself does
+     * not corrupt elfuse's own file table.
+     */
+    close(STDIN_FILENO);
+
+    TEST("open(/dev/ptmx, O_PATH) does not allocate a pty");
+    int before_path_pts = count_pts_entries();
+    int path_fd = open("/dev/ptmx", O_PATH | O_CLOEXEC);
+    int after_path_pts = count_pts_entries();
+    if (path_fd >= 0) {
+        struct stat ptmx_path_st;
+        int fstat_rc = fstat(path_fd, &ptmx_path_st);
+        struct stat ptmx_at_st;
+        int fstatat_rc = fstatat(path_fd, "", &ptmx_at_st, AT_EMPTY_PATH);
+        struct statx ptmx_sx;
+        memset(&ptmx_sx, 0, sizeof(ptmx_sx));
+        long statx_rc =
+            syscall(SYS_statx, path_fd, "", AT_EMPTY_PATH, 0x7ff, &ptmx_sx);
+        errno = 0;
+        int fchdir_rc = fchdir(path_fd);
+        int fchdir_errno = errno;
+        int ok = before_path_pts >= 0 && after_path_pts == before_path_pts &&
+                 fstat_rc == 0 && S_ISCHR(ptmx_path_st.st_mode) &&
+                 major(ptmx_path_st.st_rdev) == 5 &&
+                 minor(ptmx_path_st.st_rdev) == 2 && fstatat_rc == 0 &&
+                 S_ISCHR(ptmx_at_st.st_mode) &&
+                 major(ptmx_at_st.st_rdev) == 5 &&
+                 minor(ptmx_at_st.st_rdev) == 2 && statx_rc == 0 &&
+                 S_ISCHR(ptmx_sx.stx_mode) && ptmx_sx.stx_rdev_major == 5 &&
+                 ptmx_sx.stx_rdev_minor == 2 && fchdir_rc < 0 &&
+                 fchdir_errno == ENOTDIR;
+        close(path_fd);
+        EXPECT_TRUE(ok,
+                    "O_PATH open allocated a pty or exposed wrong path fd "
+                    "semantics");
+    } else {
+        FAIL("O_PATH /dev/ptmx open failed");
+    }
+
+    int ptmx = open("/dev/ptmx", O_RDWR | O_NOCTTY);
+    TEST("open(/dev/ptmx, O_RDWR | O_NOCTTY)");
+    EXPECT_TRUE(ptmx >= 0, "open /dev/ptmx failed");
+    if (ptmx < 0) {
+        SUMMARY("test-pty");
+        return 1;
+    }
+
+    /* TIOCSWINSZ on the master was the direct regression that broke foot's
+     * terminal startup: sys_ioctl had no case for it and fell through to the
+     * default -ENOTTY arm.
+     */
+    TEST("TIOCSWINSZ on /dev/ptmx master");
+    struct winsize ws_set = {
+        .ws_row = 40,
+        .ws_col = 132,
+        .ws_xpixel = 1056,
+        .ws_ypixel = 640,
+    };
+    EXPECT_TRUE(ioctl(ptmx, TIOCSWINSZ, &ws_set) == 0, "TIOCSWINSZ failed");
+
+    TEST("TIOCGWINSZ round-trips the values set above");
+    struct winsize ws_get = {0};
+    int ok = ioctl(ptmx, TIOCGWINSZ, &ws_get) == 0 && ws_get.ws_row == 40 &&
+             ws_get.ws_col == 132 && ws_get.ws_xpixel == 1056 &&
+             ws_get.ws_ypixel == 640;
+    EXPECT_TRUE(ok, "TIOCGWINSZ round trip mismatch");
+
+    TEST("TIOCSPTLCK(0) unlocks the slave");
+    int unlock = 0;
+    EXPECT_TRUE(ioctl(ptmx, TIOCSPTLCK, &unlock) == 0, "TIOCSPTLCK(0) failed");
+
+    /* Linux TIOCSPTLCK(non-zero) locks the slave and returns success.
+     * elfuse cannot actually enforce the lock on macOS (no re-lock primitive)
+     * but must still report success so callers do not misread the result as
+     * "this kernel has no devpts".
+     */
+    TEST("TIOCSPTLCK(1) accepted as best-effort no-op");
+    int lock = 1;
+    EXPECT_TRUE(ioctl(ptmx, TIOCSPTLCK, &lock) == 0, "TIOCSPTLCK(1)");
+
+    TEST("TIOCSPTLCK(1) rejects a regular file");
+    char lock_template[] = "/tmp/elfuse-pty-lock-XXXXXX";
+    int regular = mkstemp(lock_template);
+    if (regular < 0) {
+        FAIL("mkstemp failed");
+    } else {
+        EXPECT_ERRNO(ioctl(regular, TIOCSPTLCK, &lock), ENOTTY,
+                     "regular fd accepted TIOCSPTLCK(1)");
+        close(regular);
+        unlink(lock_template);
+    }
+
+    TEST("TIOCSPTLCK(1) rejects a pipe");
+    int lock_pipe[2];
+    if (pipe(lock_pipe) != 0) {
+        FAIL("pipe(lock_pipe) failed");
+    } else {
+        EXPECT_ERRNO(ioctl(lock_pipe[0], TIOCSPTLCK, &lock), ENOTTY,
+                     "pipe fd accepted TIOCSPTLCK(1)");
+        close(lock_pipe[0]);
+        close(lock_pipe[1]);
+    }
+
+    TEST("TIOCGPTN returns a numeric slave id");
+    unsigned int ptyno = (unsigned int) -1;
+    EXPECT_TRUE(ioctl(ptmx, TIOCGPTN, &ptyno) == 0 && ptyno < 100000u,
+                "TIOCGPTN failed");
+
+    TEST("stat(/dev/pts) succeeds and reports a directory");
+    struct stat pts_dir_st;
+    int pts_dir_statrc = stat("/dev/pts", &pts_dir_st);
+    EXPECT_TRUE(pts_dir_statrc == 0 && S_ISDIR(pts_dir_st.st_mode),
+                "stat /dev/pts failed");
+
+    TEST("open(/dev/pts, O_DIRECTORY) succeeds");
+    int pts_dir_fd = open("/dev/pts", O_RDONLY | O_DIRECTORY);
+    EXPECT_TRUE(pts_dir_fd >= 0, "open /dev/pts directory failed");
+    if (pts_dir_fd >= 0)
+        close(pts_dir_fd);
+
+    TEST("readdir(/dev/pts) lists the active slave id");
+    DIR *pts_dir = opendir("/dev/pts");
+    if (!pts_dir) {
+        FAIL("opendir /dev/pts failed");
+    } else {
+        char want[32];
+        snprintf(want, sizeof(want), "%u", ptyno);
+        int saw_ptyno = 0;
+        struct dirent *ent;
+        while ((ent = readdir(pts_dir))) {
+            if (!strcmp(ent->d_name, want)) {
+                saw_ptyno = 1;
+                break;
+            }
+        }
+        closedir(pts_dir);
+        EXPECT_TRUE(saw_ptyno, "active pts id missing from /dev/pts");
+    }
+
+    char pts_path[32];
+    snprintf(pts_path, sizeof(pts_path), "/dev/pts/%u", ptyno);
+
+    /* glibc ptsname(3) stats the formatted path before returning it.
+     * Until the path.c stat allowlist included /dev/pts/N, the stat went
+     * to the host (which has no /dev/pts at all) and ptsname returned
+     * ENOENT, leaving every caller without a usable slave path.
+     */
+    TEST("stat(/dev/pts/N) succeeds and reports a char device");
+    struct stat st;
+    int statrc = stat(pts_path, &st);
+    EXPECT_TRUE(statrc == 0 && S_ISCHR(st.st_mode), "stat /dev/pts/N failed");
+
+    TEST("stat(/dev/pts/N) major is Linux pts (136)");
+    EXPECT_EQ(major(st.st_rdev), 136, "wrong pts major");
+
+    TEST("open(/dev/pts/N) returns a usable slave fd");
+    int slave = open(pts_path, O_RDWR | O_NOCTTY);
+    EXPECT_TRUE(slave >= 0, "open /dev/pts/N failed");
+
+    if (slave >= 0) {
+        TEST("TIOCGWINSZ on the slave reflects the master-side update");
+        struct winsize ws_slave = {0};
+        int slave_ok = ioctl(slave, TIOCGWINSZ, &ws_slave) == 0 &&
+                       ws_slave.ws_row == 40 && ws_slave.ws_col == 132;
+        EXPECT_TRUE(slave_ok, "slave winsize mismatch");
+
+        TEST("TIOCSPTLCK(1) rejects the pty slave");
+        EXPECT_ERRNO(ioctl(slave, TIOCSPTLCK, &lock), ENOTTY,
+                     "slave fd accepted TIOCSPTLCK(1)");
+
+        TEST("TIOCSWINSZ on the slave propagates back to the master");
+        struct winsize ws_resize = {
+            .ws_row = 24,
+            .ws_col = 80,
+            .ws_xpixel = 0,
+            .ws_ypixel = 0,
+        };
+        if (ioctl(slave, TIOCSWINSZ, &ws_resize) != 0) {
+            FAIL("slave TIOCSWINSZ failed");
+        } else {
+            struct winsize ws_after = {0};
+            int after_ok = ioctl(ptmx, TIOCGWINSZ, &ws_after) == 0 &&
+                           ws_after.ws_row == 24 && ws_after.ws_col == 80;
+            EXPECT_TRUE(after_ok, "master did not see slave resize");
+        }
+
+        close(slave);
+    }
+
+    /* TIOCGPTPEER short-circuits the ptsname/stat/open dance. Recent foot
+     * and util-linux prefer it; older kernels return ENOTTY and the caller
+     * falls back to /dev/pts. Accept either an fd or ENOTTY (some hosts
+     * legitimately do not implement it), but never silent corruption. */
+    TEST("TIOCGPTPEER returns a slave fd or ENOTTY");
+    int peer = ioctl(ptmx, TIOCGPTPEER, O_RDWR | O_NOCTTY);
+    int peer_ok = peer >= 0 || (peer == -1 && errno == ENOTTY);
+    EXPECT_TRUE(peer_ok, "TIOCGPTPEER returned unexpected status");
+    if (peer >= 0)
+        close(peer);
+
+    /* dup of the master must keep both aliases functional even after the
+     * original is closed. The keepalive slave needs to be mirrored across
+     * the dup so the surviving alias still observes master-side tty ioctls.
+     */
+    TEST("dup(ptmx) followed by close(orig) leaves alias usable");
+    int alias = dup(ptmx);
+    if (alias < 0) {
+        FAIL("dup(ptmx) failed");
+    } else {
+        close(ptmx);
+        struct winsize ws_alias = {.ws_row = 50, .ws_col = 100};
+        if (ioctl(alias, TIOCSWINSZ, &ws_alias) != 0) {
+            FAIL("TIOCSWINSZ on alias after closing original");
+        } else {
+            struct winsize ws_check = {0};
+            int alias_ok = ioctl(alias, TIOCGWINSZ, &ws_check) == 0 &&
+                           ws_check.ws_row == 50 && ws_check.ws_col == 100;
+            EXPECT_TRUE(alias_ok, "alias winsize did not stick");
+        }
+        ptmx = alias; /* the alias is the live master from here on */
+    }
+
+    /* Fork must propagate the master's keepalive across the IPC handoff so
+     * the child can do master-side tty ioctls without the macOS ENOTTY
+     * fallback. The parent's keepalive slave fd is independent (each side
+     * holds its own slot) so closing one side does not affect the other.
+     */
+    TEST("child fork inherits master keepalive (TIOCSWINSZ works)");
+    int sync_pipe[2];
+    if (pipe(sync_pipe) != 0) {
+        FAIL("pipe(sync_pipe) failed");
+    } else {
+        pid_t pid = fork();
+        if (pid < 0) {
+            FAIL("fork failed");
+            close(sync_pipe[0]);
+            close(sync_pipe[1]);
+        } else if (pid == 0) {
+            /* Child: do master-side TIOCSWINSZ and report rc via pipe. */
+            close(sync_pipe[0]);
+            struct winsize ws_child = {
+                .ws_row = 30,
+                .ws_col = 90,
+            };
+            int rc = ioctl(ptmx, TIOCSWINSZ, &ws_child);
+            char status = (rc == 0) ? 'Y' : 'N';
+            (void) !write(sync_pipe[1], &status, 1);
+            close(sync_pipe[1]);
+            _exit(rc == 0 ? 0 : 1);
+        } else {
+            close(sync_pipe[1]);
+            char status = '?';
+            ssize_t n = read(sync_pipe[0], &status, 1);
+            close(sync_pipe[0]);
+            int wstatus = 0;
+            waitpid(pid, &wstatus, 0);
+            int child_ok = (n == 1) && (status == 'Y') && WIFEXITED(wstatus) &&
+                           WEXITSTATUS(wstatus) == 0;
+            EXPECT_TRUE(child_ok, "child TIOCSWINSZ on master failed");
+            /* Parent should still see the child's update because the slave
+             * keepalive in the parent is still alive. */
+            struct winsize ws_parent = {0};
+            int parent_ok = ioctl(ptmx, TIOCGWINSZ, &ws_parent) == 0 &&
+                            ws_parent.ws_row == 30 && ws_parent.ws_col == 90;
+            TEST("parent observes the child's master-side resize");
+            EXPECT_TRUE(parent_ok, "parent winsize mismatch after child");
+        }
+    }
+
+    close(ptmx);
+
+    /* Re-open and immediately close should not leak the keepalive slave.
+     * Without the proc_pty_close_keepalive call in sys_close's fast path,
+     * single-thread close goes through fd_close_regular_relaxed and
+     * bypasses fd_cleanup_entry, leaving the hidden slave fd open until
+     * elfuse exits. Loop enough times to expose any per-close leak.
+     */
+    TEST("repeated open/close does not exhaust the keepalive table");
+    int leak_loop_ok = 1;
+    for (int i = 0; i < 300; i++) {
+        int f = open("/dev/ptmx", O_RDWR | O_NOCTTY);
+        if (f < 0) {
+            leak_loop_ok = 0;
+            break;
+        }
+        close(f);
+    }
+    EXPECT_TRUE(leak_loop_ok,
+                "repeated /dev/ptmx open/close exhausted the keepalive table");
+
+    /* foot/sshd/openssh sftp-server pattern: the child closes its inherited
+     * master fd before opening the slave (the child has no use for the
+     * master). Earlier, this dropped the keepalive table entry that held the
+     * macOS slave_path mapping, and the subsequent open("/dev/pts/N") in the
+     * child failed with ENOENT even though the parent still held the master
+     * and the macOS slave node was openable. The retained-path semantics in
+     * proc_pty_close_keepalive must let the child still translate the path.
+     */
+    TEST("child can open /dev/pts/N after closing its master");
+    int spawn_master = open("/dev/ptmx", O_RDWR | O_NOCTTY);
+    if (spawn_master < 0) {
+        FAIL("open(/dev/ptmx) for spawn scenario");
+    } else {
+        unsigned int spawn_ptyno = (unsigned int) -1;
+        int unlock_zero = 0;
+        if (ioctl(spawn_master, TIOCGPTN, &spawn_ptyno) != 0 ||
+            ioctl(spawn_master, TIOCSPTLCK, &unlock_zero) != 0) {
+            FAIL("TIOCGPTN/TIOCSPTLCK on spawn master");
+            close(spawn_master);
+        } else {
+            char spawn_pts[32];
+            snprintf(spawn_pts, sizeof(spawn_pts), "/dev/pts/%u", spawn_ptyno);
+            int spawn_pipe[2];
+            if (pipe(spawn_pipe) != 0) {
+                FAIL("pipe for spawn scenario");
+                close(spawn_master);
+            } else {
+                pid_t spawn_pid = fork();
+                if (spawn_pid < 0) {
+                    FAIL("fork for spawn scenario");
+                    close(spawn_pipe[0]);
+                    close(spawn_pipe[1]);
+                    close(spawn_master);
+                } else if (spawn_pid == 0) {
+                    /* Child: foot's slave_exec sequence -- close the master,
+                     * then open(pts_name) for the controlling terminal.
+                     */
+                    close(spawn_pipe[0]);
+                    if (setsid() < 0)
+                        _exit(11);
+                    close(spawn_master);
+                    int slave_fd = open(spawn_pts, O_RDWR);
+                    char status = (slave_fd >= 0) ? 'Y' : 'N';
+                    (void) !write(spawn_pipe[1], &status, 1);
+                    if (slave_fd >= 0) {
+                        (void) !write(slave_fd, "ok\n", 3);
+                        close(slave_fd);
+                    }
+                    close(spawn_pipe[1]);
+                    _exit(slave_fd >= 0 ? 0 : 12);
+                } else {
+                    close(spawn_pipe[1]);
+                    char status = '?';
+                    ssize_t n = read(spawn_pipe[0], &status, 1);
+                    close(spawn_pipe[0]);
+                    int wstatus = 0;
+                    waitpid(spawn_pid, &wstatus, 0);
+                    int spawn_ok = (n == 1) && (status == 'Y') &&
+                                   WIFEXITED(wstatus) &&
+                                   WEXITSTATUS(wstatus) == 0;
+                    EXPECT_TRUE(spawn_ok,
+                                "child open(/dev/pts/N) after close(master)");
+                    if (spawn_ok) {
+                        char drain[16] = {0};
+                        int flags = fcntl(spawn_master, F_GETFL);
+                        if (flags >= 0 && fcntl(spawn_master, F_SETFL,
+                                                flags | O_NONBLOCK) == 0) {
+                            (void) !read(spawn_master, drain,
+                                         sizeof(drain) - 1);
+                            (void) fcntl(spawn_master, F_SETFL, flags);
+                        }
+                    }
+                    close(spawn_master);
+
+                    TEST("stale /dev/pts/N expires after master teardown");
+                    int stale_fd = open(spawn_pts, O_RDWR);
+                    /* Both ENOENT (devfs node gone) and ENXIO (devfs node
+                     * lingers but the pty pair has been torn down) are valid
+                     * macOS responses depending on kernel version. The
+                     * invariant the test guards is "the stale cached path
+                     * does not silently hand back an unrelated tty"; any
+                     * open failure satisfies that.
+                     */
+                    int stale_ok =
+                        stale_fd < 0 && (errno == ENOENT || errno == ENXIO);
+                    if (stale_fd >= 0)
+                        close(stale_fd);
+                    EXPECT_TRUE(stale_ok, "stale /dev/pts/N stayed openable");
+                }
+            }
+        }
+    }
+
+    /* /dev/pts/N for a never-allocated minor must surface ENOENT (the cached
+     * paths in the keepalive table cannot satisfy an arbitrary number).
+     */
+    TEST("/dev/pts/<unknown> returns ENOENT");
+    int unknown = open("/dev/pts/999999", O_RDWR);
+    int unknown_ok = unknown < 0 && errno == ENOENT;
+    if (unknown >= 0)
+        close(unknown);
+    EXPECT_TRUE(unknown_ok, "open(/dev/pts/999999) did not return ENOENT");
+
+    /* A pty master received via SCM_RIGHTS bypasses the /dev/ptmx open
+     * intercept, so the receiver process has no keepalive entry for it.
+     * proc_pty_master_adopt must lazily register one before master-side tty
+     * ioctls, even when TIOCSWINSZ runs before the first TIOCGPTN. Two
+     * checks live inside this block: TIOCSWINSZ-first and the post-adopt
+     * stat(/dev/pts/N). Each has its own TEST() label below.
+     */
+    int sp[2];
+    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sp) != 0) {
+        TEST("socketpair for SCM_RIGHTS adoption setup");
+        FAIL("socketpair failed");
+    } else {
+        int donor = open("/dev/ptmx", O_RDWR | O_NOCTTY);
+        if (donor < 0) {
+            FAIL("donor open(/dev/ptmx) failed");
+            close(sp[0]);
+            close(sp[1]);
+        } else {
+            char iobuf = 'x';
+            struct iovec iov = {.iov_base = &iobuf, .iov_len = 1};
+            char ctrl[CMSG_SPACE(sizeof(int))] = {0};
+            struct msghdr msg = {.msg_iov = &iov,
+                                 .msg_iovlen = 1,
+                                 .msg_control = ctrl,
+                                 .msg_controllen = sizeof(ctrl)};
+            struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
+            cm->cmsg_level = SOL_SOCKET;
+            cm->cmsg_type = SCM_RIGHTS;
+            cm->cmsg_len = CMSG_LEN(sizeof(int));
+            memcpy(CMSG_DATA(cm), &donor, sizeof(int));
+            ssize_t sent = sendmsg(sp[0], &msg, 0);
+            close(donor);
+            if (sent < 0) {
+                FAIL("sendmsg(SCM_RIGHTS) failed");
+            } else {
+                char rbuf;
+                struct iovec riov = {.iov_base = &rbuf, .iov_len = 1};
+                char rctrl[CMSG_SPACE(sizeof(int))] = {0};
+                struct msghdr rmsg = {.msg_iov = &riov,
+                                      .msg_iovlen = 1,
+                                      .msg_control = rctrl,
+                                      .msg_controllen = sizeof(rctrl)};
+                ssize_t got = recvmsg(sp[1], &rmsg, 0);
+                if (got < 0) {
+                    FAIL("recvmsg(SCM_RIGHTS) failed");
+                } else {
+                    struct cmsghdr *rcm = CMSG_FIRSTHDR(&rmsg);
+                    int recv_master = -1;
+                    if (rcm && rcm->cmsg_level == SOL_SOCKET &&
+                        rcm->cmsg_type == SCM_RIGHTS)
+                        memcpy(&recv_master, CMSG_DATA(rcm), sizeof(int));
+                    if (recv_master < 0) {
+                        TEST("SCM_RIGHTS recv yields a master fd");
+                        FAIL("recv_master fd not received");
+                    } else {
+                        TEST("TIOCSWINSZ first on SCM_RIGHTS-received master");
+                        struct winsize ws_recv = {
+                            .ws_row = 33,
+                            .ws_col = 101,
+                            .ws_xpixel = 808,
+                            .ws_ypixel = 528,
+                        };
+                        EXPECT_TRUE(
+                            ioctl(recv_master, TIOCSWINSZ, &ws_recv) == 0,
+                            "TIOCSWINSZ before TIOCGPTN on "
+                            "SCM_RIGHTS-received master");
+                        TEST("TIOCGPTN on SCM_RIGHTS-received master");
+                        unsigned int recv_ptyno = (unsigned int) -1;
+                        int gpn_rc = ioctl(recv_master, TIOCGPTN, &recv_ptyno);
+                        EXPECT_TRUE(gpn_rc == 0 && recv_ptyno < 100000u,
+                                    "TIOCGPTN on SCM_RIGHTS-received master");
+                        char recv_pts_path[32];
+                        snprintf(recv_pts_path, sizeof(recv_pts_path),
+                                 "/dev/pts/%u", recv_ptyno);
+                        TEST("stat(/dev/pts/N) after SCM_RIGHTS adoption");
+                        struct stat recv_st;
+                        EXPECT_TRUE(stat(recv_pts_path, &recv_st) == 0 &&
+                                        S_ISCHR(recv_st.st_mode),
+                                    "stat after SCM_RIGHTS adoption failed");
+                        close(recv_master);
+                    }
+                }
+            }
+            close(sp[0]);
+            close(sp[1]);
+        }
+    }
+
+    SUMMARY("test-pty");
+    return fails > 0 ? 1 : 0;
+}
diff --git a/tests/test-xattr.c b/tests/test-xattr.c
new file mode 100644
index 0000000..9ee055e
--- /dev/null
+++ b/tests/test-xattr.c
@@ -0,0 +1,177 @@
+/* lgetxattr / getxattr / setxattr semantics tests
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Pins three properties of the elfuse xattr surface that the host
+ * shim has to translate:
+ *
+ *   1. lgetxattr returns the stored value on a regular file.
+ *   2. lgetxattr on a symlink does not follow the link, so requesting
+ *      an attr stored on the target reports ENODATA. getxattr on the
+ *      same symlink follows and returns the target's value.
+ *   3. A missing attribute reports ENODATA, not EINVAL. macOS returns
+ *      ENOATTR(93) or its synonym ENODATA(96); both must translate to
+ *      Linux ENODATA(61).
+ *
+ * Regression: an earlier revision lacked the ENOATTR translation
+ * entry, so the default in linux_errno() fell through to EINVAL,
+ * which masked real "attribute not present" outcomes and broke
+ * fontconfig / glibc xattr probes.
+ *
+ * Syscalls exercised: setxattr(5), lsetxattr(6), getxattr(8),
+ *                     lgetxattr(9)
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+
+#include "test-harness.h"
+
+int passes = 0, fails = 0;
+
+#define NR_setxattr 5
+#define NR_lsetxattr 6
+#define NR_getxattr 8
+#define NR_lgetxattr 9
+
+static long do_setxattr(const char *path,
+                        const char *name,
+                        const void *val,
+                        size_t sz,
+                        int flags)
+{
+    return syscall(NR_setxattr, path, name, val, sz, flags);
+}
+
+static long do_lsetxattr(const char *path,
+                         const char *name,
+                         const void *val,
+                         size_t sz,
+                         int flags)
+{
+    return syscall(NR_lsetxattr, path, name, val, sz, flags);
+}
+
+static long do_getxattr(const char *path,
+                        const char *name,
+                        void *out,
+                        size_t cap)
+{
+    return syscall(NR_getxattr, path, name, out, cap);
+}
+
+static long do_lgetxattr(const char *path,
+                         const char *name,
+                         void *out,
+                         size_t cap)
+{
+    return syscall(NR_lgetxattr, path, name, out, cap);
+}
+
+static const char tmp_file[] = "/tmp/elfuse-xattr-target";
+static const char tmp_link[] = "/tmp/elfuse-xattr-link";
+
+static void setup(void)
+{
+    unlink(tmp_link);
+    unlink(tmp_file);
+    int fd = open(tmp_file, O_WRONLY | O_CREAT | O_TRUNC, 0600);
+    if (fd < 0)
+        return;
+    (void) !write(fd, "hello\n", 6);
+    close(fd);
+    symlink(tmp_file, tmp_link);
+}
+
+static void teardown(void)
+{
+    unlink(tmp_link);
+    unlink(tmp_file);
+}
+
+static void test_lgetxattr_regular_file(void)
+{
+    TEST("lgetxattr on regular file returns value");
+    const char *attr = "user.elfuse_probe";
+    const char *val = "wired";
+    if (do_setxattr(tmp_file, attr, val, strlen(val), 0) != 0) {
+        FAIL("setxattr seed failed");
+        return;
+    }
+    char buf[64] = {0};
+    long r = do_lgetxattr(tmp_file, attr, buf, sizeof(buf));
+    EXPECT_TRUE(r == (long) strlen(val) && memcmp(buf, val, strlen(val)) == 0,
+                "lgetxattr value mismatch");
+}
+
+static void test_lgetxattr_symlink_no_follow(void)
+{
+    TEST("lgetxattr on symlink reports ENODATA, not EINVAL");
+    char buf[64] = {0};
+    errno = 0;
+    long r = do_lgetxattr(tmp_link, "user.elfuse_probe", buf, sizeof(buf));
+    EXPECT_TRUE(r == -1 && errno == ENODATA,
+                "expected ENODATA from lgetxattr on bare symlink");
+}
+
+static void test_getxattr_symlink_follows(void)
+{
+    TEST("getxattr on symlink follows to target");
+    char buf[64] = {0};
+    long r = do_getxattr(tmp_link, "user.elfuse_probe", buf, sizeof(buf));
+    const char *val = "wired";
+    EXPECT_TRUE(r == (long) strlen(val) && memcmp(buf, val, strlen(val)) == 0,
+                "getxattr value mismatch");
+}
+
+static void test_lgetxattr_symlink_with_attr(void)
+{
+    TEST("lgetxattr returns symlink-owned attr after lsetxattr");
+    const char *attr = "user.elfuse_probe";
+    const char *lval = "link-val";
+    if (do_lsetxattr(tmp_link, attr, lval, strlen(lval), 0) != 0) {
+        printf("SKIP (lsetxattr on symlink unsupported: errno=%d)\n", errno);
+        passes++;
+        return;
+    }
+    char buf[64] = {0};
+    long r = do_lgetxattr(tmp_link, attr, buf, sizeof(buf));
+    EXPECT_TRUE(
+        r == (long) strlen(lval) && memcmp(buf, lval, strlen(lval)) == 0,
+        "lgetxattr did not return symlink-owned value");
+}
+
+static void test_lgetxattr_missing(void)
+{
+    TEST("lgetxattr on missing attr reports ENODATA");
+    char buf[64] = {0};
+    errno = 0;
+    long r = do_lgetxattr(tmp_file, "user.no_such_attr_xyz", buf, sizeof(buf));
+    EXPECT_TRUE(r == -1 && errno == ENODATA,
+                "expected ENODATA from lgetxattr on missing attr");
+}
+
+int main(void)
+{
+    printf("test-xattr: lgetxattr / getxattr / setxattr semantics\n");
+
+    setup();
+
+    test_lgetxattr_regular_file();
+    test_lgetxattr_symlink_no_follow();
+    test_getxattr_symlink_follows();
+    test_lgetxattr_symlink_with_attr();
+    test_lgetxattr_missing();
+
+    teardown();
+
+    SUMMARY("test-xattr");
+    return fails == 0 ? 0 : 1;
+}