diff --git a/abi/snapshot.json b/abi/snapshot.json index bffd4f1c5..24c8fc974 100644 --- a/abi/snapshot.json +++ b/abi/snapshot.json @@ -242,6 +242,56 @@ "number": 386 } ], + "io_multiplexing": { + "epoll_events": [ + { + "name": "EPOLLIN", + "value": 1 + }, + { + "name": "EPOLLOUT", + "value": 4 + }, + { + "name": "EPOLLERR", + "value": 8 + }, + { + "name": "EPOLLHUP", + "value": 16 + } + ], + "poll_events": [ + { + "name": "POLLIN", + "value": 1 + }, + { + "name": "POLLPRI", + "value": 2 + }, + { + "name": "POLLOUT", + "value": 4 + }, + { + "name": "POLLERR", + "value": 8 + }, + { + "name": "POLLHUP", + "value": 16 + }, + { + "name": "POLLNVAL", + "value": 32 + } + ], + "select": { + "fd_set_bytes": 128, + "fd_setsize": 1024 + } + }, "kernel_exports": [ { "kind": "func", @@ -583,6 +633,11 @@ "name": "kernel_get_exit_status", "signature": "() -> (i32)" }, + { + "kind": "func", + "name": "kernel_get_fd_accept_wake_idx", + "signature": "(i32,i32) -> (i32)" + }, { "kind": "func", "name": "kernel_get_fd_path", @@ -833,6 +888,11 @@ "name": "kernel_inject_connection", "signature": "(i32,i32,i32,i32,i32,i32,i32) -> (i32)" }, + { + "kind": "func", + "name": "kernel_inject_datagram", + "signature": "(i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i64,i32) -> (i32)" + }, { "kind": "func", "name": "kernel_inject_mouse_event", @@ -843,11 +903,21 @@ "name": "kernel_ioctl", "signature": "(i32,i32,i64,i32) -> (i32)" }, + { + "kind": "func", + "name": "kernel_ipc_shm_lookup_mapping", + "signature": "(i64,i64) -> (i32)" + }, { "kind": "func", "name": "kernel_ipc_shm_read_chunk", "signature": "(i32,i32,i64,i32) -> (i32)" }, + { + "kind": "func", + "name": "kernel_ipc_shm_record_mapping", + "signature": "(i64,i32,i32) -> (i32)" + }, { "kind": "func", "name": "kernel_ipc_shm_write_chunk", @@ -863,6 +933,11 @@ "name": "kernel_ipc_shmdt", "signature": "(i32) -> (i32)" }, + { + "kind": "func", + "name": "kernel_ipc_shmdt_addr", + "signature": "(i64) -> (i32)" + }, { "kind": "func", "name": 
"kernel_is_fd_nonblock", @@ -1003,6 +1078,11 @@ "name": "kernel_pick_signal_target_tid", "signature": "(i32,i32) -> (i32)" }, + { + "kind": "func", + "name": "kernel_pick_tcp_listener_target", + "signature": "(i32,i32,i64) -> (i32)" + }, { "kind": "func", "name": "kernel_pipe", @@ -1473,6 +1553,11 @@ "name": "kernel_sysconf", "signature": "(i32) -> (i64)" }, + { + "kind": "func", + "name": "kernel_take_process_timer_cleanup", + "signature": "(i32,i64,i32) -> (i32)" + }, { "kind": "func", "name": "kernel_tcgetattr", @@ -2182,6 +2267,61 @@ }, "wasm_page_size": 65536 }, + "process_snapshot": { + "count_offset": 0, + "count_size": 4, + "record_fields": [ + { + "name": "pid", + "offset": 0, + "size": 4, + "type": "u32" + }, + { + "name": "ppid", + "offset": 4, + "size": 4, + "type": "u32" + }, + { + "name": "uid", + "offset": 8, + "size": 4, + "type": "u32" + }, + { + "name": "gid", + "offset": 12, + "size": 4, + "type": "u32" + }, + { + "name": "vsizeBytes", + "offset": 16, + "size": 8, + "type": "u64" + }, + { + "name": "state", + "offset": 24, + "size": 4, + "type": "u32_ascii" + }, + { + "name": "commLen", + "offset": 28, + "size": 4, + "type": "u32" + }, + { + "name": "cmdlineLen", + "offset": 32, + "size": 4, + "type": "u32" + } + ], + "record_fixed_size": 36 + }, "syscall_arg_descriptors": { "1": [ { @@ -4440,5 +4580,350 @@ "name": "ThreadCancel", "number": 415 } - ] + ], + "vfs_metadata": { + "access_modes": [ + { + "name": "F_OK", + "value": 0 + }, + { + "name": "R_OK", + "value": 4 + }, + { + "name": "W_OK", + "value": 2 + }, + { + "name": "X_OK", + "value": 1 + } + ], + "at_flags": [ + { + "name": "AT_FDCWD", + "value": -100 + }, + { + "name": "AT_SYMLINK_NOFOLLOW", + "value": 256 + }, + { + "name": "AT_REMOVEDIR", + "value": 512 + }, + { + "name": "AT_EMPTY_PATH", + "value": 4096 + } + ], + "dirent_types": [ + { + "name": "DT_UNKNOWN", + "value": 0 + }, + { + "name": "DT_FIFO", + "value": 1 + }, + { + "name": "DT_CHR", + "value": 2 + }, + { + "name": 
"DT_DIR", + "value": 4 + }, + { + "name": "DT_BLK", + "value": 6 + }, + { + "name": "DT_REG", + "value": 8 + }, + { + "name": "DT_LNK", + "value": 10 + }, + { + "name": "DT_SOCK", + "value": 12 + } + ], + "fcntl_commands": [ + { + "name": "F_DUPFD", + "value": 0 + }, + { + "name": "F_GETFD", + "value": 1 + }, + { + "name": "F_SETFD", + "value": 2 + }, + { + "name": "F_GETFL", + "value": 3 + }, + { + "name": "F_SETFL", + "value": 4 + }, + { + "name": "F_GETLK", + "value": 12 + }, + { + "name": "F_SETLK", + "value": 13 + }, + { + "name": "F_SETLKW", + "value": 14 + }, + { + "name": "F_SETOWN", + "value": 8 + }, + { + "name": "F_GETOWN", + "value": 9 + }, + { + "name": "F_DUPFD_CLOEXEC", + "value": 1030 + }, + { + "name": "F_DUPFD_CLOFORK", + "value": 1028 + }, + { + "name": "F_OFD_GETLK", + "value": 36 + }, + { + "name": "F_OFD_SETLK", + "value": 37 + }, + { + "name": "F_OFD_SETLKW", + "value": 38 + } + ], + "fd_flags": [ + { + "name": "FD_CLOEXEC", + "value": 1 + }, + { + "name": "FD_CLOFORK", + "value": 2 + } + ], + "file_modes": [ + { + "name": "S_IFMT", + "value": 61440 + }, + { + "name": "S_IFSOCK", + "value": 49152 + }, + { + "name": "S_IFLNK", + "value": 40960 + }, + { + "name": "S_IFREG", + "value": 32768 + }, + { + "name": "S_IFBLK", + "value": 24576 + }, + { + "name": "S_IFDIR", + "value": 16384 + }, + { + "name": "S_IFCHR", + "value": 8192 + }, + { + "name": "S_IFIFO", + "value": 4096 + }, + { + "name": "S_ISUID", + "value": 2048 + }, + { + "name": "S_ISGID", + "value": 1024 + }, + { + "name": "S_ISVTX", + "value": 512 + }, + { + "name": "S_IRWXU", + "value": 448 + }, + { + "name": "S_IRUSR", + "value": 256 + }, + { + "name": "S_IWUSR", + "value": 128 + }, + { + "name": "S_IXUSR", + "value": 64 + }, + { + "name": "S_IRWXG", + "value": 56 + }, + { + "name": "S_IRGRP", + "value": 32 + }, + { + "name": "S_IWGRP", + "value": 16 + }, + { + "name": "S_IXGRP", + "value": 8 + }, + { + "name": "S_IRWXO", + "value": 7 + }, + { + "name": "S_IROTH", + "value": 4 + }, 
+ { + "name": "S_IWOTH", + "value": 2 + }, + { + "name": "S_IXOTH", + "value": 1 + }, + { + "name": "S_MODE_BITS", + "value": 4095 + } + ], + "open_flags": [ + { + "name": "O_RDONLY", + "value": 0 + }, + { + "name": "O_WRONLY", + "value": 1 + }, + { + "name": "O_RDWR", + "value": 2 + }, + { + "name": "O_ACCMODE", + "value": 3 + }, + { + "name": "O_CREAT", + "value": 64 + }, + { + "name": "O_EXCL", + "value": 128 + }, + { + "name": "O_NOCTTY", + "value": 256 + }, + { + "name": "O_TRUNC", + "value": 512 + }, + { + "name": "O_APPEND", + "value": 1024 + }, + { + "name": "O_NONBLOCK", + "value": 2048 + }, + { + "name": "O_DIRECTORY", + "value": 65536 + }, + { + "name": "O_NOFOLLOW", + "value": 131072 + }, + { + "name": "O_CLOEXEC", + "value": 524288 + }, + { + "name": "O_CLOFORK", + "value": 8388608 + } + ], + "seek_whence": [ + { + "name": "SEEK_SET", + "value": 0 + }, + { + "name": "SEEK_CUR", + "value": 1 + }, + { + "name": "SEEK_END", + "value": 2 + } + ] + }, + "wakeup_events": { + "fields": [ + { + "name": "idx", + "offset": 0, + "size": 4, + "type": "u32" + }, + { + "name": "wakeType", + "offset": 4, + "size": 1, + "type": "u8" + } + ], + "record_size": 5, + "types": [ + { + "bit": 1, + "name": "readable" + }, + { + "bit": 2, + "name": "writable" + }, + { + "bit": 4, + "name": "accept" + } + ] + } } diff --git a/crates/kernel/src/fork.rs b/crates/kernel/src/fork.rs index 89c385030..be2fb10be 100644 --- a/crates/kernel/src/fork.rs +++ b/crates/kernel/src/fork.rs @@ -25,7 +25,7 @@ use crate::fd::{FdEntry, FdTable, OpenFileDescRef}; use crate::lock::LockTable; use crate::memory::{MappedRegion, MemoryLayoutMetadata, MemoryManager}; use crate::ofd::{FileType, OfdTable, OpenFileDesc}; -use crate::process::{Process, ProcessState}; +use crate::process::{Process, ProcessState, ShmMapping}; use crate::signal::{SignalAction, SignalHandler, SignalState}; use crate::socket::SocketTable; use crate::terminal::{NCCS, TerminalState, WinSize}; @@ -40,6 +40,7 @@ const MAX_OFDS: u32 
= 65536; const MAX_ENV_VARS: u32 = 65536; const MAX_ARGV: u32 = 65536; const MAX_PATH_LEN: usize = 1048576; // 1 MiB +const MAX_SHM_MAPPINGS: usize = 4096; const MAX_STRING_LEN: usize = 1048576; // 1 MiB // ── Writer helper ─────────────────────────────────────────────────────────── @@ -546,6 +547,14 @@ pub fn serialize_fork_state(proc: &Process, buf: &mut [u8]) -> Result Result= 4 { + let count = r.read_u32()? as usize; + if count > MAX_SHM_MAPPINGS { + return Err(Errno::EINVAL); + } + shm_mappings = Vec::with_capacity(count); + for _ in 0..count { + let addr = r.read_u32()? as usize; + let shmid = r.read_u32()? as i32; + let size = r.read_u32()? as usize; + shm_mappings.push(ShmMapping { addr, shmid, size }); + } + } + Ok(Process { pid: child_pid, ppid, @@ -1016,6 +1041,7 @@ pub fn deserialize_fork_state(buf: &[u8], child_pid: u32) -> Result Result { // exec wipes any prior framebuffer binding — the new program // must open and mmap /dev/fb0 itself. fb_binding: None, + shm_mappings: Vec::new(), // The fork counter exists as a kernel-side regression guardrail. // Resetting on exec keeps semantics simple: the next spawn-from-this-pid // test starts from a clean slate. 
The plan's regression check inspects @@ -1555,6 +1582,19 @@ mod tests { } } + #[test] + fn test_fork_state_preserves_shm_mappings() { + let mut proc = Process::new(1); + proc.record_shm_mapping(0x20000, 17, 4096); + + let mut buf = vec![0u8; 64 * 1024]; + let written = serialize_fork_state(&proc, &mut buf).unwrap(); + let child = deserialize_fork_state(&buf[..written], 42).unwrap(); + + assert_eq!(child.shm_mapping_at(0x20000).unwrap().shmid, 17); + assert_eq!(child.shm_mapping_at(0x20000).unwrap().size, 4096); + } + #[test] fn test_buffer_too_small() { let proc = Process::new(1); @@ -1574,7 +1614,8 @@ mod tests { #[test] fn test_exec_roundtrip_default_process() { - let proc = Process::new(1); + let mut proc = Process::new(1); + proc.record_shm_mapping(0x20000, 17, 4096); let mut buf = vec![0u8; 64 * 1024]; let written = serialize_exec_state(&proc, &mut buf).unwrap(); assert!(written > 12); @@ -1584,6 +1625,7 @@ mod tests { assert_eq!(restored.pid, 1); assert_eq!(restored.ppid, 0); // default ppid assert_eq!(restored.signals.pending, 0); + assert!(restored.shm_mappings.is_empty()); } #[test] diff --git a/crates/kernel/src/ipc.rs b/crates/kernel/src/ipc.rs index b2f602876..7125c512d 100644 --- a/crates/kernel/src/ipc.rs +++ b/crates/kernel/src/ipc.rs @@ -816,6 +816,17 @@ impl IpcTable { Ok(seg.segsz) } + /// Inherit an existing attachment across fork without rechecking + /// permissions. The parent already passed `shmat`; fork only creates + /// another process reference to the same segment. + pub fn shm_attach_inherited(&mut self, shmid: i32, pid: u32) -> Result<(), Errno> { + let seg = self.shm_segments.get_mut(&shmid).ok_or(Errno::EINVAL)?; + seg.nattch += 1; + seg.lpid = pid as i32; + seg.atime = crate::current_time_secs(); + Ok(()) + } + /// Read a chunk of shared memory segment data into a buffer. /// Returns bytes written. 
pub fn shm_read_chunk(&self, shmid: i32, offset: u32, buf: &mut [u8]) -> Result { @@ -1446,9 +1457,17 @@ mod tests { assert_eq!(info.nattch, 1); assert_eq!(info.lpid, 42); + t.shm_attach_inherited(id, 43).unwrap(); + let info = t.shmctl(id, IPC_STAT, 1, 0, 0).unwrap().unwrap(); + assert_eq!(info.nattch, 2); + assert_eq!(info.lpid, 43); + // Detach t.shmdt(id, 42).unwrap(); let info = t.shmctl(id, IPC_STAT, 1, 0, 0).unwrap().unwrap(); + assert_eq!(info.nattch, 1); + t.shmdt(id, 43).unwrap(); + let info = t.shmctl(id, IPC_STAT, 1, 0, 0).unwrap().unwrap(); assert_eq!(info.nattch, 0); } diff --git a/crates/kernel/src/process.rs b/crates/kernel/src/process.rs index 34ba7c3e9..64e3be192 100644 --- a/crates/kernel/src/process.rs +++ b/crates/kernel/src/process.rs @@ -227,6 +227,20 @@ pub struct FbBinding { pub fmt: u32, } +/// Per-process SysV shared-memory attachment. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ShmMapping { + pub addr: usize, + pub shmid: i32, + pub size: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct HostTimerCleanup { + pub cancel_alarm: bool, + pub posix_timer_ids: Vec, +} + /// Per-thread state within a process. #[derive(Debug, Clone)] pub struct ThreadInfo { @@ -424,6 +438,9 @@ pub struct Process { /// Live mmap of `/dev/fb0`, if any. `Some` between successful /// `mmap` and the matching `munmap`/process-exit/exec. pub fb_binding: Option, + /// SysV shared-memory attachments keyed by the process virtual address + /// returned from `shmat`. + pub shm_mappings: Vec, /// Counts how many times this process has called fork() (parent side, on success). /// Read-only from outside the kernel via `kernel_get_fork_count`. 
/// Used as a regression guardrail by the spawn test suite to confirm @@ -508,6 +525,7 @@ impl Process { procfs_bufs: Vec::new(), has_exec: false, fb_binding: None, + shm_mappings: Vec::new(), fork_count: 0, } } @@ -588,6 +606,47 @@ impl Process { self.threads.iter_mut().find(|t| t.tid == tid) } + /// Record or replace a SysV shared-memory attachment for an address. + pub fn record_shm_mapping(&mut self, addr: usize, shmid: i32, size: usize) { + if let Some(mapping) = self.shm_mappings.iter_mut().find(|m| m.addr == addr) { + *mapping = ShmMapping { addr, shmid, size }; + } else { + self.shm_mappings.push(ShmMapping { addr, shmid, size }); + } + } + + /// Find a SysV shared-memory attachment by its process address. + pub fn shm_mapping_at(&self, addr: usize) -> Option { + self.shm_mappings.iter().copied().find(|m| m.addr == addr) + } + + /// Remove and return a SysV shared-memory attachment by its process address. + pub fn remove_shm_mapping(&mut self, addr: usize) -> Option { + let idx = self.shm_mappings.iter().position(|m| m.addr == addr)?; + Some(self.shm_mappings.swap_remove(idx)) + } + + /// Return host timer handles that should be cancelled for this process and + /// clear the Rust timer state that made them live. + pub fn take_host_timer_cleanup(&mut self) -> HostTimerCleanup { + let cancel_alarm = self.alarm_deadline_ns != 0 || self.alarm_interval_ns != 0; + self.alarm_deadline_ns = 0; + self.alarm_interval_ns = 0; + + let mut posix_timer_ids = Vec::new(); + for (timer_id, slot) in self.posix_timers.iter_mut().enumerate() { + if slot.is_some() { + posix_timer_ids.push(timer_id); + *slot = None; + } + } + + HostTimerCleanup { + cancel_alarm, + posix_timer_ids, + } + } + /// True if `tid` names the process's main thread. 
The main thread's TID /// equals the process PID (Linux convention) and is not tracked in /// [`Process::threads`]; per-thread signal state for the main thread lives @@ -964,6 +1023,69 @@ mod tests { assert_eq!(proc.fork_count(), 0); } + #[test] + fn shm_mapping_bookkeeping_is_keyed_by_process_addr() { + let mut proc = Process::new(1); + + proc.record_shm_mapping(0x20000, 7, 4096); + assert_eq!( + proc.shm_mapping_at(0x20000), + Some(ShmMapping { + addr: 0x20000, + shmid: 7, + size: 4096, + }) + ); + + proc.record_shm_mapping(0x20000, 8, 8192); + assert_eq!(proc.shm_mappings.len(), 1); + assert_eq!( + proc.remove_shm_mapping(0x20000), + Some(ShmMapping { + addr: 0x20000, + shmid: 8, + size: 8192, + }) + ); + assert_eq!(proc.shm_mapping_at(0x20000), None); + } + + #[test] + fn host_timer_cleanup_drains_alarm_and_posix_timer_state() { + let mut proc = Process::new(1); + proc.alarm_deadline_ns = 10; + proc.alarm_interval_ns = 5; + proc.posix_timers.push(Some(PosixTimerState { + clock_id: 0, + sigev_signo: 14, + sigev_value: 0, + interval_sec: 0, + interval_nsec: 0, + value_sec: 1, + value_nsec: 0, + overrun: 0, + })); + proc.posix_timers.push(None); + proc.posix_timers.push(Some(PosixTimerState { + clock_id: 0, + sigev_signo: 15, + sigev_value: 0, + interval_sec: 1, + interval_nsec: 0, + value_sec: 1, + value_nsec: 0, + overrun: 0, + })); + + let cleanup = proc.take_host_timer_cleanup(); + + assert!(cleanup.cancel_alarm); + assert_eq!(cleanup.posix_timer_ids, alloc::vec![0, 2]); + assert_eq!(proc.alarm_deadline_ns, 0); + assert_eq!(proc.alarm_interval_ns, 0); + assert!(proc.posix_timers.iter().all(|slot| slot.is_none())); + } + #[test] fn spawn_child_basic_inherits_cwd_and_returns_pid() { use crate::process_table::ProcessTable; diff --git a/crates/kernel/src/process_table.rs b/crates/kernel/src/process_table.rs index d505ed38e..6d6f3c9e4 100644 --- a/crates/kernel/src/process_table.rs +++ b/crates/kernel/src/process_table.rs @@ -23,6 +23,7 @@ use 
wasm_posix_shared::flags::O_ACCMODE; use crate::ofd::FileType; use crate::process::{Process, ProcessState}; +use crate::socket::SocketState; /// Owning pid of `/dev/fb0`, or `-1` if no process holds it. /// @@ -46,6 +47,8 @@ pub struct ProcessTable { /// The host sets this before each `kernel_handle_channel` when a thread /// worker is the caller. current_tid: u32, + /// Round-robin cursor for host-bridged TCP listener target selection. + tcp_listener_rr: BTreeMap, } /// Outcome of `ProcessTable::remove_process`. Bundles the removed @@ -186,6 +189,11 @@ fn bump_inherited_resource_refcounts(child: &Process) { } } } + + let ipc = unsafe { crate::ipc::global_ipc_table() }; + for mapping in &child.shm_mappings { + let _ = ipc.shm_attach_inherited(mapping.shmid, child.pid); + } } /// Build the fork-only `fork_pipe_replay` table: a list of (read_fd, @@ -220,6 +228,7 @@ impl ProcessTable { processes: BTreeMap::new(), current_pid: 0, current_tid: 0, + tcp_listener_rr: BTreeMap::new(), } } @@ -414,6 +423,13 @@ impl ProcessTable { let pshared = unsafe { crate::pshared::global_pshared_table() }; pshared.cleanup_process(pid); + // Drop SysV shared-memory attachments that were still live when the + // process exited or was reaped. + let ipc = unsafe { crate::ipc::global_ipc_table() }; + for mapping in &proc.shm_mappings { + let _ = ipc.shmdt(mapping.shmid, pid); + } + if retain_limbo_leader && proc.pgid == pid && self.group_has_member(pid) { self.processes.insert(pid, Self::limbo_process_from(&proc)); } @@ -814,6 +830,62 @@ impl ProcessTable { self.processes.get(&pid).map(|proc| proc.ppid) } + /// Pick the process/fd that should receive the next host-bridged TCP + /// connection for `port`. + /// + /// JS still owns the actual `net.Server`/service-worker bridge, but the + /// process table owns which live process currently has an inherited + /// listening socket. 
When children inherited a listener through fork, + /// prefer them over the original parent just as the previous host-side + /// policy did. + pub fn pick_tcp_listener_target(&mut self, port: u16, exclude_pid: u32) -> Option<(u32, i32)> { + let mut targets = self.tcp_listener_targets(port, exclude_pid); + if targets.len() > 1 { + let children: Vec<(u32, i32)> = targets + .iter() + .copied() + .filter(|(pid, _fd)| self.processes.get(pid).is_some_and(|proc| proc.ppid > 0)) + .collect(); + if !children.is_empty() { + targets = children; + } + } + + if targets.is_empty() { + self.tcp_listener_rr.remove(&port); + return None; + } + + let idx = self.tcp_listener_rr.get(&port).copied().unwrap_or(0) % targets.len(); + self.tcp_listener_rr.insert(port, idx + 1); + Some(targets[idx]) + } + + fn tcp_listener_targets(&self, port: u16, exclude_pid: u32) -> Vec<(u32, i32)> { + let mut targets = Vec::new(); + for (&pid, proc) in &self.processes { + if pid == exclude_pid || proc.state != ProcessState::Running { + continue; + } + for (fd, entry) in proc.fd_table.iter() { + let Some(ofd) = proc.ofd_table.get(entry.ofd_ref.0) else { + continue; + }; + if ofd.file_type != FileType::Socket || ofd.host_handle >= 0 { + continue; + } + let sock_idx = (-(ofd.host_handle + 1)) as usize; + let Some(sock) = proc.sockets.get(sock_idx) else { + continue; + }; + if sock.state == SocketState::Listening && sock.bind_port == port { + targets.push((pid, fd)); + } + } + } + targets + } + /// Mark a process as terminated by a host-observed signal death. 
/// /// Used when the worker dies without reaching the normal `SYS_EXIT` @@ -1041,4 +1113,58 @@ mod tests { Some((12, 1 << 8)) ); } + + #[test] + fn tcp_listener_target_policy_prefers_fork_children() { + let mut table = ProcessTable::new(); + table.create_process(10).unwrap(); + table.create_process(11).unwrap(); + table.create_process(12).unwrap(); + table.processes.get_mut(&11).unwrap().ppid = 10; + table.processes.get_mut(&12).unwrap().ppid = 10; + + add_listening_socket(&mut table, 10, 8080, 3); + add_listening_socket(&mut table, 11, 8080, 3); + add_listening_socket(&mut table, 12, 8080, 3); + + assert_eq!(table.pick_tcp_listener_target(8080, 0), Some((11, 3))); + assert_eq!(table.pick_tcp_listener_target(8080, 0), Some((12, 3))); + assert_eq!(table.pick_tcp_listener_target(8080, 0), Some((11, 3))); + } + + #[test] + fn tcp_listener_target_policy_can_exclude_a_process_during_cleanup() { + let mut table = ProcessTable::new(); + table.create_process(10).unwrap(); + table.create_process(11).unwrap(); + table.processes.get_mut(&11).unwrap().ppid = 10; + + add_listening_socket(&mut table, 10, 8080, 3); + add_listening_socket(&mut table, 11, 8080, 3); + + assert_eq!(table.pick_tcp_listener_target(8080, 10), Some((11, 3))); + assert_eq!(table.pick_tcp_listener_target(8080, 11), Some((10, 3))); + assert_eq!(table.pick_tcp_listener_target(9999, 0), None); + } + + fn add_listening_socket(table: &mut ProcessTable, pid: u32, port: u16, fd: i32) { + use crate::fd::OpenFileDescRef; + use crate::socket::{SocketDomain, SocketInfo, SocketState, SocketType}; + use wasm_posix_shared::flags::O_RDWR; + + let proc = table.processes.get_mut(&pid).unwrap(); + let mut sock = SocketInfo::new(SocketDomain::Inet, SocketType::Stream, 0); + sock.state = SocketState::Listening; + sock.bind_port = port; + let sock_idx = proc.sockets.alloc(sock); + let ofd_idx = proc.ofd_table.create( + FileType::Socket, + O_RDWR, + -((sock_idx as i64) + 1), + b"socket".to_vec(), + ); + proc.fd_table + 
.alloc_at_min(OpenFileDescRef(ofd_idx), 0, fd) + .unwrap(); + } } diff --git a/crates/kernel/src/syscalls.rs b/crates/kernel/src/syscalls.rs index 36a6b536a..cfe204316 100644 --- a/crates/kernel/src/syscalls.rs +++ b/crates/kernel/src/syscalls.rs @@ -4202,8 +4202,6 @@ pub fn sys_execveat( path: &[u8], flags: u32, ) -> Result<(), Errno> { - const AT_EMPTY_PATH: u32 = 0x1000; - if flags & AT_EMPTY_PATH != 0 && path.is_empty() { // fexecve path: exec the file referenced by dirfd let entry = proc.fd_table.get(dirfd)?; diff --git a/crates/kernel/src/wakeup.rs b/crates/kernel/src/wakeup.rs index 2e77670a7..02102e020 100644 --- a/crates/kernel/src/wakeup.rs +++ b/crates/kernel/src/wakeup.rs @@ -6,15 +6,16 @@ use alloc::vec::Vec; use core::cell::UnsafeCell; +use wasm_posix_shared::wakeup_event; /// Pipe became readable (data was written, or write-end closed). -pub const WAKE_READABLE: u8 = 1; +pub const WAKE_READABLE: u8 = wakeup_event::TYPE_READABLE; /// Pipe became writable (data was read, or read-end closed). -pub const WAKE_WRITABLE: u8 = 2; +pub const WAKE_WRITABLE: u8 = wakeup_event::TYPE_WRITABLE; /// Listener accept queue received a pending connection. -pub const WAKE_ACCEPT: u8 = 4; +pub const WAKE_ACCEPT: u8 = wakeup_event::TYPE_ACCEPT; /// A readiness change event. 
#[derive(Debug, Clone, Copy)] @@ -68,19 +69,18 @@ pub fn push_accept(accept_idx: u32) { pub fn drain(out: &mut [u8], max_events: u32) -> u32 { let events = unsafe { &mut *WAKEUP_BUFFER.events.get() }; let count = events.len().min(max_events as usize); - let bytes_per_event = 5; - let max_by_buf = out.len() / bytes_per_event; + let max_by_buf = out.len() / wakeup_event::RECORD_SIZE; let count = count.min(max_by_buf); for i in 0..count { let ev = &events[i]; - let offset = i * bytes_per_event; + let offset = i * wakeup_event::RECORD_SIZE; let idx_bytes = ev.idx.to_le_bytes(); - out[offset] = idx_bytes[0]; - out[offset + 1] = idx_bytes[1]; - out[offset + 2] = idx_bytes[2]; - out[offset + 3] = idx_bytes[3]; - out[offset + 4] = ev.wake_type; + out[offset + wakeup_event::IDX_OFFSET] = idx_bytes[0]; + out[offset + wakeup_event::IDX_OFFSET + 1] = idx_bytes[1]; + out[offset + wakeup_event::IDX_OFFSET + 2] = idx_bytes[2]; + out[offset + wakeup_event::IDX_OFFSET + 3] = idx_bytes[3]; + out[offset + wakeup_event::TYPE_OFFSET] = ev.wake_type; } events.clear(); diff --git a/crates/kernel/src/wasm_api.rs b/crates/kernel/src/wasm_api.rs index e90b78e0f..e9b9b0524 100644 --- a/crates/kernel/src/wasm_api.rs +++ b/crates/kernel/src/wasm_api.rs @@ -1620,6 +1620,36 @@ pub extern "C" fn kernel_get_parent_pid(pid: u32) -> i32 { } } +/// Pick the next live process/fd that should receive a host-bridged TCP +/// connection for `port`. +/// +/// Writes `{ u32 pid, i32 fd }` to `out_ptr`; returns 1 if a target was +/// written, 0 if none exists, or negative errno. 
+#[unsafe(no_mangle)] +pub extern "C" fn kernel_pick_tcp_listener_target( + port: u32, + exclude_pid: u32, + out_ptr: *mut u8, +) -> i32 { + if out_ptr.is_null() { + return -(Errno::EFAULT as i32); + } + if port > u16::MAX as u32 { + return -(Errno::EINVAL as i32); + } + + let table = unsafe { &mut *PROCESS_TABLE.0.get() }; + match table.pick_tcp_listener_target(port as u16, exclude_pid) { + Some((pid, fd)) => { + let out = unsafe { core::slice::from_raw_parts_mut(out_ptr, 8) }; + out[0..4].copy_from_slice(&pid.to_le_bytes()); + out[4..8].copy_from_slice(&fd.to_le_bytes()); + 1 + } + None => 0, + } +} + /// Mark a process as signal-terminated without removing it from the table. /// /// Used by the host when the Worker dies before the guest reaches SYS_EXIT. @@ -1878,7 +1908,7 @@ pub extern "C" fn kernel_enum_procs(out_ptr: *mut u8, out_len: u32) -> i32 { // First pass: compute total bytes we need to write so we can fail fast // on a too-small buffer rather than partial-writing. Skip zombies on // the count too so the size estimate matches what we actually emit. 
- const HDR_BYTES: usize = 4 + 4 + 4 + 4 + 4 + 8 + 4 + 4 + 4; // 40 bytes per record + const HDR_BYTES: usize = wasm_posix_shared::process_snapshot::RECORD_FIXED_SIZE; let mut need: usize = 4; // count u32 for pid in &pids { let proc = match table.get(*pid) { @@ -2239,6 +2269,10 @@ pub extern "C" fn kernel_exec_setup(pid: u32) -> i32 { match crate::fork::deserialize_exec_state(&buf[..written], pid) { Ok(new_proc) => { table.get_mut(pid).map(|p| { + let ipc = unsafe { crate::ipc::global_ipc_table() }; + for mapping in &p.shm_mappings { + let _ = ipc.shmdt(mapping.shmid, pid); + } *p = new_proc; p.has_exec = true; }); @@ -3322,7 +3356,7 @@ fn dispatch_channel_syscall(nr: u32, args: &[i64; 6]) -> i32 { } // SYS_SHMAT (345), SYS_SHMDT (346): intercepted by host for process memory management 345 => kernel_ipc_shmat(a1, a2, a3), - 346 => kernel_ipc_shmdt(a1), + 346 => kernel_ipc_shmdt_addr(a1 as usize), 347 => { // SYS_SHMCTL: (shmid, cmd, buf_ptr) let ipc = unsafe { crate::ipc::global_ipc_table() }; @@ -4175,6 +4209,34 @@ pub extern "C" fn kernel_ipc_shmat(shmid: i32, _shmaddr: i32, flags: i32) -> i32 } } +/// Record the process address chosen by the host-managed mmap for a SysV +/// shared-memory attachment. +#[unsafe(no_mangle)] +pub extern "C" fn kernel_ipc_shm_record_mapping(addr: usize, shmid: i32, size: u32) -> i32 { + let (_guard, proc) = unsafe { get_process() }; + proc.record_shm_mapping(addr, shmid, size as usize); + 0 +} + +/// Look up a SysV shared-memory attachment by process address. +/// Writes `{ i32 shmid, u32 size }` to `out_ptr`. 
+#[unsafe(no_mangle)] +pub extern "C" fn kernel_ipc_shm_lookup_mapping(addr: usize, out_ptr: *mut u8) -> i32 { + if out_ptr.is_null() { + return -(Errno::EFAULT as i32); + } + + let (_guard, proc) = unsafe { get_process() }; + let Some(mapping) = proc.shm_mapping_at(addr) else { + return -(Errno::EINVAL as i32); + }; + + let out = unsafe { core::slice::from_raw_parts_mut(out_ptr, 8) }; + out[0..4].copy_from_slice(&mapping.shmid.to_le_bytes()); + out[4..8].copy_from_slice(&(mapping.size as u32).to_le_bytes()); + 0 +} + /// Detach from shared memory segment. /// Host should call kernel_ipc_shm_write_chunk first to sync data back. #[unsafe(no_mangle)] @@ -4187,6 +4249,22 @@ pub extern "C" fn kernel_ipc_shmdt(shmid: i32) -> i32 { } } +/// Detach from a shared-memory segment by process address. +/// Host should call kernel_ipc_shm_lookup_mapping and sync data back first. +#[unsafe(no_mangle)] +pub extern "C" fn kernel_ipc_shmdt_addr(addr: usize) -> i32 { + let ipc = unsafe { crate::ipc::global_ipc_table() }; + let (_guard, proc) = unsafe { get_process() }; + let Some(mapping) = proc.remove_shm_mapping(addr) else { + return -(Errno::EINVAL as i32); + }; + + match ipc.shmdt(mapping.shmid, proc.pid) { + Ok(()) => 0, + Err(e) => -(e as i32), + } +} + /// Read a chunk of shared memory segment data into scratch area. /// Returns bytes written to out_ptr. #[unsafe(no_mangle)] @@ -9143,6 +9221,46 @@ pub extern "C" fn kernel_timer_delete(timerid: i32) -> i32 { 0 } +/// Drain the process-owned timer cleanup list for host-side timer handles. +/// +/// Writes `{ u32 cancel_alarm, u32 posix_count, u32 timer_ids[posix_count] }` +/// to `out_ptr`, clears the Rust timer state, and returns `posix_count`. 
+#[unsafe(no_mangle)] +pub extern "C" fn kernel_take_process_timer_cleanup( + pid: u32, + out_ptr: *mut u8, + max_timer_ids: u32, +) -> i32 { + if out_ptr.is_null() { + return -(Errno::EFAULT as i32); + } + + let table = unsafe { &mut *PROCESS_TABLE.0.get() }; + let Some(proc) = table.get_mut(pid) else { + return -(Errno::ESRCH as i32); + }; + + let timer_count = proc + .posix_timers + .iter() + .filter(|slot| slot.is_some()) + .count(); + if timer_count > max_timer_ids as usize { + return -(Errno::EINVAL as i32); + } + + let cleanup = proc.take_host_timer_cleanup(); + let out_len = 8 + cleanup.posix_timer_ids.len() * 4; + let out = unsafe { core::slice::from_raw_parts_mut(out_ptr, out_len) }; + out[0..4].copy_from_slice(&(cleanup.cancel_alarm as u32).to_le_bytes()); + out[4..8].copy_from_slice(&(cleanup.posix_timer_ids.len() as u32).to_le_bytes()); + for (idx, timer_id) in cleanup.posix_timer_ids.iter().enumerate() { + let offset = 8 + idx * 4; + out[offset..offset + 4].copy_from_slice(&(*timer_id as u32).to_le_bytes()); + } + cleanup.posix_timer_ids.len() as i32 +} + /// Called by the host when a repeating POSIX timer fires to increment the overrun counter. /// This is used for timer_getoverrun() support. #[unsafe(no_mangle)] @@ -9355,13 +9473,18 @@ pub extern "C" fn kernel_get_robust_list(_pid: u32, _head_ptr: usize, _len_ptr: /// thread_exit — clean up thread state in the kernel. /// Called by the host when a thread Worker exits. -/// Removes the thread from the process's thread table. +/// Removes the thread from the process's thread table and returns the +/// CLONE_CHILD_CLEARTID pointer recorded in ThreadInfo, or 0 if no clear-tid +/// wake is needed. 
#[unsafe(no_mangle)] pub extern "C" fn kernel_thread_exit(pid: u32, tid: u32) -> i32 { if crate::is_centralized_mode() { let pt = unsafe { &mut *PROCESS_TABLE.0.get() }; if let Some(proc) = pt.get_mut(pid) { - proc.remove_thread(tid); + return proc + .remove_thread(tid) + .map(|thread| thread.ctid_ptr as i32) + .unwrap_or(0); } } 0 diff --git a/crates/shared/src/lib.rs b/crates/shared/src/lib.rs index b4dc699ec..81adb3c4a 100644 --- a/crates/shared/src/lib.rs +++ b/crates/shared/src/lib.rs @@ -2,6 +2,33 @@ pub mod host_abi; +pub mod process_snapshot { + pub const COUNT_OFFSET: usize = 0; + pub const COUNT_SIZE: usize = 4; + + pub const RECORD_PID_OFFSET: usize = 0; + pub const RECORD_PPID_OFFSET: usize = 4; + pub const RECORD_UID_OFFSET: usize = 8; + pub const RECORD_GID_OFFSET: usize = 12; + pub const RECORD_VSIZE_BYTES_OFFSET: usize = 16; + pub const RECORD_STATE_OFFSET: usize = 24; + pub const RECORD_COMM_LEN_OFFSET: usize = 28; + pub const RECORD_CMDLINE_LEN_OFFSET: usize = 32; + pub const RECORD_FIXED_SIZE: usize = 36; +} + +pub mod wakeup_event { + pub const IDX_OFFSET: usize = 0; + pub const IDX_SIZE: usize = 4; + pub const TYPE_OFFSET: usize = 4; + pub const TYPE_SIZE: usize = 1; + pub const RECORD_SIZE: usize = 5; + + pub const TYPE_READABLE: u8 = 1; + pub const TYPE_WRITABLE: u8 = 2; + pub const TYPE_ACCEPT: u8 = 4; +} + /// Kernel ABI version. /// /// This number is baked into every compiled user program (wasm custom section @@ -475,6 +502,7 @@ pub mod flags { pub const O_ACCMODE: u32 = 3; pub const O_CREAT: u32 = 0o100; pub const O_EXCL: u32 = 0o200; + pub const O_NOCTTY: u32 = 0o400; pub const O_TRUNC: u32 = 0o1000; pub const O_APPEND: u32 = 0o2000; pub const O_NONBLOCK: u32 = 0o4000; @@ -486,6 +514,7 @@ pub mod flags { pub const AT_FDCWD: i32 = -100; pub const AT_SYMLINK_NOFOLLOW: u32 = 0x100; pub const AT_REMOVEDIR: u32 = 0x200; + pub const AT_EMPTY_PATH: u32 = 0x1000; } /// File descriptor flags (FD_*). 
@@ -600,6 +629,14 @@ pub mod poll { pub const POLLNVAL: i16 = 0x0020; } +/// Epoll event constants. +pub mod epoll { + pub const EPOLLIN: u32 = 0x0001; + pub const EPOLLOUT: u32 = 0x0004; + pub const EPOLLERR: u32 = 0x0008; + pub const EPOLLHUP: u32 = 0x0010; +} + /// Seek whence constants. pub mod seek { pub const SEEK_SET: u32 = 0; @@ -639,6 +676,11 @@ pub mod mode { pub const S_IFCHR: u32 = 0o020000; pub const S_IFIFO: u32 = 0o010000; + // Special permission bits + pub const S_ISUID: u32 = 0o4000; + pub const S_ISGID: u32 = 0o2000; + pub const S_ISVTX: u32 = 0o1000; + // Owner permissions pub const S_IRWXU: u32 = 0o700; pub const S_IRUSR: u32 = 0o400; @@ -656,6 +698,8 @@ pub mod mode { pub const S_IROTH: u32 = 0o004; pub const S_IWOTH: u32 = 0o002; pub const S_IXOTH: u32 = 0o001; + + pub const S_MODE_BITS: u32 = S_ISUID | S_ISGID | S_ISVTX | S_IRWXU | S_IRWXG | S_IRWXO; } /// Shared-memory channel layout offsets and sizes. diff --git a/docs/plans/2026-05-20-rust-owned-host-logic-plan.md b/docs/plans/2026-05-20-rust-owned-host-logic-plan.md index e7dd1b948..d08f9dd25 100644 --- a/docs/plans/2026-05-20-rust-owned-host-logic-plan.md +++ b/docs/plans/2026-05-20-rust-owned-host-logic-plan.md @@ -35,7 +35,7 @@ The host must stay responsible for browser and Node platform primitives: Workers 1. **Generated TS ABI constants from `crates/shared`/`xtask dump-abi`.** - Generate `host/src/generated/abi.ts` with ABI version, channel offsets/sizes, status codes, host-intercepted syscall numbers, syscall numbers already in `shared::Syscall`, and marshalled struct sizes. - Use it in `constants.ts`, `kernel-worker.ts`, `kernel.ts`, and the simplest worker-channel writers. - - Keep legacy constants that are not yet in `shared::Syscall` local for now. + - Keep constants that are not yet in `shared::Syscall` local for now. 2. 
**Expand shared syscall metadata coverage.** - Move currently untracked syscall numbers used by TS (`clone`, `futex`, `epoll`, `mq`, SysV IPC, thread cancel, exit_group, etc.) into Rust/shared metadata. @@ -74,7 +74,7 @@ The host must stay responsible for browser and Node platform primitives: Workers | Browser/Node parity | Host lifecycle changes often break one side only. | Migrate shared files first, then update both worker entries in the same PR for lifecycle changes. | | Snapshot churn | Adding snapshot coverage can look like ABI change even when runtime bytes do not change. | Keep first slice generated from existing snapshot/shared data. For new coverage, follow `docs/abi-versioning.md` and classify whether an ABI bump is required. | | V8/browser workarounds | Epoll and wake scheduling have browser-specific failure modes. | Do not remove TS workarounds until browser smoke/Playwright evidence exists. | -| Legacy binaries | Older images depend on stable ABI pins and first-party host adapters. | Keep strict `__abi_version` checks; use additive manifests/bindings without weakening compatibility checks. | +| Version-pinned binaries | Images can depend on stable ABI pins and first-party host adapters. | Keep strict `__abi_version` checks; use additive manifests/bindings without weakening compatibility checks. | ## ABI And Versioning Implications @@ -101,7 +101,7 @@ Implement chunk 1 first. It removes hand-maintained TS constants from the host r ## Living Migration Backlog -Updated: 2026-05-24 +Updated: 2026-05-28 This section is the handoff list for follow-up work. Keep it current as each slice lands so the project does not lose track of what was intentionally left @@ -114,12 +114,53 @@ path. | Done / PR #534 | Rust-owned syscall marshalling descriptors | `crates/shared::host_abi` owns simple pointer-argument descriptors; `dump-abi` generates `SYSCALL_ARGS`; TS host keeps memory copies but reads generated descriptors. 
| The old TS `SYSCALL_ARGS` table and syscall-number size switches are gone. `poll`/`ppoll`, SysV message prefix, `semop`, and `msgrcv` copy-back adjustments are metadata fields. Nested-pointer syscalls (`readv`/`writev`/preadv/pwritev) stay on dedicated TS paths. | Shared unit tests for descriptor ordering/high-risk sizes/nested-pointer exclusion; xtask ABI tests; `bash scripts/check-abi-version.sh`; generated ABI vitest; host build; kernel lib tests. | | Done / PR #534 follow-up | Extended host-visible syscall numbers and names | Add Rust/shared metadata for ABI-visible syscall numbers still hardcoded in host TS but not currently in `shared::Syscall`, such as `getrandom`, `clone`, `futex`, `ppoll`, `pselect6`, epoll, `exit_group`, `waitid`, `msync`, preadv/pwritev, mqueue, SysV IPC, `sched_yield`, `fallocate`, timers, and `thread_cancel`. Generate TS bindings, logging names, and snapshot coverage. | Host TS no longer defines literal syscall numbers for this set, and syscall trace names are generated from Rust-owned metadata. Existing `HOST_INTERCEPTED_SYSCALLS` remains separate for fork/exec/spawn because those are caught before normal dispatch. Public behavior unchanged. | Rust metadata uniqueness tests; xtask compatibility tests; `bash scripts/check-abi-version.sh update` + check; generated ABI vitest; host build; kernel lib tests. | | Done / stacked PR | Rust-defined host adapter manifest | Add a compact Rust-defined manifest describing ABI version, required host adapter protocol version, required/optional exports, worker protocol features, and channel metadata. JS validates it during kernel boot. | Boot fails earlier with clear errors when the host/kernel contract is incompatible. No Worker creation or Wasm instantiation moves out of JS. | Rust manifest serialization tests; ABI snapshot check; vitest boot validation cases; Node/browser worker-entry smoke if boot code changes. 
| -| In progress / stacked PR | Process lifecycle cleanup consolidation | Rust `ProcessTable` now owns parent lookup, wait-target matching, wait-status derivation, host-crash zombie marking, and authorized child reaping. TS keeps only blocked waiter queues plus Worker/memory cleanup. Remaining audit: thread-channel lifecycle, host timer cancellation, TCP listener target policy, and shared-memory mapping cleanup. | Kernel owns process lifecycle invariants that do not require Worker identity; JS owns Worker termination, memory objects, crash observation, and platform callbacks. | ProcessTable unit tests; fork/exec/spawn/clone/wait tests; crash/trap tests; browser parity smoke when worker entries change. | -| Planned | IPC/resource cleanup in Rust | Move remaining pure SysV IPC and POSIX mqueue lifetime/cleanup state into Rust-owned process cleanup paths. | `remove_process()` owns IPC cleanup; JS only wakes or schedules blocked channels when host primitives are involved. | SysV IPC and mqueue Rust tests plus host integration/e2e coverage for blocking and cleanup. | -| Planned | Readiness metadata improvements | Replace broad host inference with kernel-emitted readiness events for pipe/socket/poll/select cases where the kernel already knows state changes. | JS still owns timers/retry queues/`Atomics.waitAsync`, but readiness decisions are less inferred from syscall numbers. No extra Wasm round trip per syscall. | Pipe/socket/poll/select/ppoll/pselect tests; browser bridge smoke for affected wake paths; performance comparison before removing broad fallback logic. | -| Planned | VFS policy split | Keep backend I/O, OPFS/IndexedDB/fetch, Node `fs`, and lazy archive materialization in JS. Move permission and policy decisions into Rust where process uid/gid/umask/fd context is authoritative. | Guest-visible policy is enforced in Rust; host adapters only perform platform operations requested through a checked contract. 
| VFS unit tests, uid/gid/permission tests, host-fs metadata tests, default mount tests, Node/browser parity tests. | -| Planned | Procfs/process snapshot schema metadata | Generate binary process snapshot schema/constants consumed by TS UI decoding, or replace TS decoding with a Rust-exported stable formatter if that does not add hot-path cost. | TS no longer hand-decodes undocumented offsets for kernel process snapshot data. Procfs text formatting remains Rust-owned. | Rust procfs/process snapshot tests; generated ABI vitest; UI/kernel-host tests that consume snapshots. | +| Done / stacked PR | Process lifecycle cleanup consolidation | Rust `ProcessTable` now owns parent lookup, wait-target matching, wait-status derivation, host-crash zombie marking, authorized child reaping, thread-exit clear-tid metadata, SysV shared-memory attachment metadata, host-bridged TCP listener target policy, and process-owned host timer cleanup metadata. TS keeps blocked waiter queues, Worker/memory cleanup, platform timer handles, process-memory writes/futex wakeups, thread channel allocation/free-list state, and the actual TCP server objects because those are host primitives. | Kernel owns process lifecycle invariants that do not require Worker identity; JS owns Worker termination, memory objects, crash observation, channel allocation, and platform callbacks. | ProcessTable unit tests; fork/exec/spawn/clone/wait tests; crash/trap tests; browser parity smoke when worker entries change. | +| Done / stacked PR | IPC/resource cleanup in Rust | Rust `Process` now records `shmat` address -> segment metadata, inherits it through fork, clears it across exec setup, and detaches live mappings from `remove_process()`. Rust also owns mqueue tables and process cleanup. TS still copies bytes between guest memory and kernel SysV segments and drains mqueue notifications because only the host can address guest `Memory` and wake host channels. 
| `remove_process()` owns IPC attachment cleanup; JS only handles guest-memory transfer, signal/notification delivery, and host primitive wake/schedule work. | SysV IPC and mqueue Rust tests plus host integration/e2e coverage for blocking and cleanup. | +| Done / stacked PR | Readiness metadata improvements | Rust/shared now owns the kernel wakeup event record layout, wake-type bits, poll/epoll event bits, and `fd_set` sizing consumed by the host. Kernel wakeup events now target matching poll retries by pipe index before falling back to the broad retry pass; select/pselect targeting, signal-safe ppoll/pselect fallback behavior, and epoll mirror removal are deferred because they require browser smoke/performance evidence. | JS still owns timers/retry queues/`Atomics.waitAsync`, but readiness decisions are less inferred from syscall numbers. No extra Wasm round trip per syscall. | Pipe/socket/poll/select/ppoll/pselect tests; browser bridge smoke for affected wake paths; performance comparison before removing broad fallback logic. | +| Done / stacked metadata PR; design deferred | VFS policy split | Rust/shared now owns VFS-visible open flags, `*at` flags, fd/fcntl flags, access modes, file mode bits, dirent types, and seek constants generated into `host/src/generated/abi.ts`; host VFS adapters and the WASI shim consume those generated values. Standalone OPFS worker and vendored SharedFS internals keep local copies because they are entry-point/vendor boundaries. Kernel already enforces uid/gid/umask permission checks from host stat metadata. Remaining work is explicit mount/read-only policy contract design and Node/browser backend parity for enforcing it. | Guest-visible constants and existing permission policy metadata are Rust-owned. Mount/read-only enforcement needs a checked contract before host adapters can become pure platform executors. 
| Generated ABI vitest; `check-abi-version.sh`; VFS unit tests, uid/gid/permission tests, host-fs metadata tests, default mount tests, Node/browser parity tests before changing enforcement behavior. | +| Done / stacked PR | Procfs/process snapshot schema metadata | `crates/shared` owns the binary process snapshot layout, `dump-abi` publishes the schema in `abi/snapshot.json` and generated TS bindings, and `parseProcSnapshots` consumes those generated offsets and sizes. | TS no longer hand-decodes undocumented offsets for kernel process snapshot data. Procfs text formatting remains Rust-owned. | Rust procfs/process snapshot tests; generated ABI vitest; UI/kernel-host tests that consume snapshots. | Deferral rule: if a chunk would move browser/Node primitives, add runtime JS evaluation, or add a Wasm call to every syscall without removing meaningful ABI/security complexity, leave it in JS and document the reason here. + +## Autonomous Stack Exhaustion + +The stacked migration through the VFS metadata slice has exhausted the safe +autonomous work from this plan. The remaining items are design or evidence +gates, not mechanical moves: + +- Worker creation, `WebAssembly.Memory`, thread channel allocation, and channel + free-list ownership are host primitives. +- Guest-memory copies for SysV shared memory and host mqueue notification + delivery still require JS access to process `Memory` and channel wakeups. +- More precise select/pselect targeting and epoll mirror removal need browser + smoke coverage and performance data because the current fallback protects + signal-mask-swapping waits and V8-specific epoll behavior. +- Mount/read-only enforcement needs an explicit mount-table contract shared by + Rust and the Node/browser VFS backends before host adapters can act as pure + platform executors. + +Resume with design docs/tests for one of those contracts before moving more +logic across the Rust/TS boundary. 
+ +## Additional Candidate Work + +These items are the remaining TS/JS logic surfaces that can plausibly move into +Rust-owned kernel state, Rust/shared metadata, or a reusable Rust host-adapter +crate. They are not approved migration chunks yet; evaluate each with focused +designs, tests, and host/API-surface tradeoff notes. + +| Candidate | Why consider it | Boundary notes | +|---|---|---| +| Centralized advisory file-lock ownership with native lock bridge | Rust already supports advisory-lock syscalls: it parses `fcntl`/`flock`, validates access mode, resolves ranges, owns process/OFD context, and has a local Rust `LockTable` fallback. The remaining TS-owned part is the cross-process shared `fcntl`/`flock` table for host-backed files, currently keyed by path hash and reached through `host_fcntl_lock`. Moving that Kandelo-owned host-backed lock state into Rust would shrink duplicated policy and make lock cleanup part of centralized kernel process state. See `docs/plans/2026-06-01-centralized-advisory-file-lock-native-bridge-plan.md`. | There is only a centralized kernel target; do not preserve earlier decentralized/research behavior as a compatibility concern. Keep `host_fcntl_lock` or a successor host hook as a Node/native-file interop bridge: for native-backed VFS files, the host should check/acquire/release native OS file locks so Kandelo processes coordinate with native programs before the kernel grants an internal lock. Browser/memfs hosts can implement the hook as no native-lock surface. Design needs a stable VFS file identity contract, not only path hashes. | +| Nested syscall marshalling descriptors | TS still special-cases nested process-memory layouts such as `readv`/`writev`/`preadv`/`pwritev`, `sendmsg`/`recvmsg`, `fcntl` flock structs, `semctl`, `select`/`pselect`, `ppoll` scalar conversion, and selected `ioctl` payloads. Rust/shared metadata could describe these shapes so future non-JS adapters do not reimplement TS ABI knowledge. 
| The actual copies must stay in the host adapter because they access process `WebAssembly.Memory`. Prefer generated tables or a reusable Rust host-adapter crate over runtime kernel calls on every syscall. | +| WASI Preview 1 translation | `host/src/wasi-shim.ts` maps WASI fd/path/poll/socket/errno surfaces onto the POSIX kernel syscall ABI. Non-JS integrations would otherwise need to port this compatibility layer. | Likely belongs in a Rust host-adapter crate rather than the core process-table kernel. The host still supplies module memory access and syscall submission. | +| Exec/spawn launch planning | TS still reads argv/envp from process memory, resolves relative exec paths, handles `execveat(AT_EMPTY_PATH)`, performs spawn preflight, and Node/browser worker entries handle shebang recursion. Rust could own a launch-plan descriptor so host adapters only resolve/load bytes and instantiate. | Worker creation, module compilation, and byte loading remain host primitives. A design must preserve `posix_spawnp` "file actions exactly once" behavior and `execvpe`/PATH retry semantics. | +| Mount/read-only VFS policy contract | The mount spec carries `readonly`, but enforcement and mount routing are still host-side. Rust already owns process uid/gid/umask and permission checks from host stat metadata. | Host VFS backends remain platform executors. Rust should own guest-visible mount policy and provide a versioned contract for Node/browser adapters, including read-only enforcement, mount flags, path-to-mount identity, and `/proc/mounts` parity. | +| File-backed `mmap`/`msync` descriptors | TS populates file-backed mappings after `mmap`, tracks `MAP_SHARED` regions, flushes `msync`, and cleans tracking on `munmap`. Rust owns virtual address allocation and fd state, so it could own mapping descriptors and writeback policy. | Guest-memory copies and host file I/O remain host-side. 
Kernel should emit enough mapping/writeback commands for adapters to perform the copy without duplicating policy. | +| Readiness, `select`/`pselect`, and `epoll` descriptors | Rust now emits targeted wakeup metadata, but TS still owns retry queues, timeout policy, signal-safe wake grace, and an epoll interest mirror because of V8/browser evidence. | Timers and `Atomics.waitAsync` remain host-side. More precise descriptors need browser smoke/perf data before removing broad fallbacks or the epoll mirror. | +| Signal delivery event ABI | Rust owns signal state, but TS still copies delivery records into process channels, wakes blocked peers after `kill`, handles signal-death follow-up, and drains mqueue notification signals. | The host must still wake channels and touch process memory. Rust/shared can define a compact event/command ABI so adapters implement a generic signal delivery loop instead of copying TS-specific control flow. | +| Futex and cancellation wait descriptors | Futex waits and deferred `pthread_cancel` perturb host wait state (`Atomics.waitAsync`, timers, pipe reader registrations, poll/select retries), but the validation and wake/cancel target descriptions are kernel semantics. | Actual waits stay host-side because futex addresses are in process memory. A Rust-owned descriptor/event contract could reduce adapter-specific cancellation logic if it does not add hot-path round trips. | +| Virtual network interface `ioctl` metadata | TS currently fabricates `SIOCGIFCONF`, `SIOCGIFHWADDR`, and `SIOCGIFADDR` responses including interface name, loopback address, and virtual MAC layout. | The host still writes process memory. Rust/shared can own virtual interface inventory and struct layout metadata so adapters do not hardcode Linux `ifreq`/`ifconf` details. | +| Device queue/surface contracts | Rust already owns much of `/dev/fb0`, `/dev/input/mice`, and `/dev/dsp` device semantics and bounded queues. 
Any remaining TS surface should be evaluated for whether it is presentation-only or kernel policy. | DOM/canvas/Web Audio/input event collection stay host-side. Kernel-owned device queue contracts are useful only where they shrink adapter behavior without moving platform presentation into Rust. | diff --git a/docs/plans/2026-06-01-centralized-advisory-file-lock-native-bridge-plan.md b/docs/plans/2026-06-01-centralized-advisory-file-lock-native-bridge-plan.md new file mode 100644 index 000000000..e264418d3 --- /dev/null +++ b/docs/plans/2026-06-01-centralized-advisory-file-lock-native-bridge-plan.md @@ -0,0 +1,201 @@ +# Centralized Advisory File Lock And Native Bridge Plan + +Date: 2026-06-01 + +## Context + +Kandelo has only one supported kernel architecture: a centralized Rust kernel +coordinating process state while JS host adapters provide platform primitives. +Earlier decentralized/research paths are not compatibility targets. + +Advisory file locking is currently split across Rust and TypeScript: + +- Rust already supports the advisory-lock syscall surface. It parses + `fcntl`/`flock` requests, validates access mode, resolves + `SEEK_SET`/`SEEK_CUR`/`SEEK_END`, owns process/OFD context, and releases some + locks during close/exit cleanup. +- TS owns a `SharedLockTable` for host-backed files, keyed by a path hash, so + cross-process Kandelo locks are visible to all workers. +- The kernel calls `host_fcntl_lock` for host-backed files. + +So "TS owns advisory file locking" is shorthand for a narrower issue: Rust owns +most syscall semantics, but host-backed files still delegate conflict detection +and shared lock-table storage to TypeScript. That keeps important Kandelo-owned +policy outside the process table and makes future non-JS host adapters +reimplement the same lock-table semantics. 
At the same time, a host hook remains +valuable: Node hosts can mount native files through Kandelo's VFS, and those +files should eventually coordinate with native OS programs using OS-level +advisory locks. + +## Goals + +- Move Kandelo-owned advisory lock state into Rust centralized kernel state. +- Preserve support for POSIX byte-range `fcntl`, OFD locks, and BSD `flock` + mappings already handled by Rust. +- Replace path-hash identity with a stable VFS file identity contract. +- Keep a host/native lock bridge so Node-backed VFS files can reject or acquire + locks that conflict with native OS processes. +- Make browser/memfs hosts work without native lock support. +- Keep host hooks out of the syscall hot path except for actual lock requests. + +## Non-Goals + +- Do not preserve earlier decentralized/research behavior. +- Do not move native OS file APIs into Rust Wasm. +- Do not require native locking support in browser hosts. +- Do not implement mandatory locking. +- Do not rely on string paths as the final identity for hard links or renames. + +## Current Risk + +Path hashes are not a stable file identity. They miss hard-link equivalence, +can become stale across rename, and are vulnerable to collision. A correct +kernel-owned lock table needs identity from the VFS layer, not only the path +used for open. + +Native POSIX locks also have surprising ownership rules. Classic `fcntl` +record locks are often per native process, not per file descriptor; closing a +native fd can release locks for that file held by the process. A Node host +acting on behalf of many Kandelo processes cannot treat OS locks as the only +source of truth. Rust must still own Kandelo-internal conflict detection, and +the host bridge must mirror or probe native state carefully. + +## Proposed Shape + +### 1. Rust-Owned Kandelo Lock Table + +Add a kernel-wide advisory lock table, likely owned by `ProcessTable` or an +adjacent kernel resource table. It should track: + +- Stable file identity. 
+- Lock owner: POSIX pid owner or OFD owner. +- Lock kind: read, write, unlock. +- Byte range after `l_whence` resolution. +- Source syscall family: `fcntl`, OFD lock, or `flock` mapping if needed for + cleanup/debugging. + +The table should implement conflict detection, lock replacement, unlock range +splitting/removal, `F_GETLK` reporting, close cleanup, exec cleanup where +applicable, and process-exit cleanup. + +### 2. Stable VFS File Identity + +Define a versioned identity passed from host VFS adapters to the kernel for +open files. Preferred shape: + +- Mount/backend id. +- File id from backend metadata, ideally `(st_dev, st_ino)` for native files. +- Optional generation/version where a backend can provide one. +- Fallback resolved path only for backends that cannot expose a stable id. + +The kernel should store this identity on the OFD at open time. Host adapters +must document whether their identities are hard-link aware and rename stable. + +### 3. Native Lock Bridge Capability + +Keep `host_fcntl_lock` or replace it with a narrower, versioned host hook that +is called only for native-lock-capable identities. The hook should answer: + +- Is native locking supported for this file identity? +- Can a requested read/write byte-range lock be acquired without conflicting + with native OS processes? +- Has the host acquired or mirrored the native lock needed for Kandelo's + aggregate internal lock state? +- Can the host release or reconcile the native lock when Kandelo unlocks or + exits? + +For browser/memfs hosts, the hook can report unsupported and Rust should rely +only on the internal Kandelo lock table. + +### 4. Transaction Boundary + +For `F_SETLK`, Rust should: + +1. Resolve/validate the requested lock. +2. Check Kandelo-internal conflicts. +3. Ask the native bridge to acquire/probe if the file identity requires native + coordination. +4. Commit the Rust lock table only after native bridge success. +5. 
Roll back/reconcile native state if a later step fails. + +For `F_UNLCK`, Rust should update the internal table and ask the native bridge +to release or reconcile the aggregate native locks for that file. + +`F_SETLKW` needs a separate wait design. Blocking the centralized kernel or a +JS event loop on a native lock is not acceptable. Prefer nonblocking attempts +plus a host retry/wakeup path, or an async native-lock worker that completes +through the existing blocked-syscall machinery. + +### 5. Native Bridge Backend Strategy + +Do not assume Node has portable native locking built in. Evaluate backend +options separately: + +- POSIX `fcntl`/OFD locks via native addon or helper process. +- `flock` where byte-range locking is not required. +- Platform-specific Windows locking if needed later. +- No-op unsupported capability for browser/memfs. + +Because native lock ownership semantics can be per-process, the bridge should +maintain a host-side mirror of aggregate native locks per file identity. It +must not let OS lock state replace Rust's internal Kandelo conflict table. + +## Migration Slices + +1. **Design and tests for Rust lock table semantics.** + - Add Rust tests for read/read compatibility, write conflicts, + replacement, partial unlock, `F_GETLK`, `SEEK_END`, OFD owner reporting, + close cleanup, and process-exit cleanup. + - No host behavior change. + +2. **Introduce stable VFS file identity.** + - Extend the host/kernel file metadata contract so open OFDs carry stable + identity. + - Use existing `st_dev`/`st_ino` where valid. + - Add tests for hard links, rename, and fallback identity behavior. + +3. **Move Kandelo lock table into Rust.** + - Route host-backed file locking through the Rust table. + - Remove TS `SharedLockTable` conflict decisions once parity tests pass. + - Keep host hook disabled or advisory-only in this slice. + +4. 
**Add native bridge capability negotiation.** + - Extend the host adapter manifest or VFS capability metadata with native + locking support. + - Define return codes and transaction semantics for the hook. + - Add Node tests with an unsupported/no-op bridge first. + +5. **Implement Node native locking backend.** + - Choose backend technology after a focused spike. + - Add integration tests that verify conflicts with a native process or + helper holding a lock. + - Verify cleanup on Kandelo process exit and host teardown. + +6. **Remove obsolete TS lock-table code.** + - Delete `SharedLockTable` only after Rust table and native bridge behavior + cover the current tests. + - Keep only host native-lock backend code and minimal VFS capability glue. + +## Required Tests + +- Rust lock-table unit tests for all conflict/replacement/unlock cases. +- Rust syscall tests for `fcntl`, OFD locks, and `flock` mappings. +- Host integration tests for multiple Kandelo processes locking the same file. +- VFS identity tests covering hard links and rename where backends support them. +- Browser/memfs tests proving unsupported native locking still preserves + Kandelo-internal advisory locks. +- Node native-bridge tests with an external native process/helper holding a + conflicting lock. +- Exit/close/exec cleanup regressions. + +## Open Questions + +- Should native bridge calls receive individual lock operations or a full + desired aggregate lock snapshot for a file identity? +- Should `F_SETLKW` use existing blocked-syscall retry machinery or a dedicated + native-lock waiter? +- Which Node native-locking mechanism is acceptable for project dependencies + and CI? +- What identity fallback is acceptable for backends without stable inode-like + metadata? +- How should lock state be surfaced in diagnostics or procfs, if at all? 
diff --git a/host/src/browser-kernel-worker-entry.ts b/host/src/browser-kernel-worker-entry.ts index 1b491b046..98fa6e33a 100644 --- a/host/src/browser-kernel-worker-entry.ts +++ b/host/src/browser-kernel-worker-entry.ts @@ -96,9 +96,14 @@ import type { MainToKernelMessage, KernelToMainMessage, } from "./browser-kernel-protocol"; +import { FILE_MODES, OPEN_FLAGS } from "./generated/abi"; const PAGE_SIZE = 65536; const FORK_BUF_SIZE = FORK_SAVE_BUFFER_SIZE; +const O_WRONLY_CREAT_TRUNC = + OPEN_FLAGS.O_WRONLY | OPEN_FLAGS.O_CREAT | OPEN_FLAGS.O_TRUNC; +const FILE_PERMISSION_BITS = + FILE_MODES.S_IRWXU | FILE_MODES.S_IRWXG | FILE_MODES.S_IRWXO; // State let kernelWorker: CentralizedKernelWorker; @@ -318,12 +323,12 @@ function overlayEtcFromRootfs(target: MemoryFileSystem, rootfsImage: Uint8Array) // Only handle regular files for now; the canonical images/rootfs/etc/* // is flat (no subdirs, no symlinks). const st = source.stat(sourcePath); - const isRegular = (st.mode & 0xf000) === 0x8000; + const isRegular = (st.mode & FILE_MODES.S_IFMT) === FILE_MODES.S_IFREG; if (!isRegular) continue; // Read full content (sequential — pass null offset for read/write // semantics rather than pread/pwrite). - const fdR = source.open(sourcePath, 0, 0); // O_RDONLY + const fdR = source.open(sourcePath, OPEN_FLAGS.O_RDONLY, 0); const size = st.size; const buf = new Uint8Array(size); let read = 0; @@ -335,7 +340,11 @@ function overlayEtcFromRootfs(target: MemoryFileSystem, rootfsImage: Uint8Array) source.close(fdR); // Write into target. 
- const fdW = target.open(targetPath, 0o1101 /* O_WRONLY|O_CREAT|O_TRUNC */, st.mode & 0o777); + const fdW = target.open( + targetPath, + O_WRONLY_CREAT_TRUNC, + st.mode & FILE_PERMISSION_BITS, + ); if (read > 0) target.write(fdW, buf.subarray(0, read), null, read); target.close(fdW); } @@ -429,7 +438,11 @@ async function handleInit(msg: Extract) { try { memfs.mkdir(dir, 0o755); } catch { /* exists */ } } const certBytes = new TextEncoder().encode(caCertPem); - const certFd = memfs.open("/etc/ssl/certs/ca-certificates.crt", 0o1101, 0o644); + const certFd = memfs.open( + "/etc/ssl/certs/ca-certificates.crt", + O_WRONLY_CREAT_TRUNC, + 0o644, + ); memfs.write(certFd, certBytes, 0, certBytes.length); memfs.close(certFd); } catch (e) { diff --git a/host/src/generated/abi.ts b/host/src/generated/abi.ts index 2765519a8..862440a62 100644 --- a/host/src/generated/abi.ts +++ b/host/src/generated/abi.ts @@ -103,6 +103,151 @@ export const CH_SIG_HANDLER = 65564 as const; export const CH_SIG_FLAGS = 65568 as const; export const CH_SIG_OLD_MASK = 65576 as const; +export const PROC_SNAPSHOT_COUNT_OFFSET = 0 as const; +export const PROC_SNAPSHOT_COUNT_SIZE = 4 as const; +export const PROC_SNAPSHOT_RECORD_FIXED_SIZE = 36 as const; +export const PROC_SNAPSHOT_RECORD_FIELDS = { + pid: { offset: 0, size: 4, type: "u32" }, + ppid: { offset: 4, size: 4, type: "u32" }, + uid: { offset: 8, size: 4, type: "u32" }, + gid: { offset: 12, size: 4, type: "u32" }, + vsizeBytes: { offset: 16, size: 8, type: "u64" }, + state: { offset: 24, size: 4, type: "u32_ascii" }, + commLen: { offset: 28, size: 4, type: "u32" }, + cmdlineLen: { offset: 32, size: 4, type: "u32" }, +} as const; + +export const WAKEUP_EVENT_RECORD_SIZE = 5 as const; +export const WAKEUP_EVENT_TYPE_READABLE = 1 as const; +export const WAKEUP_EVENT_TYPE_WRITABLE = 2 as const; +export const WAKEUP_EVENT_TYPE_ACCEPT = 4 as const; +export const WAKEUP_EVENT_TYPES = { + readable: WAKEUP_EVENT_TYPE_READABLE, + writable: 
WAKEUP_EVENT_TYPE_WRITABLE, + accept: WAKEUP_EVENT_TYPE_ACCEPT, +} as const; +export const WAKEUP_EVENT_FIELDS = { + idx: { offset: 0, size: 4, type: "u32" }, + wakeType: { offset: 4, size: 1, type: "u8" }, +} as const; + +export const POLL_EVENTS = { + POLLIN: 1, + POLLPRI: 2, + POLLOUT: 4, + POLLERR: 8, + POLLHUP: 16, + POLLNVAL: 32, +} as const; + +export const EPOLL_EVENTS = { + EPOLLIN: 1, + EPOLLOUT: 4, + EPOLLERR: 8, + EPOLLHUP: 16, +} as const; + +export const SELECT_FD_SETSIZE = 1024 as const; +export const SELECT_FD_SET_BYTES = 128 as const; + +export const OPEN_FLAGS = { + O_RDONLY: 0, + O_WRONLY: 1, + O_RDWR: 2, + O_ACCMODE: 3, + O_CREAT: 64, + O_EXCL: 128, + O_NOCTTY: 256, + O_TRUNC: 512, + O_APPEND: 1024, + O_NONBLOCK: 2048, + O_DIRECTORY: 65536, + O_NOFOLLOW: 131072, + O_CLOEXEC: 524288, + O_CLOFORK: 8388608, +} as const; + +export const AT_FLAGS = { + AT_FDCWD: -100, + AT_SYMLINK_NOFOLLOW: 256, + AT_REMOVEDIR: 512, + AT_EMPTY_PATH: 4096, +} as const; + +export const FD_FLAGS = { + FD_CLOEXEC: 1, + FD_CLOFORK: 2, +} as const; + +export const FCNTL_COMMANDS = { + F_DUPFD: 0, + F_GETFD: 1, + F_SETFD: 2, + F_GETFL: 3, + F_SETFL: 4, + F_GETLK: 12, + F_SETLK: 13, + F_SETLKW: 14, + F_SETOWN: 8, + F_GETOWN: 9, + F_DUPFD_CLOEXEC: 1030, + F_DUPFD_CLOFORK: 1028, + F_OFD_GETLK: 36, + F_OFD_SETLK: 37, + F_OFD_SETLKW: 38, +} as const; + +export const ACCESS_MODES = { + F_OK: 0, + R_OK: 4, + W_OK: 2, + X_OK: 1, +} as const; + +export const FILE_MODES = { + S_IFMT: 61440, + S_IFSOCK: 49152, + S_IFLNK: 40960, + S_IFREG: 32768, + S_IFBLK: 24576, + S_IFDIR: 16384, + S_IFCHR: 8192, + S_IFIFO: 4096, + S_ISUID: 2048, + S_ISGID: 1024, + S_ISVTX: 512, + S_IRWXU: 448, + S_IRUSR: 256, + S_IWUSR: 128, + S_IXUSR: 64, + S_IRWXG: 56, + S_IRGRP: 32, + S_IWGRP: 16, + S_IXGRP: 8, + S_IRWXO: 7, + S_IROTH: 4, + S_IWOTH: 2, + S_IXOTH: 1, + S_MODE_BITS: 4095, +} as const; + +export const DIRENT_TYPES = { + DT_UNKNOWN: 0, + DT_FIFO: 1, + DT_CHR: 2, + DT_DIR: 4, + DT_BLK: 6, + DT_REG: 8, 
+ DT_LNK: 10, + DT_SOCK: 12, +} as const; + +export const SEEK_WHENCE = { + SEEK_SET: 0, + SEEK_CUR: 1, + SEEK_END: 2, +} as const; + export const STRUCT_SIZE_WASM_STAT = 88 as const; export const STRUCT_SIZE_WASM_DIRENT = 16 as const; export const STRUCT_SIZE_WASM_TIMESPEC = 16 as const; diff --git a/host/src/kernel-worker.ts b/host/src/kernel-worker.ts index 2b854391e..8a8998b69 100644 --- a/host/src/kernel-worker.ts +++ b/host/src/kernel-worker.ts @@ -34,6 +34,7 @@ import { ABI_KERNEL_EXPORT, ABI_SYSCALL_NAMES, ABI_SYSCALLS, + AT_FLAGS, CHANNEL_STATUS_COMPLETE, CHANNEL_STATUS_IDLE, CHANNEL_STATUS_PENDING, @@ -52,8 +53,18 @@ import { CH_STATUS, CH_SYSCALL, CH_TOTAL_SIZE, + EPOLL_EVENTS, HOST_INTERCEPTED_SYSCALLS, + POLL_EVENTS, + PROC_SNAPSHOT_COUNT_OFFSET, + PROC_SNAPSHOT_COUNT_SIZE, + PROC_SNAPSHOT_RECORD_FIELDS, + PROC_SNAPSHOT_RECORD_FIXED_SIZE, + SELECT_FD_SET_BYTES, SYSCALL_ARGS, + WAKEUP_EVENT_FIELDS, + WAKEUP_EVENT_RECORD_SIZE, + WAKEUP_EVENT_TYPES, type SyscallArgDesc, } from "./generated/abi"; import { validateKernelHostAdapterManifest } from "./host-adapter-manifest"; @@ -322,22 +333,26 @@ export interface ProcessSnapshot { } function parseProcSnapshots(mem: Uint8Array): ProcessSnapshot[] { - if (mem.byteLength < 4) return []; + if (mem.byteLength < PROC_SNAPSHOT_COUNT_SIZE) return []; const dv = new DataView(mem.buffer, mem.byteOffset, mem.byteLength); - const count = dv.getUint32(0, true); - let off = 4; + const count = dv.getUint32(PROC_SNAPSHOT_COUNT_OFFSET, true); + let off = PROC_SNAPSHOT_COUNT_SIZE; const out: ProcessSnapshot[] = []; const dec = new TextDecoder("utf-8", { fatal: false }); for (let i = 0; i < count; i++) { - if (off + 36 > mem.byteLength) break; - const pid = dv.getUint32(off, true); off += 4; - const ppid = dv.getUint32(off, true); off += 4; - const uid = dv.getUint32(off, true); off += 4; - const gid = dv.getUint32(off, true); off += 4; - const vsizeBytes = Number(dv.getBigUint64(off, true)); off += 8; - const state = 
String.fromCharCode(dv.getUint32(off, true)) as ProcessSnapshot["state"]; off += 4; - const commLen = dv.getUint32(off, true); off += 4; - const cmdLen = dv.getUint32(off, true); off += 4; + if (off + PROC_SNAPSHOT_RECORD_FIXED_SIZE > mem.byteLength) break; + const record = off; + const pid = dv.getUint32(record + PROC_SNAPSHOT_RECORD_FIELDS.pid.offset, true); + const ppid = dv.getUint32(record + PROC_SNAPSHOT_RECORD_FIELDS.ppid.offset, true); + const uid = dv.getUint32(record + PROC_SNAPSHOT_RECORD_FIELDS.uid.offset, true); + const gid = dv.getUint32(record + PROC_SNAPSHOT_RECORD_FIELDS.gid.offset, true); + const vsizeBytes = Number(dv.getBigUint64(record + PROC_SNAPSHOT_RECORD_FIELDS.vsizeBytes.offset, true)); + const state = String.fromCharCode( + dv.getUint32(record + PROC_SNAPSHOT_RECORD_FIELDS.state.offset, true), + ) as ProcessSnapshot["state"]; + const commLen = dv.getUint32(record + PROC_SNAPSHOT_RECORD_FIELDS.commLen.offset, true); + const cmdLen = dv.getUint32(record + PROC_SNAPSHOT_RECORD_FIELDS.cmdlineLen.offset, true); + off += PROC_SNAPSHOT_RECORD_FIXED_SIZE; if (off + commLen + cmdLen > mem.byteLength) break; const comm = dec.decode(mem.subarray(off, off + commLen)); off += commLen; @@ -649,8 +664,6 @@ export class CentralizedKernelWorker { retVal: number; errVal: number; }>(); - /** Maps "pid:tid" to ctidPtr for CLONE_CHILD_CLEARTID on thread exit */ - private threadCtidPtrs = new Map(); /** TCP listeners: "pid:fd" → { server, pid, port, connections } */ private tcpListeners = new Map>(); private lockTable: SharedLockTable | null = null; - /** Per-process shared memory mappings: pid → Map */ - private shmMappings = new Map>(); /** PTY index → pid mapping (for draining output after syscalls) */ private ptyIndexByPid = new Map(); @@ -1501,20 +1512,7 @@ export class CentralizedKernelWorker { this.processes.delete(pid); this.stdinFinite.delete(pid); this.stdinBuffers.delete(pid); - // Cancel any pending alarm timer for this process - const alarmTimer = 
this.alarmTimers.get(pid); - if (alarmTimer) { - clearTimeout(alarmTimer); - this.alarmTimers.delete(pid); - } - // Cancel any pending posix timers for this process - for (const [key, entry] of this.posixTimers) { - if (key.startsWith(`${pid}:`)) { - clearTimeout(entry.timeout); - if (entry.interval) clearInterval(entry.interval); - this.posixTimers.delete(key); - } - } + this.cancelProcessHostTimers(pid); // Cancel any pending sleep timer for this process const sleepTimer = this.pendingSleeps.get(pid); if (sleepTimer) { @@ -1534,6 +1532,59 @@ export class CentralizedKernelWorker { this.hostReaped.delete(pid); } + private cancelProcessHostTimers(pid: number): void { + const cleanup = this.takeRustProcessTimerCleanup(pid); + if (cleanup !== undefined) { + if (cleanup.cancelAlarm) { + const alarmTimer = this.alarmTimers.get(pid); + if (alarmTimer) clearTimeout(alarmTimer); + this.alarmTimers.delete(pid); + } + for (const timerId of cleanup.posixTimerIds) { + const key = `${pid}:${timerId}`; + const entry = this.posixTimers.get(key); + if (entry) { + clearTimeout(entry.timeout); + if (entry.interval) clearInterval(entry.interval); + this.posixTimers.delete(key); + } + } + return; + } + + const alarmTimer = this.alarmTimers.get(pid); + if (alarmTimer) { + clearTimeout(alarmTimer); + this.alarmTimers.delete(pid); + } + for (const [key, entry] of this.posixTimers) { + if (key.startsWith(`${pid}:`)) { + clearTimeout(entry.timeout); + if (entry.interval) clearInterval(entry.interval); + this.posixTimers.delete(key); + } + } + } + + private takeRustProcessTimerCleanup(pid: number): { cancelAlarm: boolean; posixTimerIds: number[] } | undefined { + const takeCleanup = this.kernelInstance!.exports.kernel_take_process_timer_cleanup as + ((pid: number, outPtr: bigint, maxTimerIds: number) => number) | undefined; + if (!takeCleanup) return undefined; + + const maxTimerIds = Math.floor((SCRATCH_SIZE - 8) / 4); + const result = takeCleanup(pid, BigInt(this.scratchOffset), 
maxTimerIds); + if (result < 0) return undefined; + + const view = new DataView(this.kernelMemory!.buffer, this.scratchOffset); + const cancelAlarm = view.getUint32(0, true) !== 0; + const count = Math.min(view.getUint32(4, true), maxTimerIds); /* never read more ids than the kernel was allowed to write */ + const posixTimerIds: number[] = []; + for (let i = 0; i < count; i++) { + posixTimerIds.push(view.getUint32(8 + i * 4, true)); + } + return { cancelAlarm, posixTimerIds }; + } + /** * Run kernel-side exec setup: close CLOEXEC fds, reset signal handlers. * Returns 0 on success, negative errno on failure. @@ -2814,25 +2865,47 @@ export class CentralizedKernelWorker { } private wakeBlockedPoll(pid: number, pipeIdx: number): void { + this.wakeBlockedPollRetriesForPipe(pipeIdx, pid); + } + + private clearPendingPollRetryTimer(entry: { timer: any }): void { + if (entry.timer !== null) { + clearTimeout(entry.timer); + if (typeof clearImmediate === "function") clearImmediate(entry.timer); /* clearImmediate is not a standard browser global */ + } + } + + private wakeBlockedPollRetriesForPipe( + pipeIdx: number, + pidFilter?: number, + options: { deferSignalSafe?: boolean } = {}, + ): boolean { // retrySyscall runs handleSyscall synchronously, which can re-insert // the same key via pendingPollRetries.set when the kernel returns // EAGAIN. JS Map iterators are not snapshots — re-inserted entries - // appear at the new tail and the iterator yields them, livelocking - // wakeBlockedPoll-hit / poll / poll-register inside one tick. Mirror + // appear at the new tail and a live iterator yields them, livelocking + // wakeup-event / poll / poll-register inside one tick. Mirror // wakeAllBlockedRetries' snapshot-and-skip-if-replaced pattern. 
+ let deferredSignalSafeWake = false; const matches = Array.from(this.pendingPollRetries.entries()).filter( - ([, e]) => e.channel.pid === pid && e.pipeIndices.includes(pipeIdx), + ([, entry]) => ( + (pidFilter === undefined || entry.channel.pid === pidFilter) + && entry.pipeIndices.includes(pipeIdx) + ), ); for (const [key, entry] of matches) { if (this.pendingPollRetries.get(key) !== entry) continue; - if (entry.timer !== null) { - clearTimeout(entry.timer); + if (options.deferSignalSafe && entry.needsSignalSafeWake) { + deferredSignalSafeWake = true; + continue; } + this.clearPendingPollRetryTimer(entry); this.pendingPollRetries.delete(key); - if (this.processes.has(pid)) { + if (this.processes.has(entry.channel.pid)) { this.retrySyscall(entry.channel); } } + return deferredSignalSafeWake; } /** @@ -2867,15 +2940,7 @@ export class CentralizedKernelWorker { } } // 2. Blocked pollers watching this pipe - for (const [key, entry] of this.pendingPollRetries) { - if (pidFilter !== undefined && entry.channel.pid !== pidFilter) continue; - if (!entry.pipeIndices.includes(pipeIdx)) continue; - if (entry.timer !== null) clearTimeout(entry.timer); - this.pendingPollRetries.delete(key); - if (this.processes.has(entry.channel.pid)) { - this.retrySyscall(entry.channel); - } - } + this.wakeBlockedPollRetriesForPipe(pipeIdx, pidFilter); // 3. Broad wake for any other pending retries this.scheduleWakeBlockedRetries(); } @@ -2884,7 +2949,8 @@ export class CentralizedKernelWorker { * Public wake helper for host-side pipe reads (response pump in * the TCP/HTTP bridges). Call this AFTER directly reading data * from a pipe so any process blocked writing because the pipe was - * full can resume, plus a broad wake. + * full, or polling that pipe for writability, can resume. A broad + * wake still runs as a fallback for wait classes without pipe indices. 
*/ public notifyPipeWritable(pipeIdx: number): void { const writers = this.pendingPipeWriters.get(pipeIdx); @@ -2896,6 +2962,7 @@ export class CentralizedKernelWorker { } } } + this.wakeBlockedPollRetriesForPipe(pipeIdx); this.scheduleWakeBlockedRetries(); } @@ -2933,25 +3000,28 @@ export class CentralizedKernelWorker { if (!drainFn) return; const MAX_EVENTS = 256; - const BYTES_PER_EVENT = 5; - const bufSize = MAX_EVENTS * BYTES_PER_EVENT; + const bufSize = MAX_EVENTS * WAKEUP_EVENT_RECORD_SIZE; const count = drainFn(BigInt(this.scratchOffset), bufSize, MAX_EVENTS); if (count === 0) return; const kernelMem = new Uint8Array(this.kernelMemory!.buffer); - const WAKE_READABLE = 1; - const WAKE_WRITABLE = 2; - const WAKE_ACCEPT = 4; + const wakeIdxField = WAKEUP_EVENT_FIELDS.idx; + const wakeTypeField = WAKEUP_EVENT_FIELDS.wakeType; let needBroadWake = false; + let needSignalSafeDeferredWake = false; for (let i = 0; i < count; i++) { - const off = this.scratchOffset + i * BYTES_PER_EVENT; - const wakeIdx = kernelMem[off] | (kernelMem[off + 1] << 8) | - (kernelMem[off + 2] << 16) | (kernelMem[off + 3] << 24); - const wakeType = kernelMem[off + 4]; - - if (wakeType & WAKE_READABLE) { + const off = this.scratchOffset + i * WAKEUP_EVENT_RECORD_SIZE; + const idxOff = off + wakeIdxField.offset; + const wakeIdx = + kernelMem[idxOff] | + (kernelMem[idxOff + 1] << 8) | + (kernelMem[idxOff + 2] << 16) | + (kernelMem[idxOff + 3] << 24); + const wakeType = kernelMem[off + wakeTypeField.offset]; + + if (wakeType & WAKEUP_EVENT_TYPES.readable) { // Pipe became readable — wake pending readers on this pipe const readers = this.pendingPipeReaders.get(wakeIdx); if (readers && readers.length > 0) { @@ -2964,7 +3034,7 @@ export class CentralizedKernelWorker { } } - if (wakeType & WAKE_WRITABLE) { + if (wakeType & WAKEUP_EVENT_TYPES.writable) { // Pipe became writable — wake pending writers on this pipe const writers = this.pendingPipeWriters.get(wakeIdx); if (writers && writers.length > 
0) { @@ -2977,7 +3047,13 @@ export class CentralizedKernelWorker { } } - if (wakeType & WAKE_ACCEPT) { + if ((wakeType & WAKEUP_EVENT_TYPES.readable) || (wakeType & WAKEUP_EVENT_TYPES.writable)) { + if (this.wakeBlockedPollRetriesForPipe(wakeIdx, undefined, { deferSignalSafe: true })) { + needSignalSafeDeferredWake = true; + } + } + + if (wakeType & WAKEUP_EVENT_TYPES.accept) { this.wakeBlockedAccept(wakeIdx); } @@ -3004,12 +3080,12 @@ export class CentralizedKernelWorker { // time to land. Kill-triggered wakes (line ~2050) always use the // immediate setImmediate path — by the time kill has been processed // the signal is already queued, so there's no race. Pipe - // reader/writer wakes above run synchronously (not via this - // deferred path), so plain read/write throughput is unaffected. We - // only pay the delay when a pipe event happens to wake a ppoll or - // pselect6 caller. + // reader/writer and non-signal-safe poll wakes above run synchronously + // (not via this deferred path), so plain read/write throughput is + // unaffected. We only pay the delay when a pipe event happens to wake + // a ppoll or pselect6 caller. if (needBroadWake) { - if (this.anyPendingRetryNeedsSignalSafeWake()) { + if (needSignalSafeDeferredWake || this.anyPendingRetryNeedsSignalSafeWake()) { this.scheduleWakeBlockedRetriesDeferred(); } else { this.scheduleWakeBlockedRetries(); @@ -3926,7 +4002,7 @@ export class CentralizedKernelWorker { * pure-sleep case, fast-path'd to a setTimeout. 
*/ private handleSelect(channel: ChannelInfo, origArgs: number[]): void { - const FD_SET_SIZE = 128; + const FD_SET_SIZE = SELECT_FD_SET_BYTES; const nfds = origArgs[0]; const readPtr = origArgs[1]; const writePtr = origArgs[2]; @@ -4077,7 +4153,7 @@ export class CentralizedKernelWorker { } private handlePselect6(channel: ChannelInfo, origArgs: number[]): void { - const FD_SET_SIZE = 128; + const FD_SET_SIZE = SELECT_FD_SET_BYTES; const processMem = new Uint8Array(channel.memory.buffer); const kernelMem = this.getKernelMem(); const kernelView = new DataView(this.kernelMemory!.buffer, this.scratchOffset); @@ -4425,14 +4501,8 @@ export class CentralizedKernelWorker { } // EPOLL event flags → poll event flags - const EPOLLIN = 0x001; - const EPOLLOUT = 0x004; - const EPOLLERR = 0x008; - const EPOLLHUP = 0x010; - const POLLIN = 0x001; - const POLLOUT = 0x004; - const POLLERR = 0x008; - const POLLHUP = 0x010; + const { EPOLLIN, EPOLLOUT, EPOLLERR, EPOLLHUP } = EPOLL_EVENTS; + const { POLLIN, POLLOUT, POLLERR, POLLHUP } = POLL_EVENTS; // Build pollfds in kernel scratch data area // struct pollfd = { fd: i32, events: i16, revents: i16 } = 8 bytes @@ -5569,15 +5639,6 @@ export class CentralizedKernelWorker { this.callbacks.onFork(parentPid, childPid, channel.memory, threadFork).then((childChannelOffsets) => { if (!this.processes.has(parentPid)) return; - // Inherit TCP listener targets: if parent listens on a port, register - // the child as an additional target (fork children share listening sockets) - for (const [port, targets] of this.tcpListenerTargets) { - const parentTarget = targets.find(t => t.pid === parentPid); - if (parentTarget && !targets.some(t => t.pid === childPid)) { - targets.push({ pid: childPid, fd: parentTarget.fd }); - } - } - // Inherit epoll interest lists from parent for (const [key, interests] of this.epollInterests) { if (key.startsWith(`${parentPid}:`)) { @@ -5878,7 +5939,6 @@ export class CentralizedKernelWorker { * Resolves the fd path via 
kernel_get_fd_path, then delegates to exec flow. */ private handleExecveat(channel: ChannelInfo, origArgs: number[]): void { - const AT_EMPTY_PATH = 0x1000; const dirfd = origArgs[0]; const flags = origArgs[4]; @@ -5892,7 +5952,7 @@ export class CentralizedKernelWorker { let execPath: string; - if ((flags & AT_EMPTY_PATH) !== 0 && pathStr === "") { + if ((flags & AT_FLAGS.AT_EMPTY_PATH) !== 0 && pathStr === "") { // fexecve path: resolve fd to file path via kernel const getFdPath = this.kernelInstance!.exports.kernel_get_fd_path as ((pid: number, fd: number, bufPtr: bigint, bufLen: number) => number) | undefined; @@ -6022,10 +6082,6 @@ export class CentralizedKernelWorker { channel.pid, tid, fnPtr, argPtr, stackPtr, tlsPtr, ctidPtr, channel.memory, ).then((assignedTid) => { if (!this.processes.has(channel.pid)) return; - // Store ctidPtr for CLONE_CHILD_CLEARTID on thread exit - if (ctidPtr !== 0) { - this.threadCtidPtrs.set(`${channel.pid}:${assignedTid}`, ctidPtr); - } this.completeChannel(channel, SYS_CLONE, origArgs, undefined, assignedTid, 0); }).catch((err) => { console.error(`[kernel-worker] onClone failed: ${err}`); @@ -6057,24 +6113,15 @@ export class CentralizedKernelWorker { this.threadForkContexts.delete(tidKey); } - // CLONE_CHILD_CLEARTID: write 0 to ctidPtr and futex-wake it. - // This is normally done by the Linux kernel on thread exit; we must - // do it here because the thread worker never returns from __pthread_exit - // (it loops on SYS_EXIT). 
- if (tid > 0) { - const ctidKey = `${channel.pid}:${tid}`; - const ctidPtr = this.threadCtidPtrs.get(ctidKey); - if (ctidPtr && ctidPtr !== 0) { - this.threadCtidPtrs.delete(ctidKey); - const procView = new DataView(channel.memory.buffer); - procView.setInt32(ctidPtr, 0, true); - const i32View = new Int32Array(channel.memory.buffer); - Atomics.notify(i32View, ctidPtr >>> 2, 1); - } - } - - if (tid > 0) { - this.notifyThreadExit(channel.pid, tid); + // CLONE_CHILD_CLEARTID: ask the kernel to remove ThreadInfo and return + // the ctid pointer it recorded at clone time. The pointer still names + // process memory, so the host performs the actual clear + futex wake. + const ctidPtr = tid > 0 ? this.notifyThreadExit(channel.pid, tid) : 0; + if (ctidPtr !== 0 && ctidPtr + 4 <= channel.memory.buffer.byteLength) { /* skip values outside process memory (e.g. a negative kernel return seen as unsigned) so exit cleanup below still runs */ + const procView = new DataView(channel.memory.buffer); + procView.setInt32(ctidPtr, 0, true); + const i32View = new Int32Array(channel.memory.buffer); + Atomics.notify(i32View, ctidPtr >>> 2, 1); } this.removeChannel(channel.pid, channel.channelOffset); // Complete channel to unblock the thread worker so it can exit cleanly @@ -6640,15 +6687,18 @@ export class CentralizedKernelWorker { /** * Notify the kernel that a thread has exited. - * Removes thread state from the process's thread table. + * Removes thread state from the process's thread table and returns the + * CLONE_CHILD_CLEARTID pointer the kernel recorded at clone time. */ - notifyThreadExit(pid: number, tid: number): void { - if (!this.kernelInstance) return; + notifyThreadExit(pid: number, tid: number): number { + if (!this.kernelInstance) return 0; const threadExit = this.kernelInstance.exports.kernel_thread_exit as ((pid: number, tid: number) => number) | undefined; if (threadExit) { - threadExit(pid, tid); + const ret = threadExit(pid, tid); + return ret === 0 ? 
0 : ret >>> 0; } + return 0; } /** @@ -7229,9 +7279,9 @@ export class CentralizedKernelWorker { // Avoid duplicate listeners on the same pid:fd if (this.tcpListeners.has(key)) return; - // Register this pid:fd as a target for this port (needed for both - // Node.js TCP bridging and browser service worker bridging via - // pickListenerTarget + injectConnection) + // Register this pid:fd as a fallback/readiness target for this port. + // Runtime target selection is Rust-owned via kernel_pick_tcp_listener_target + // when that export is available. if (!this.tcpListenerTargets.has(port)) { this.tcpListenerTargets.set(port, []); this.tcpListenerRRIndex.set(port, 0); @@ -7298,6 +7348,9 @@ export class CentralizedKernelWorker { * resolve a port to a {pid, fd} before injecting a connection. */ pickListenerTarget(port: number): {pid: number, fd: number} | null { + const rustTarget = this.pickRustTcpListenerTarget(port, 0); + if (rustTarget !== undefined) return rustTarget; + const targets = this.tcpListenerTargets.get(port); if (!targets || targets.length === 0) return null; @@ -7325,6 +7378,21 @@ export class CentralizedKernelWorker { return candidates[idx]!; } + private pickRustTcpListenerTarget(port: number, excludePid: number): {pid: number, fd: number} | null | undefined { + const pick = this.kernelInstance!.exports.kernel_pick_tcp_listener_target as + ((port: number, excludePid: number, outPtr: bigint) => number) | undefined; + if (!pick) return undefined; + + const result = pick(port, excludePid, BigInt(this.scratchOffset)); + if (result <= 0) return null; + + const view = new DataView(this.kernelMemory!.buffer, this.scratchOffset); + const pid = view.getUint32(0, true); + const fd = view.getInt32(4, true); + if (!this.processes.has(pid)) return null; + return { pid, fd }; + } + // --------------------------------------------------------------------------- // External HTTP request bridge (host → in-kernel server, no real TCP) // 
--------------------------------------------------------------------------- @@ -7945,7 +8013,10 @@ export class CentralizedKernelWorker { if (entry.pid === pid) { this.io.network?.closeTcpListener?.(key); // Only close the server if no other processes share this port - const hasOtherTargets = this.tcpListenerTargets.has(entry.port); + const rustTarget = this.pickRustTcpListenerTarget(entry.port, pid); + const hasOtherTargets = rustTarget !== undefined + ? rustTarget !== null + : this.tcpListenerTargets.has(entry.port); if (!hasOtherTargets) { entry.server.close(); for (const conn of entry.connections) { @@ -7957,7 +8028,6 @@ export class CentralizedKernelWorker { } } this.tcpConnections.delete(pid); - this.shmMappings.delete(pid); } // ========================================================================= @@ -8146,38 +8216,39 @@ export class CentralizedKernelWorker { transferred += nRead; } - // Track the mapping for shmdt - let pidMappings = this.shmMappings.get(channel.pid); - if (!pidMappings) { - pidMappings = new Map(); - this.shmMappings.set(channel.pid, pidMappings); + const recordMapping = this.kernelInstance!.exports.kernel_ipc_shm_record_mapping as (addr: bigint, shmid: number, size: number) => number; + const recordResult = recordMapping(BigInt(addr >>> 0), shmid, size); + if (recordResult < 0) { + const kernelShmdt = this.kernelInstance!.exports.kernel_ipc_shmdt as ((shmid: number) => number) | undefined; + if (kernelShmdt) kernelShmdt(shmid); + this.completeChannelRaw(channel, recordResult, -recordResult); + this.relistenChannel(channel); + return; } - pidMappings.set(addr >>> 0, { segId: shmid, size }); this.completeChannelRaw(channel, addr, 0); this.relistenChannel(channel); } - /** shmdt: copy process memory back to segment, untrack mapping */ + /** shmdt: copy process memory back to segment, then detach Rust-owned mapping */ private handleIpcShmdt(channel: ChannelInfo, args: number[]): void { - const addr = args[0]; - const pidMappings = 
this.shmMappings.get(channel.pid); - if (!pidMappings) { - this.completeChannelRaw(channel, -22, 22); // EINVAL - this.relistenChannel(channel); - return; - } - const mapping = pidMappings.get(addr); - if (!mapping) { - this.completeChannelRaw(channel, -22, 22); // EINVAL - this.relistenChannel(channel); - return; - } + const addr = args[0] >>> 0; // Set current pid for kernel exports const setCurrentPid = this.kernelInstance!.exports.kernel_set_current_pid as ((pid: number) => void) | undefined; if (setCurrentPid) setCurrentPid(channel.pid); + const lookupMapping = this.kernelInstance!.exports.kernel_ipc_shm_lookup_mapping as (addr: bigint, outPtr: bigint) => number; + const lookupResult = lookupMapping(BigInt(addr), BigInt(this.scratchOffset)); + if (lookupResult < 0) { + this.completeChannelRaw(channel, lookupResult, -lookupResult); + this.relistenChannel(channel); + return; + } + const kernelView = new DataView(this.kernelMemory!.buffer, this.scratchOffset); /* built after the kernel call so it views the current memory buffer */ + const shmid = kernelView.getInt32(0, true); + const size = kernelView.getUint32(4, true); + // Sync process memory back to kernel segment via write_chunk const writeChunk = this.kernelInstance!.exports.kernel_ipc_shm_write_chunk as (shmid: number, offset: number, dataPtr: bigint, dataLen: number) => number; const processMem = new Uint8Array(channel.memory.buffer); @@ -8185,20 +8256,17 @@ export class CentralizedKernelWorker { const chunkSize = CH_DATA_SIZE; const chunkPtr = this.scratchOffset + CH_DATA; let transferred = 0; - while (transferred < mapping.size) { - const remaining = mapping.size - transferred; + while (transferred < size) { + const remaining = size - transferred; const toWrite = Math.min(remaining, chunkSize); kernelMem.set(processMem.subarray(addr + transferred, addr + transferred + toWrite), chunkPtr); - const nWritten = writeChunk(mapping.segId, transferred, BigInt(chunkPtr), toWrite); + const nWritten = writeChunk(shmid, transferred, BigInt(chunkPtr), toWrite); if (nWritten <= 0) 
break; transferred += nWritten; } - // Kernel-side detach bookkeeping - const kernelShmdt = this.kernelInstance!.exports.kernel_ipc_shmdt as (shmid: number) => number; - const result = kernelShmdt(mapping.segId); - - pidMappings.delete(addr); + const kernelShmdtAddr = this.kernelInstance!.exports.kernel_ipc_shmdt_addr as (addr: bigint) => number; + const result = kernelShmdtAddr(BigInt(addr)); if (result < 0) { this.completeChannelRaw(channel, result, -result); diff --git a/host/src/platform/native-metadata.ts b/host/src/platform/native-metadata.ts index b90463a3a..f92b9bb07 100644 --- a/host/src/platform/native-metadata.ts +++ b/host/src/platform/native-metadata.ts @@ -1,11 +1,15 @@ import type { Stats } from "node:fs"; import type { StatResult } from "../types"; +import { ACCESS_MODES, FILE_MODES } from "../generated/abi"; -const MODE_CHANGE_MASK = 0o7777; const UID_GID_UNCHANGED = 0xffffffff; -const X_OK = 0o1; -const W_OK = 0o2; -const R_OK = 0o4; +const MODE_CHANGE_MASK = FILE_MODES.S_MODE_BITS; +const READABLE_BITS = + FILE_MODES.S_IRUSR | FILE_MODES.S_IRGRP | FILE_MODES.S_IROTH; +const WRITABLE_BITS = + FILE_MODES.S_IWUSR | FILE_MODES.S_IWGRP | FILE_MODES.S_IWOTH; +const EXECUTABLE_BITS = + FILE_MODES.S_IXUSR | FILE_MODES.S_IXGRP | FILE_MODES.S_IXOTH; interface VirtualMetadata { mode?: number; @@ -61,9 +65,15 @@ export class NativeMetadataOverlay { access(s: Stats, amode: number): void { const mode = this.toStatResult(s).mode; - if ((amode & R_OK) !== 0 && (mode & 0o444) === 0) throw new Error("EACCES"); - if ((amode & W_OK) !== 0 && (mode & 0o222) === 0) throw new Error("EACCES"); - if ((amode & X_OK) !== 0 && (mode & 0o111) === 0) throw new Error("EACCES"); + if ((amode & ACCESS_MODES.R_OK) !== 0 && (mode & READABLE_BITS) === 0) { + throw new Error("EACCES"); + } + if ((amode & ACCESS_MODES.W_OK) !== 0 && (mode & WRITABLE_BITS) === 0) { + throw new Error("EACCES"); + } + if ((amode & ACCESS_MODES.X_OK) !== 0 && (mode & EXECUTABLE_BITS) === 0) { + throw new 
Error("EACCES"); + } } private metadataFor(s: Stats): VirtualMetadata { diff --git a/host/src/platform/node.ts b/host/src/platform/node.ts index 8bf55dce2..896185dad 100644 --- a/host/src/platform/node.ts +++ b/host/src/platform/node.ts @@ -10,6 +10,7 @@ import * as fs from "node:fs"; import * as os from "node:os"; import * as path from "node:path"; import type { PlatformIO, StatResult, StatfsResult } from "../types"; +import { OPEN_FLAGS } from "../generated/abi"; import { nativeStatfs, translateOpenFlags } from "../vfs/host-fs"; import { NativeMetadataOverlay } from "./native-metadata"; @@ -62,7 +63,7 @@ export class NodePlatformIO implements PlatformIO { open(path: string, flags: number, mode: number): number { const nativePath = this.rewritePath(path); - const created = (flags & 0o100) !== 0 && !fs.existsSync(nativePath); + const created = (flags & OPEN_FLAGS.O_CREAT) !== 0 && !fs.existsSync(nativePath); const fd = fs.openSync(nativePath, translateOpenFlags(flags), mode); if (created) this.metadata.chmod(fs.fstatSync(fd), mode); this.fdPositions.set(fd, 0); diff --git a/host/src/vfs/default-mounts.ts b/host/src/vfs/default-mounts.ts index 99bfaa0c7..4256ab56e 100644 --- a/host/src/vfs/default-mounts.ts +++ b/host/src/vfs/default-mounts.ts @@ -12,8 +12,12 @@ */ import type { MountConfig } from "./types"; +import { OPEN_FLAGS } from "../generated/abi"; import { MemoryFileSystem } from "./memory-fs"; +const O_WRONLY_CREAT_TRUNC = + OPEN_FLAGS.O_WRONLY | OPEN_FLAGS.O_CREAT | OPEN_FLAGS.O_TRUNC; + export interface MountSpec { /** Absolute VFS mount point (e.g., "/etc"). No trailing slash except "/". 
*/ path: string; @@ -94,7 +98,7 @@ function readTextFile(fs: MemoryFileSystem, path: string): string | null { function writeTextFile(fs: MemoryFileSystem, path: string, text: string): void { const bytes = new TextEncoder().encode(text); - const fd = fs.open(path, 0o1101, 0o644); // O_WRONLY | O_CREAT | O_TRUNC + const fd = fs.open(path, O_WRONLY_CREAT_TRUNC, 0o644); try { if (bytes.byteLength > 0) fs.write(fd, bytes, null, bytes.byteLength); } finally { diff --git a/host/src/vfs/device-fs.ts b/host/src/vfs/device-fs.ts index cc2c28217..a138bed89 100644 --- a/host/src/vfs/device-fs.ts +++ b/host/src/vfs/device-fs.ts @@ -1,9 +1,10 @@ import type { StatResult, StatfsResult } from "../types"; import type { FileSystemBackend, DirEntry } from "./types"; import { DEVFS_SUPER_MAGIC, zeroCapacityStatfs } from "../statfs"; +import { DIRENT_TYPES, FILE_MODES } from "../generated/abi"; -const S_IFCHR = 0o020000; -const S_IFDIR = 0o040000; +const { DT_CHR, DT_DIR, DT_LNK } = DIRENT_TYPES; +const { S_IFCHR, S_IFDIR } = FILE_MODES; type DeviceReader = (buffer: Uint8Array, length: number) => number; type DeviceWriter = (buffer: Uint8Array, length: number) => number; @@ -64,12 +65,12 @@ const SUBDIRS = ["pts", "shm", "mqueue"]; /** Extra entries to list in /dev readdir (kernel-managed, not in devices map). 
*/ const EXTRA_ENTRIES: DirEntry[] = [ - { name: "ptmx", type: 2 /* DT_CHR */, ino: 0x100 }, - { name: "pts", type: 4 /* DT_DIR */, ino: 0x101 }, - { name: "fd", type: 10 /* DT_LNK */, ino: 0x102 }, - { name: "stdin", type: 10 /* DT_LNK */, ino: 0x103 }, - { name: "stdout", type: 10 /* DT_LNK */, ino: 0x104 }, - { name: "stderr", type: 10 /* DT_LNK */, ino: 0x105 }, + { name: "ptmx", type: DT_CHR, ino: 0x100 }, + { name: "pts", type: DT_DIR, ino: 0x101 }, + { name: "fd", type: DT_LNK, ino: 0x102 }, + { name: "stdin", type: DT_LNK, ino: 0x103 }, + { name: "stdout", type: DT_LNK, ino: 0x104 }, + { name: "stderr", type: DT_LNK, ino: 0x105 }, ]; function isRootPath(path: string): boolean { diff --git a/host/src/vfs/host-fs.ts b/host/src/vfs/host-fs.ts index 11d57d883..05f873165 100644 --- a/host/src/vfs/host-fs.ts +++ b/host/src/vfs/host-fs.ts @@ -11,6 +11,7 @@ import type { StatResult, StatfsResult } from "../types"; import { NativeMetadataOverlay } from "../platform/native-metadata"; import type { FileSystemBackend, DirEntry } from "./types"; import { DEFAULT_STATFS_BLOCK_SIZE, DEFAULT_STATFS_NAMELEN } from "../statfs"; +import { OPEN_FLAGS } from "../generated/abi"; /** * Translate Linux/POSIX open flags (as used by musl libc) to the @@ -18,35 +19,23 @@ import { DEFAULT_STATFS_BLOCK_SIZE, DEFAULT_STATFS_NAMELEN } from "../statfs"; * The numeric values differ between Linux and macOS/BSD. 
*/ export function translateOpenFlags(linuxFlags: number): number { - // Linux flag constants (octal) - const L_O_WRONLY = 0o1; - const L_O_RDWR = 0o2; - const L_O_CREAT = 0o100; - const L_O_EXCL = 0o200; - const L_O_NOCTTY = 0o400; - const L_O_TRUNC = 0o1000; - const L_O_APPEND = 0o2000; - const L_O_NONBLOCK = 0o4000; - const L_O_DIRECTORY = 0o200000; - const L_O_NOFOLLOW = 0o400000; - let native = 0; // Access mode (bottom 2 bits) - if (linuxFlags & L_O_RDWR) native |= fs.constants.O_RDWR; - else if (linuxFlags & L_O_WRONLY) native |= fs.constants.O_WRONLY; + if (linuxFlags & OPEN_FLAGS.O_RDWR) native |= fs.constants.O_RDWR; + else if (linuxFlags & OPEN_FLAGS.O_WRONLY) native |= fs.constants.O_WRONLY; // else O_RDONLY = 0 - if (linuxFlags & L_O_CREAT) native |= fs.constants.O_CREAT; - if (linuxFlags & L_O_EXCL) native |= fs.constants.O_EXCL; - if (linuxFlags & L_O_TRUNC) native |= fs.constants.O_TRUNC; - if (linuxFlags & L_O_APPEND) native |= fs.constants.O_APPEND; - if (linuxFlags & L_O_NONBLOCK) native |= fs.constants.O_NONBLOCK; - if ((linuxFlags & L_O_DIRECTORY) && fs.constants.O_DIRECTORY) + if (linuxFlags & OPEN_FLAGS.O_CREAT) native |= fs.constants.O_CREAT; + if (linuxFlags & OPEN_FLAGS.O_EXCL) native |= fs.constants.O_EXCL; + if (linuxFlags & OPEN_FLAGS.O_TRUNC) native |= fs.constants.O_TRUNC; + if (linuxFlags & OPEN_FLAGS.O_APPEND) native |= fs.constants.O_APPEND; + if (linuxFlags & OPEN_FLAGS.O_NONBLOCK) native |= fs.constants.O_NONBLOCK; + if ((linuxFlags & OPEN_FLAGS.O_DIRECTORY) && fs.constants.O_DIRECTORY) native |= fs.constants.O_DIRECTORY; - if ((linuxFlags & L_O_NOFOLLOW) && fs.constants.O_NOFOLLOW) + if ((linuxFlags & OPEN_FLAGS.O_NOFOLLOW) && fs.constants.O_NOFOLLOW) native |= fs.constants.O_NOFOLLOW; - if ((linuxFlags & L_O_NOCTTY) && fs.constants.O_NOCTTY) + if ((linuxFlags & OPEN_FLAGS.O_NOCTTY) && fs.constants.O_NOCTTY) native |= fs.constants.O_NOCTTY; // O_LARGEFILE and O_CLOEXEC have no Node.js equivalent; ignored. 
@@ -125,7 +114,7 @@ export class HostFileSystem implements FileSystemBackend { open(path: string, flags: number, mode: number): number { const nativePath = this.safePath(path); - const created = (flags & 0o100) !== 0 && !fs.existsSync(nativePath); + const created = (flags & OPEN_FLAGS.O_CREAT) !== 0 && !fs.existsSync(nativePath); const fd = fs.openSync(nativePath, translateOpenFlags(flags), mode); if (created) this.metadata.chmod(fs.fstatSync(fd), mode); this.fdPositions.set(fd, 0); diff --git a/host/src/vfs/image-helpers.ts b/host/src/vfs/image-helpers.ts index aa1ba8230..9575db63c 100644 --- a/host/src/vfs/image-helpers.ts +++ b/host/src/vfs/image-helpers.ts @@ -6,9 +6,11 @@ * For host-disk-aware utilities (walking a directory, saving to a file), * see scripts-side helpers. */ +import { OPEN_FLAGS } from "../generated/abi"; import type { MemoryFileSystem } from "./memory-fs"; -const O_WRONLY_CREAT_TRUNC = 0o1101; +const O_WRONLY_CREAT_TRUNC = + OPEN_FLAGS.O_WRONLY | OPEN_FLAGS.O_CREAT | OPEN_FLAGS.O_TRUNC; /** Write text content to a path in the memfs. Creates parent dirs implicitly via writeVfsBinary. */ export function writeVfsFile( diff --git a/host/src/vfs/memory-fs.ts b/host/src/vfs/memory-fs.ts index 2f26f8686..353a27cc8 100644 --- a/host/src/vfs/memory-fs.ts +++ b/host/src/vfs/memory-fs.ts @@ -1,6 +1,7 @@ import { decompress as zstdDecompress } from "fzstd"; import type { StatResult, StatfsResult } from "../types"; import { SFFS_SUPER_MAGIC } from "../statfs"; +import { DIRENT_TYPES, FILE_MODES, OPEN_FLAGS } from "../generated/abi"; import type { FileSystemBackend, DirEntry } from "./types"; import { SharedFS, @@ -8,6 +9,9 @@ import { } from "./sharedfs-vendor"; import type { ZipEntry } from "./zip"; +const O_WRONLY_CREAT_TRUNC = + OPEN_FLAGS.O_WRONLY | OPEN_FLAGS.O_CREAT | OPEN_FLAGS.O_TRUNC; + /** Serializable lazy file entry for transfer between instances. 
*/ export interface LazyFileEntry { ino: number; @@ -271,7 +275,7 @@ export class MemoryFileSystem implements FileSystemBackend { try { this.fs.mkdir(current, 0o755); } catch { /* exists */ } } // Create empty stub file - const fd = this.fs.open(path, 0o1101, mode); // O_WRONLY | O_CREAT | O_TRUNC + const fd = this.fs.open(path, O_WRONLY_CREAT_TRUNC, mode); this.fs.close(fd); // Get inode const st = this.fs.stat(path); @@ -362,7 +366,7 @@ export class MemoryFileSystem implements FileSystemBackend { const target = symlinkTargets.get(ze.fileName)!; this.fs.symlink(target, vfsPath); } else { - const fd = this.fs.open(vfsPath, 0o1101, ze.mode); // O_WRONLY | O_CREAT | O_TRUNC + const fd = this.fs.open(vfsPath, O_WRONLY_CREAT_TRUNC, ze.mode); this.fs.close(fd); } @@ -454,7 +458,7 @@ export class MemoryFileSystem implements FileSystemBackend { throw new Error(`Failed to fetch lazy file ${entry.path}: HTTP ${resp.status}`); } const data = new Uint8Array(await resp.arrayBuffer()); - const fd = this.fs.open(entry.path, 0o1101, 0o755); // O_WRONLY | O_CREAT | O_TRUNC + const fd = this.fs.open(entry.path, O_WRONLY_CREAT_TRUNC, 0o755); this.fs.write(fd, data); this.fs.close(fd); this.lazyFiles.delete(st.ino); @@ -498,7 +502,7 @@ export class MemoryFileSystem implements FileSystemBackend { const ze = zipLookup.get(zipFileName); if (!ze) continue; const content = extractZipEntry(zipData, ze); - const fd = this.fs.open(vfsPath, 0o1101, 0o755); // O_WRONLY | O_CREAT | O_TRUNC + const fd = this.fs.open(vfsPath, O_WRONLY_CREAT_TRUNC, 0o755); if (content.length > 0) this.fs.write(fd, content); this.fs.close(fd); } @@ -930,7 +934,7 @@ export class MemoryFileSystem implements FileSystemBackend { gid: number, content: Uint8Array, ): void { - const fd = this.open(path, 0o1101, mode); // O_WRONLY | O_CREAT | O_TRUNC + const fd = this.open(path, O_WRONLY_CREAT_TRUNC, mode); if (content.length > 0) this.write(fd, content, null, content.length); this.close(fd); this.chown(path, uid, gid); @@ 
-966,10 +970,14 @@ export class MemoryFileSystem implements FileSystemBackend { if (!entry) return null; // Determine d_type from mode const mode = entry.stat.mode; - let dtype = 0; // DT_UNKNOWN - if ((mode & 0xf000) === 0x8000) dtype = 8; // DT_REG - else if ((mode & 0xf000) === 0x4000) dtype = 4; // DT_DIR - else if ((mode & 0xf000) === 0xa000) dtype = 10; // DT_LNK + let dtype: number = DIRENT_TYPES.DT_UNKNOWN; + if ((mode & FILE_MODES.S_IFMT) === FILE_MODES.S_IFREG) { + dtype = DIRENT_TYPES.DT_REG; + } else if ((mode & FILE_MODES.S_IFMT) === FILE_MODES.S_IFDIR) { + dtype = DIRENT_TYPES.DT_DIR; + } else if ((mode & FILE_MODES.S_IFMT) === FILE_MODES.S_IFLNK) { + dtype = DIRENT_TYPES.DT_LNK; + } return { name: entry.name, type: dtype, ino: entry.stat.ino }; } diff --git a/host/src/vfs/zip.ts b/host/src/vfs/zip.ts index c2ed001d4..ea739e09e 100644 --- a/host/src/vfs/zip.ts +++ b/host/src/vfs/zip.ts @@ -7,6 +7,7 @@ */ import { inflateSync } from "fflate"; +import { FILE_MODES } from "../generated/abi"; // --- Zip format signatures --- @@ -29,9 +30,7 @@ const COMPRESSION_DEFLATE = 8; // Unix creator OS code const CREATOR_UNIX = 3; -// Unix file type mask for symlinks -const S_IFLNK = 0xa000; -const S_IFMT = 0xf000; +const { S_IFLNK, S_IFMT } = FILE_MODES; export interface ZipEntry { fileName: string; diff --git a/host/src/wasi-shim.ts b/host/src/wasi-shim.ts index e98c6b84b..d9ff6e0b2 100644 --- a/host/src/wasi-shim.ts +++ b/host/src/wasi-shim.ts @@ -16,6 +16,7 @@ import { ABI_SYSCALLS, + AT_FLAGS, CHANNEL_STATUS_IDLE, CHANNEL_STATUS_PENDING, CH_ARG_SIZE, @@ -26,6 +27,10 @@ import { CH_RETURN, CH_STATUS, CH_SYSCALL, + FCNTL_COMMANDS, + FILE_MODES, + OPEN_FLAGS, + SEEK_WHENCE, STRUCT_SIZE_WASM_STAT, } from "./generated/abi"; @@ -74,38 +79,36 @@ const SYS_DUP2 = ABI_SYSCALLS.Dup2; const SYS_SHUTDOWN = ABI_SYSCALLS.Shutdown; // --- POSIX flags (from crates/shared/src/lib.rs) --- -const O_RDONLY = 0; -const O_WRONLY = 1; -const O_RDWR = 2; -const O_CREAT = 0o100; -const 
O_EXCL = 0o200; -const O_TRUNC = 0o1000; -const O_APPEND = 0o2000; -const O_NONBLOCK = 0o4000; -const O_DIRECTORY = 0o200000; -const O_NOFOLLOW = 0o400000; - -const AT_FDCWD = -100; -const AT_SYMLINK_NOFOLLOW = 0x100; -const AT_REMOVEDIR = 0x200; - -const F_GETFL = 3; -const F_SETFL = 4; - -// SEEK constants (POSIX) -const SEEK_SET = 0; -const SEEK_CUR = 1; -const SEEK_END = 2; - -// S_IFMT mode bits -const S_IFDIR = 0o040000; -const S_IFCHR = 0o020000; -const S_IFBLK = 0o060000; -const S_IFREG = 0o100000; -const S_IFIFO = 0o010000; -const S_IFLNK = 0o120000; -const S_IFSOCK = 0o140000; -const S_IFMT = 0o170000; +const O_RDONLY = OPEN_FLAGS.O_RDONLY; +const O_RDWR = OPEN_FLAGS.O_RDWR; +const O_ACCMODE = OPEN_FLAGS.O_ACCMODE; +const O_CREAT = OPEN_FLAGS.O_CREAT; +const O_EXCL = OPEN_FLAGS.O_EXCL; +const O_TRUNC = OPEN_FLAGS.O_TRUNC; +const O_APPEND = OPEN_FLAGS.O_APPEND; +const O_NONBLOCK = OPEN_FLAGS.O_NONBLOCK; +const O_DIRECTORY = OPEN_FLAGS.O_DIRECTORY; +const O_NOFOLLOW = OPEN_FLAGS.O_NOFOLLOW; + +const AT_FDCWD = AT_FLAGS.AT_FDCWD; +const AT_SYMLINK_NOFOLLOW = AT_FLAGS.AT_SYMLINK_NOFOLLOW; +const AT_REMOVEDIR = AT_FLAGS.AT_REMOVEDIR; + +const F_GETFL = FCNTL_COMMANDS.F_GETFL; +const F_SETFL = FCNTL_COMMANDS.F_SETFL; + +const SEEK_SET = SEEK_WHENCE.SEEK_SET; +const SEEK_CUR = SEEK_WHENCE.SEEK_CUR; +const SEEK_END = SEEK_WHENCE.SEEK_END; + +const S_IFDIR = FILE_MODES.S_IFDIR; +const S_IFCHR = FILE_MODES.S_IFCHR; +const S_IFBLK = FILE_MODES.S_IFBLK; +const S_IFREG = FILE_MODES.S_IFREG; +const S_IFIFO = FILE_MODES.S_IFIFO; +const S_IFLNK = FILE_MODES.S_IFLNK; +const S_IFSOCK = FILE_MODES.S_IFSOCK; +const S_IFMT = FILE_MODES.S_IFMT; // Stat struct size written by kernel const WASM_STAT_SIZE = STRUCT_SIZE_WASM_STAT; @@ -1124,8 +1127,10 @@ export class WasiShim { if (errno) { // If O_RDWR fails with EISDIR or EACCES, retry with O_RDONLY if ((errno === 21 || errno === 13) && !(posixFlags & O_CREAT)) { - posixFlags = (posixFlags & ~3) | O_RDONLY; - const retry = 
this.doSyscall(SYS_OPENAT, kernelDirfd, pathAddr, posixFlags, 0o666); + posixFlags = (posixFlags & ~O_ACCMODE) | O_RDONLY; + const retry = this.doSyscall( + SYS_OPENAT, kernelDirfd, pathAddr, posixFlags, 0o666, + ); if (retry.errno) return translateLinuxErrno(retry.errno); new DataView(this.memory.buffer).setUint32(fdOut, retry.result, true); return WASI_ESUCCESS; diff --git a/host/test/generated-abi.test.ts b/host/test/generated-abi.test.ts index dfe1b011a..96914e5f4 100644 --- a/host/test/generated-abi.test.ts +++ b/host/test/generated-abi.test.ts @@ -6,6 +6,8 @@ import { ABI_SYSCALL_NAMES, ABI_SYSCALLS, ABI_VERSION, + ACCESS_MODES, + AT_FLAGS, CHANNEL_STATUS, CH_ARG_SIZE, CH_ARGS, @@ -23,6 +25,11 @@ import { CH_STATUS, CH_SYSCALL, CH_TOTAL_SIZE, + DIRENT_TYPES, + EPOLL_EVENTS, + FCNTL_COMMANDS, + FD_FLAGS, + FILE_MODES, HOST_ADAPTER_MANIFEST_FIELDS, HOST_ADAPTER_MANIFEST_MAGIC, HOST_ADAPTER_MANIFEST_SIZE, @@ -34,6 +41,7 @@ import { HOST_ADAPTER_VERSION, HOST_ADAPTER_WORKER_FEATURES, HOST_INTERCEPTED_SYSCALLS, + OPEN_FLAGS, PROCESS_MEMORY_DEFAULT_INITIAL_PAGES, PROCESS_MEMORY_DEFAULT_MAX_PAGES, PROCESS_MEMORY_DEFAULT_THREAD_SLOTS, @@ -52,12 +60,23 @@ import { PROCESS_MEMORY_THREAD_SLOTS_NONE, PROCESS_MEMORY_THREAD_SLOTS_USE_HOST_DEFAULT, PROCESS_MEMORY_WASM_PAGE_SIZE, + POLL_EVENTS, + PROC_SNAPSHOT_COUNT_OFFSET, + PROC_SNAPSHOT_COUNT_SIZE, + PROC_SNAPSHOT_RECORD_FIELDS, + PROC_SNAPSHOT_RECORD_FIXED_SIZE, + SELECT_FD_SET_BYTES, + SELECT_FD_SETSIZE, + SEEK_WHENCE, STRUCT_SIZE_WASM_DIRENT, STRUCT_SIZE_WASM_POLL_FD, STRUCT_SIZE_WASM_STAT, STRUCT_SIZE_WASM_STATFS, STRUCT_SIZE_WASM_TIMESPEC, SYSCALL_ARGS, + WAKEUP_EVENT_FIELDS, + WAKEUP_EVENT_RECORD_SIZE, + WAKEUP_EVENT_TYPES, } from "../src/generated/abi"; const snapshot = JSON.parse( @@ -91,12 +110,28 @@ function namedNumberMap(entries: NamedNumber[]): Record { return Object.fromEntries(entries.map(({ name, number }) => [name, number])); } +function namedValueMap(entries: Array<{ name: string; value: number }>): 
Record { + return Object.fromEntries(entries.map(({ name, value }) => [name, value])); +} + function hostAdapterManifestField(name: string): { offset: number; size: number } { const field = snapshot.host_adapter.manifest_fields.find((f: { name: string }) => f.name === name); if (!field) throw new Error(`missing host_adapter manifest field ${name}`); return { offset: field.offset, size: field.size }; } +function processSnapshotField(name: string): { offset: number; size: number; type: string } { + const field = snapshot.process_snapshot.record_fields.find((f: { name: string }) => f.name === name); + if (!field) throw new Error(`missing process_snapshot field ${name}`); + return { offset: field.offset, size: field.size, type: field.type }; +} + +function wakeupEventField(name: string): { offset: number; size: number; type: string } { + const field = snapshot.wakeup_events.fields.find((f: { name: string }) => f.name === name); + if (!field) throw new Error(`missing wakeup_events field ${name}`); + return { offset: field.offset, size: field.size, type: field.type }; +} + describe("generated host ABI bindings", () => { it("match the ABI version and channel layout snapshot", () => { expect(ABI_VERSION).toBe(snapshot.abi_version); @@ -217,4 +252,56 @@ describe("generated host ABI bindings", () => { expect(PROCESS_MEMORY_THREAD_SLOT_CHANNEL_SPILL_PAGE) .toBe(layout.thread_slot.pages.find((p: { name: string }) => p.name === "syscall_channel_spill").page_offset); }); + + it("match Rust-owned process snapshot schema metadata", () => { + expect(PROC_SNAPSHOT_COUNT_OFFSET).toBe(snapshot.process_snapshot.count_offset); + expect(PROC_SNAPSHOT_COUNT_SIZE).toBe(snapshot.process_snapshot.count_size); + expect(PROC_SNAPSHOT_RECORD_FIXED_SIZE).toBe( + snapshot.process_snapshot.record_fixed_size, + ); + + for (const fieldName of Object.keys(PROC_SNAPSHOT_RECORD_FIELDS)) { + expect( + PROC_SNAPSHOT_RECORD_FIELDS[ + fieldName as keyof typeof PROC_SNAPSHOT_RECORD_FIELDS + ], + 
).toEqual(processSnapshotField(fieldName)); + } + }); + + it("match Rust-owned wakeup event schema metadata", () => { + expect(WAKEUP_EVENT_RECORD_SIZE).toBe(snapshot.wakeup_events.record_size); + expect(Object.entries(WAKEUP_EVENT_TYPES)).toEqual( + snapshot.wakeup_events.types.map((t: { name: string; bit: number }) => [ + t.name, + t.bit, + ]), + ); + + for (const fieldName of Object.keys(WAKEUP_EVENT_FIELDS)) { + expect( + WAKEUP_EVENT_FIELDS[ + fieldName as keyof typeof WAKEUP_EVENT_FIELDS + ], + ).toEqual(wakeupEventField(fieldName)); + } + }); + + it("match Rust-owned I/O multiplexing metadata", () => { + expect(POLL_EVENTS).toEqual(namedValueMap(snapshot.io_multiplexing.poll_events)); + expect(EPOLL_EVENTS).toEqual(namedValueMap(snapshot.io_multiplexing.epoll_events)); + expect(SELECT_FD_SETSIZE).toBe(snapshot.io_multiplexing.select.fd_setsize); + expect(SELECT_FD_SET_BYTES).toBe(snapshot.io_multiplexing.select.fd_set_bytes); + }); + + it("match Rust-owned VFS metadata", () => { + expect(OPEN_FLAGS).toEqual(namedValueMap(snapshot.vfs_metadata.open_flags)); + expect(AT_FLAGS).toEqual(namedValueMap(snapshot.vfs_metadata.at_flags)); + expect(FD_FLAGS).toEqual(namedValueMap(snapshot.vfs_metadata.fd_flags)); + expect(FCNTL_COMMANDS).toEqual(namedValueMap(snapshot.vfs_metadata.fcntl_commands)); + expect(ACCESS_MODES).toEqual(namedValueMap(snapshot.vfs_metadata.access_modes)); + expect(FILE_MODES).toEqual(namedValueMap(snapshot.vfs_metadata.file_modes)); + expect(DIRENT_TYPES).toEqual(namedValueMap(snapshot.vfs_metadata.dirent_types)); + expect(SEEK_WHENCE).toEqual(namedValueMap(snapshot.vfs_metadata.seek_whence)); + }); }); diff --git a/host/test/process-wait-lifecycle.test.ts b/host/test/process-wait-lifecycle.test.ts index 0f3efae3c..19dee234d 100644 --- a/host/test/process-wait-lifecycle.test.ts +++ b/host/test/process-wait-lifecycle.test.ts @@ -121,6 +121,145 @@ describe("Rust-owned process wait lifecycle", () => { 
expect(worker.sendSignalToProcess).not.toHaveBeenCalled(); expect(worker.wakeWaitingParent).not.toHaveBeenCalled(); }); + + it("thread exit uses Rust-owned ctid metadata for clear-tid wakeup", () => { + const memory = createSharedMemory(); + const ctidPtr = 2048; + new DataView(memory.buffer).setInt32(ctidPtr, 123, true); + + const mainChannel = createChannel(10, memory, 0); + const threadChannel = createChannel(10, memory, 1024); + const kernelThreadExit = vi.fn(() => ctidPtr); + const worker = createWorkerHarness({ + kernel_thread_exit: kernelThreadExit, + }); + worker.processes = new Map([ + [10, { + pid: 10, + memory, + channels: [mainChannel, threadChannel], + ptrWidth: 4, + }], + ]); + worker.activeChannels = [mainChannel, threadChannel]; + worker.channelTids = new Map([["10:1024", 77]]); + worker.threadForkContexts = new Map([["10:1024", { fnPtr: 1, argPtr: 2 }]]); + worker.completeChannelRaw = vi.fn(); + + worker.handleExit(threadChannel, ABI_SYSCALLS.Exit, [0]); + + expect(kernelThreadExit).toHaveBeenCalledWith(10, 77); + expect(new DataView(memory.buffer).getInt32(ctidPtr, true)).toBe(0); + expect(worker.processes.get(10).channels).toEqual([mainChannel]); + expect(worker.activeChannels).toEqual([mainChannel]); + expect(worker.channelTids.has("10:1024")).toBe(false); + expect(worker.threadForkContexts.has("10:1024")).toBe(false); + expect(worker.completeChannelRaw).toHaveBeenCalledWith(threadChannel, 0, 0); + }); + + it("shmdt resolves attachment metadata from Rust before syncing bytes", () => { + const kernelMemory = createSharedMemory(); + const processMemory = createSharedMemory(); + const addr = 4096; + new Uint8Array(processMemory.buffer).set([3, 1, 4, 1], addr); + + const setCurrentPid = vi.fn(); + const lookupMapping = vi.fn((_addr: bigint, outPtr: bigint) => { + const view = new DataView(kernelMemory.buffer); + view.setInt32(Number(outPtr), 9, true); + view.setUint32(Number(outPtr) + 4, 4, true); + return 0; + }); + const writeChunk = 
vi.fn((_shmid: number, _offset: number, dataPtr: bigint, dataLen: number) => { + const bytes = new Uint8Array(kernelMemory.buffer, Number(dataPtr), dataLen); + expect(Array.from(bytes)).toEqual([3, 1, 4, 1]); + return dataLen; + }); + const detachByAddr = vi.fn(() => 0); + const worker = createWorkerHarness({ + kernel_set_current_pid: setCurrentPid, + kernel_ipc_shm_lookup_mapping: lookupMapping, + kernel_ipc_shm_write_chunk: writeChunk, + kernel_ipc_shmdt_addr: detachByAddr, + }); + worker.kernelMemory = kernelMemory; + worker.completeChannelRaw = vi.fn(); + worker.relistenChannel = vi.fn(); + + const channel = createChannel(11, processMemory); + worker.handleIpcShmdt(channel, [addr]); + + expect(setCurrentPid).toHaveBeenCalledWith(11); + expect(lookupMapping).toHaveBeenCalledWith(BigInt(addr), BigInt(worker.scratchOffset)); + expect(writeChunk).toHaveBeenCalledTimes(1); + expect(writeChunk.mock.calls[0][0]).toBe(9); + expect(writeChunk.mock.calls[0][1]).toBe(0); + expect(typeof writeChunk.mock.calls[0][2]).toBe("bigint"); + expect(writeChunk.mock.calls[0][3]).toBe(4); + expect(detachByAddr).toHaveBeenCalledWith(BigInt(addr)); + expect(worker.completeChannelRaw).toHaveBeenCalledWith(channel, 0, 0); + expect(worker.relistenChannel).toHaveBeenCalledWith(channel); + }); + + it("tcp listener target selection uses Rust-owned process policy", () => { + const kernelMemory = createSharedMemory(); + const pickTarget = vi.fn((_port: number, _excludePid: number, outPtr: bigint) => { + const view = new DataView(kernelMemory.buffer); + view.setUint32(Number(outPtr), 44, true); + view.setInt32(Number(outPtr) + 4, 7, true); + return 1; + }); + const worker = createWorkerHarness({ + kernel_pick_tcp_listener_target: pickTarget, + }); + worker.kernelMemory = kernelMemory; + worker.processes = new Map([[44, { pid: 44 }]]); + worker.tcpListenerTargets = new Map([[8080, [{ pid: 1, fd: 3 }]]]); + + expect(worker.pickListenerTarget(8080)).toEqual({ pid: 44, fd: 7 }); + 
expect(pickTarget).toHaveBeenCalledWith(8080, 0, BigInt(worker.scratchOffset)); + }); + + it("host timer cancellation follows Rust-owned cleanup metadata", () => { + vi.useFakeTimers(); + try { + const kernelMemory = createSharedMemory(); + const takeCleanup = vi.fn((_pid: number, outPtr: bigint, _maxTimerIds: number) => { + const view = new DataView(kernelMemory.buffer); + view.setUint32(Number(outPtr), 1, true); + view.setUint32(Number(outPtr) + 4, 2, true); + view.setUint32(Number(outPtr) + 8, 4, true); + view.setUint32(Number(outPtr) + 12, 8, true); + return 2; + }); + const worker = createWorkerHarness({ + kernel_take_process_timer_cleanup: takeCleanup, + }); + worker.kernelMemory = kernelMemory; + worker.alarmTimers = new Map([ + [11, setTimeout(() => {}, 1000)], + [12, setTimeout(() => {}, 1000)], + ]); + worker.posixTimers = new Map([ + ["11:4", { timeout: setTimeout(() => {}, 1000) }], + ["11:8", { timeout: setTimeout(() => {}, 1000), interval: setInterval(() => {}, 1000) }], + ["11:9", { timeout: setTimeout(() => {}, 1000) }], + ["12:4", { timeout: setTimeout(() => {}, 1000) }], + ]); + + worker.cancelProcessHostTimers(11); + + expect(takeCleanup).toHaveBeenCalledWith(11, BigInt(worker.scratchOffset), expect.any(Number)); + expect(worker.alarmTimers.has(11)).toBe(false); + expect(worker.alarmTimers.has(12)).toBe(true); + expect(worker.posixTimers.has("11:4")).toBe(false); + expect(worker.posixTimers.has("11:8")).toBe(false); + expect(worker.posixTimers.has("11:9")).toBe(true); + expect(worker.posixTimers.has("12:4")).toBe(true); + } finally { + vi.useRealTimers(); + } + }); }); function createWorkerHarness(exports: Record): any { @@ -139,12 +278,12 @@ function createSharedMemory(): WebAssembly.Memory { }); } -function createChannel(pid: number, memory: WebAssembly.Memory): any { +function createChannel(pid: number, memory: WebAssembly.Memory, channelOffset = 0): any { return { pid, memory, - channelOffset: 0, - i32View: new Int32Array(memory.buffer, 0), + 
channelOffset, + i32View: new Int32Array(memory.buffer, channelOffset), consecutiveSyscalls: 0, }; } diff --git a/host/test/readiness-wakeup.test.ts b/host/test/readiness-wakeup.test.ts new file mode 100644 index 000000000..ba8739de4 --- /dev/null +++ b/host/test/readiness-wakeup.test.ts @@ -0,0 +1,70 @@ +import { describe, expect, it } from "vitest"; +import { CentralizedKernelWorker } from "../src/kernel-worker"; + +describe("readiness wakeup targeting", () => { + it("retries only poll waiters that watch the woken pipe", () => { + const worker = createWorkerHarness([11, 12]); + const channel11 = createChannel(11); + const channel12 = createChannel(12); + + worker.pendingPollRetries.set(100, { + timer: null, + channel: channel11, + pipeIndices: [7], + }); + worker.pendingPollRetries.set(200, { + timer: null, + channel: channel12, + pipeIndices: [9], + }); + + const deferred = worker.wakeBlockedPollRetriesForPipe(7); + + expect(deferred).toBe(false); + expect(worker.retried).toEqual([channel11]); + expect(worker.pendingPollRetries.has(100)).toBe(false); + expect(worker.pendingPollRetries.has(200)).toBe(true); + }); + + it("leaves signal-safe ppoll retries deferred when requested", () => { + const worker = createWorkerHarness([11, 12]); + const signalSafeChannel = createChannel(11); + const normalChannel = createChannel(12); + + worker.pendingPollRetries.set(100, { + timer: null, + channel: signalSafeChannel, + pipeIndices: [7], + needsSignalSafeWake: true, + }); + worker.pendingPollRetries.set(200, { + timer: null, + channel: normalChannel, + pipeIndices: [7], + }); + + const deferred = worker.wakeBlockedPollRetriesForPipe(7, undefined, { + deferSignalSafe: true, + }); + + expect(deferred).toBe(true); + expect(worker.retried).toEqual([normalChannel]); + expect(worker.pendingPollRetries.has(100)).toBe(true); + expect(worker.pendingPollRetries.has(200)).toBe(false); + }); +}); + +function createWorkerHarness(pids: number[]): any { + return 
Object.assign(Object.create(CentralizedKernelWorker.prototype), { + pendingPollRetries: new Map(), + processes: new Map(pids.map((pid) => [pid, {}])), + retried: [], + retrySyscall(channel: unknown) { + this.retried.push(channel); + }, + }); +} + +function createChannel(pid: number): any { + return { pid }; +} diff --git a/tools/xtask/src/dump_abi.rs b/tools/xtask/src/dump_abi.rs index 3d1871659..4de639849 100644 --- a/tools/xtask/src/dump_abi.rs +++ b/tools/xtask/src/dump_abi.rs @@ -14,6 +14,14 @@ //! adapter boot contract metadata //! * [`wasm_posix_shared::host_abi`] — host adapter syscall marshalling //! descriptors +//! * [`wasm_posix_shared::wakeup_event`] — kernel wakeup event record +//! layout consumed by the host retry scheduler +//! * [`wasm_posix_shared::poll`], [`wasm_posix_shared::epoll`], and +//! [`wasm_posix_shared::select`] — I/O multiplexing event metadata +//! * [`wasm_posix_shared::flags`], [`wasm_posix_shared::access`], +//! [`wasm_posix_shared::mode`], [`wasm_posix_shared::dirent`], and +//! [`wasm_posix_shared::seek`] — VFS-visible constants consumed by host +//! adapters //! //! When `--kernel-wasm ` is provided, the snapshot also covers //! 
every export in the built kernel `.wasm` (after filtering through @@ -32,10 +40,10 @@ use std::collections::BTreeMap; use std::mem::{offset_of, size_of}; use std::path::PathBuf; -use serde_json::{Value, json}; +use serde_json::{json, Value}; use wasm_posix_shared as shared; -use crate::{JsonMap, repo_root}; +use crate::{repo_root, JsonMap}; pub fn run(args: Vec) -> Result<(), String> { let mut out_path: Option = None; @@ -425,6 +433,126 @@ fn render_ts_module() -> String { channel::SIG_OLD_MASK )); + out.push_str(&format!( + "export const PROC_SNAPSHOT_COUNT_OFFSET = {} as const;\n", + shared::process_snapshot::COUNT_OFFSET + )); + out.push_str(&format!( + "export const PROC_SNAPSHOT_COUNT_SIZE = {} as const;\n", + shared::process_snapshot::COUNT_SIZE + )); + out.push_str(&format!( + "export const PROC_SNAPSHOT_RECORD_FIXED_SIZE = {} as const;\n", + shared::process_snapshot::RECORD_FIXED_SIZE + )); + out.push_str("export const PROC_SNAPSHOT_RECORD_FIELDS = {\n"); + for field in process_snapshot_fields() { + out.push_str(&format!( + " {}: {{ offset: {}, size: {}, type: {:?} }},\n", + field.name, field.offset, field.size, field.ty + )); + } + out.push_str("} as const;\n\n"); + + out.push_str(&format!( + "export const WAKEUP_EVENT_RECORD_SIZE = {} as const;\n", + shared::wakeup_event::RECORD_SIZE + )); + out.push_str(&format!( + "export const WAKEUP_EVENT_TYPE_READABLE = {} as const;\n", + shared::wakeup_event::TYPE_READABLE + )); + out.push_str(&format!( + "export const WAKEUP_EVENT_TYPE_WRITABLE = {} as const;\n", + shared::wakeup_event::TYPE_WRITABLE + )); + out.push_str(&format!( + "export const WAKEUP_EVENT_TYPE_ACCEPT = {} as const;\n", + shared::wakeup_event::TYPE_ACCEPT + )); + out.push_str("export const WAKEUP_EVENT_TYPES = {\n"); + out.push_str(" readable: WAKEUP_EVENT_TYPE_READABLE,\n"); + out.push_str(" writable: WAKEUP_EVENT_TYPE_WRITABLE,\n"); + out.push_str(" accept: WAKEUP_EVENT_TYPE_ACCEPT,\n"); + out.push_str("} as const;\n"); + out.push_str("export 
const WAKEUP_EVENT_FIELDS = {\n"); + for field in wakeup_event_fields() { + out.push_str(&format!( + " {}: {{ offset: {}, size: {}, type: {:?} }},\n", + field.name, field.offset, field.size, field.ty + )); + } + out.push_str("} as const;\n\n"); + + out.push_str("export const POLL_EVENTS = {\n"); + for (name, value) in poll_events() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + + out.push_str("export const EPOLL_EVENTS = {\n"); + for (name, value) in epoll_events() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + + out.push_str(&format!( + "export const SELECT_FD_SETSIZE = {} as const;\n", + shared::select::FD_SETSIZE + )); + out.push_str(&format!( + "export const SELECT_FD_SET_BYTES = {} as const;\n\n", + shared::select::FD_SET_BYTES + )); + + out.push_str("export const OPEN_FLAGS = {\n"); + for (name, value) in open_flags() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + + out.push_str("export const AT_FLAGS = {\n"); + for (name, value) in at_flags() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + + out.push_str("export const FD_FLAGS = {\n"); + for (name, value) in fd_flags() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + + out.push_str("export const FCNTL_COMMANDS = {\n"); + for (name, value) in fcntl_commands() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + + out.push_str("export const ACCESS_MODES = {\n"); + for (name, value) in access_modes() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + + out.push_str("export const FILE_MODES = {\n"); + for (name, value) in file_modes() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + + out.push_str("export const DIRENT_TYPES = {\n"); + for 
(name, value) in dirent_types() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + + out.push_str("export const SEEK_WHENCE = {\n"); + for (name, value) in seek_whence() { + out.push_str(&format!(" {}: {},\n", name, value)); + } + out.push_str("} as const;\n\n"); + out.push_str(&format!( "export const STRUCT_SIZE_WASM_STAT = {} as const;\n", size_of::() @@ -660,6 +788,10 @@ fn build_snapshot(kernel_wasm: &std::path::Path) -> Result { root.insert("channel_header".into(), channel_header()); root.insert("channel_signal_area".into(), channel_signal_area()); root.insert("channel_buffers".into(), channel_buffers()); + root.insert("process_snapshot".into(), process_snapshot()); + root.insert("wakeup_events".into(), wakeup_events()); + root.insert("io_multiplexing".into(), io_multiplexing()); + root.insert("vfs_metadata".into(), vfs_metadata()); root.insert("marshalled_structs".into(), marshalled_structs()); root.insert("syscalls".into(), syscalls()); @@ -881,6 +1013,360 @@ fn channel_signal_area() -> Value { Value::Object(m.into_iter().collect()) } +struct ProcessSnapshotField { + name: &'static str, + offset: usize, + size: usize, + ty: &'static str, +} + +fn process_snapshot_fields() -> [ProcessSnapshotField; 8] { + use shared::process_snapshot::*; + [ + ProcessSnapshotField { + name: "pid", + offset: RECORD_PID_OFFSET, + size: 4, + ty: "u32", + }, + ProcessSnapshotField { + name: "ppid", + offset: RECORD_PPID_OFFSET, + size: 4, + ty: "u32", + }, + ProcessSnapshotField { + name: "uid", + offset: RECORD_UID_OFFSET, + size: 4, + ty: "u32", + }, + ProcessSnapshotField { + name: "gid", + offset: RECORD_GID_OFFSET, + size: 4, + ty: "u32", + }, + ProcessSnapshotField { + name: "vsizeBytes", + offset: RECORD_VSIZE_BYTES_OFFSET, + size: 8, + ty: "u64", + }, + ProcessSnapshotField { + name: "state", + offset: RECORD_STATE_OFFSET, + size: 4, + ty: "u32_ascii", + }, + ProcessSnapshotField { + name: "commLen", + offset: 
RECORD_COMM_LEN_OFFSET, + size: 4, + ty: "u32", + }, + ProcessSnapshotField { + name: "cmdlineLen", + offset: RECORD_CMDLINE_LEN_OFFSET, + size: 4, + ty: "u32", + }, + ] +} + +fn process_snapshot() -> Value { + use shared::process_snapshot::*; + let fields = process_snapshot_fields() + .iter() + .map(|field| { + let mut m: JsonMap = BTreeMap::new(); + m.insert("name".into(), json!(field.name)); + m.insert("offset".into(), json!(field.offset)); + m.insert("size".into(), json!(field.size)); + m.insert("type".into(), json!(field.ty)); + Value::Object(m.into_iter().collect()) + }) + .collect(); + + let mut m: JsonMap = BTreeMap::new(); + m.insert("count_offset".into(), json!(COUNT_OFFSET)); + m.insert("count_size".into(), json!(COUNT_SIZE)); + m.insert("record_fixed_size".into(), json!(RECORD_FIXED_SIZE)); + m.insert("record_fields".into(), Value::Array(fields)); + Value::Object(m.into_iter().collect()) +} + +struct WakeupEventField { + name: &'static str, + offset: usize, + size: usize, + ty: &'static str, +} + +fn wakeup_event_fields() -> [WakeupEventField; 2] { + use shared::wakeup_event::*; + [ + WakeupEventField { + name: "idx", + offset: IDX_OFFSET, + size: IDX_SIZE, + ty: "u32", + }, + WakeupEventField { + name: "wakeType", + offset: TYPE_OFFSET, + size: TYPE_SIZE, + ty: "u8", + }, + ] +} + +fn wakeup_events() -> Value { + use shared::wakeup_event::*; + let fields = wakeup_event_fields() + .iter() + .map(|field| { + let mut m: JsonMap = BTreeMap::new(); + m.insert("name".into(), json!(field.name)); + m.insert("offset".into(), json!(field.offset)); + m.insert("size".into(), json!(field.size)); + m.insert("type".into(), json!(field.ty)); + Value::Object(m.into_iter().collect()) + }) + .collect(); + + let types = [ + ("readable", TYPE_READABLE), + ("writable", TYPE_WRITABLE), + ("accept", TYPE_ACCEPT), + ] + .into_iter() + .map(|(name, bit)| { + let mut m: JsonMap = BTreeMap::new(); + m.insert("name".into(), json!(name)); + m.insert("bit".into(), json!(bit)); + 
Value::Object(m.into_iter().collect()) + }) + .collect(); + + let mut m: JsonMap = BTreeMap::new(); + m.insert("record_size".into(), json!(RECORD_SIZE)); + m.insert("fields".into(), Value::Array(fields)); + m.insert("types".into(), Value::Array(types)); + Value::Object(m.into_iter().collect()) +} + +fn poll_events() -> [(&'static str, i16); 6] { + use shared::poll::*; + [ + ("POLLIN", POLLIN), + ("POLLPRI", POLLPRI), + ("POLLOUT", POLLOUT), + ("POLLERR", POLLERR), + ("POLLHUP", POLLHUP), + ("POLLNVAL", POLLNVAL), + ] +} + +fn epoll_events() -> [(&'static str, u32); 4] { + use shared::epoll::*; + [ + ("EPOLLIN", EPOLLIN), + ("EPOLLOUT", EPOLLOUT), + ("EPOLLERR", EPOLLERR), + ("EPOLLHUP", EPOLLHUP), + ] +} + +fn io_multiplexing() -> Value { + let poll_events = poll_events() + .into_iter() + .map(|(name, value)| { + let mut m: JsonMap = BTreeMap::new(); + m.insert("name".into(), json!(name)); + m.insert("value".into(), json!(value)); + Value::Object(m.into_iter().collect()) + }) + .collect(); + let epoll_events = epoll_events() + .into_iter() + .map(|(name, value)| { + let mut m: JsonMap = BTreeMap::new(); + m.insert("name".into(), json!(name)); + m.insert("value".into(), json!(value)); + Value::Object(m.into_iter().collect()) + }) + .collect(); + + let mut select: JsonMap = BTreeMap::new(); + select.insert("fd_setsize".into(), json!(shared::select::FD_SETSIZE)); + select.insert("fd_set_bytes".into(), json!(shared::select::FD_SET_BYTES)); + + let mut m: JsonMap = BTreeMap::new(); + m.insert("poll_events".into(), Value::Array(poll_events)); + m.insert("epoll_events".into(), Value::Array(epoll_events)); + m.insert("select".into(), Value::Object(select.into_iter().collect())); + Value::Object(m.into_iter().collect()) +} + +fn open_flags() -> [(&'static str, u32); 14] { + use shared::flags::*; + [ + ("O_RDONLY", O_RDONLY), + ("O_WRONLY", O_WRONLY), + ("O_RDWR", O_RDWR), + ("O_ACCMODE", O_ACCMODE), + ("O_CREAT", O_CREAT), + ("O_EXCL", O_EXCL), + ("O_NOCTTY", O_NOCTTY), + 
("O_TRUNC", O_TRUNC), + ("O_APPEND", O_APPEND), + ("O_NONBLOCK", O_NONBLOCK), + ("O_DIRECTORY", O_DIRECTORY), + ("O_NOFOLLOW", O_NOFOLLOW), + ("O_CLOEXEC", O_CLOEXEC), + ("O_CLOFORK", O_CLOFORK), + ] +} + +fn at_flags() -> [(&'static str, i32); 4] { + use shared::flags::*; + [ + ("AT_FDCWD", AT_FDCWD), + ("AT_SYMLINK_NOFOLLOW", AT_SYMLINK_NOFOLLOW as i32), + ("AT_REMOVEDIR", AT_REMOVEDIR as i32), + ("AT_EMPTY_PATH", AT_EMPTY_PATH as i32), + ] +} + +fn fd_flags() -> [(&'static str, u32); 2] { + use shared::fd_flags::*; + [("FD_CLOEXEC", FD_CLOEXEC), ("FD_CLOFORK", FD_CLOFORK)] +} + +fn fcntl_commands() -> [(&'static str, u32); 15] { + use shared::fcntl_cmd::*; + [ + ("F_DUPFD", F_DUPFD), + ("F_GETFD", F_GETFD), + ("F_SETFD", F_SETFD), + ("F_GETFL", F_GETFL), + ("F_SETFL", F_SETFL), + ("F_GETLK", F_GETLK), + ("F_SETLK", F_SETLK), + ("F_SETLKW", F_SETLKW), + ("F_SETOWN", F_SETOWN), + ("F_GETOWN", F_GETOWN), + ("F_DUPFD_CLOEXEC", F_DUPFD_CLOEXEC), + ("F_DUPFD_CLOFORK", F_DUPFD_CLOFORK), + ("F_OFD_GETLK", F_OFD_GETLK), + ("F_OFD_SETLK", F_OFD_SETLK), + ("F_OFD_SETLKW", F_OFD_SETLKW), + ] +} + +fn access_modes() -> [(&'static str, u32); 4] { + use shared::access::*; + [ + ("F_OK", F_OK), + ("R_OK", R_OK), + ("W_OK", W_OK), + ("X_OK", X_OK), + ] +} + +fn file_modes() -> [(&'static str, u32); 24] { + use shared::mode::*; + [ + ("S_IFMT", S_IFMT), + ("S_IFSOCK", S_IFSOCK), + ("S_IFLNK", S_IFLNK), + ("S_IFREG", S_IFREG), + ("S_IFBLK", S_IFBLK), + ("S_IFDIR", S_IFDIR), + ("S_IFCHR", S_IFCHR), + ("S_IFIFO", S_IFIFO), + ("S_ISUID", S_ISUID), + ("S_ISGID", S_ISGID), + ("S_ISVTX", S_ISVTX), + ("S_IRWXU", S_IRWXU), + ("S_IRUSR", S_IRUSR), + ("S_IWUSR", S_IWUSR), + ("S_IXUSR", S_IXUSR), + ("S_IRWXG", S_IRWXG), + ("S_IRGRP", S_IRGRP), + ("S_IWGRP", S_IWGRP), + ("S_IXGRP", S_IXGRP), + ("S_IRWXO", S_IRWXO), + ("S_IROTH", S_IROTH), + ("S_IWOTH", S_IWOTH), + ("S_IXOTH", S_IXOTH), + ("S_MODE_BITS", S_MODE_BITS), + ] +} + +fn dirent_types() -> [(&'static str, u32); 8] { + use 
shared::dirent::*; + [ + ("DT_UNKNOWN", DT_UNKNOWN), + ("DT_FIFO", DT_FIFO), + ("DT_CHR", DT_CHR), + ("DT_DIR", DT_DIR), + ("DT_BLK", DT_BLK), + ("DT_REG", DT_REG), + ("DT_LNK", DT_LNK), + ("DT_SOCK", DT_SOCK), + ] +} + +fn seek_whence() -> [(&'static str, u32); 3] { + use shared::seek::*; + [ + ("SEEK_SET", SEEK_SET), + ("SEEK_CUR", SEEK_CUR), + ("SEEK_END", SEEK_END), + ] +} + +fn named_values(entries: [(&'static str, u32); N]) -> Value { + let values = entries + .into_iter() + .map(|(name, value)| { + let mut m: JsonMap = BTreeMap::new(); + m.insert("name".into(), json!(name)); + m.insert("value".into(), json!(value)); + Value::Object(m.into_iter().collect()) + }) + .collect(); + Value::Array(values) +} + +fn named_signed_values(entries: [(&'static str, i32); N]) -> Value { + let values = entries + .into_iter() + .map(|(name, value)| { + let mut m: JsonMap = BTreeMap::new(); + m.insert("name".into(), json!(name)); + m.insert("value".into(), json!(value)); + Value::Object(m.into_iter().collect()) + }) + .collect(); + Value::Array(values) +} + +fn vfs_metadata() -> Value { + let mut m: JsonMap = BTreeMap::new(); + m.insert("open_flags".into(), named_values(open_flags())); + m.insert("at_flags".into(), named_signed_values(at_flags())); + m.insert("fd_flags".into(), named_values(fd_flags())); + m.insert("fcntl_commands".into(), named_values(fcntl_commands())); + m.insert("access_modes".into(), named_values(access_modes())); + m.insert("file_modes".into(), named_values(file_modes())); + m.insert("dirent_types".into(), named_values(dirent_types())); + m.insert("seek_whence".into(), named_values(seek_whence())); + Value::Object(m.into_iter().collect()) +} + fn marshalled_structs() -> Value { use shared::fbdev::{FbBitfield, FbFixScreenInfo, FbVarScreenInfo}; use shared::{WasmDirent, WasmFlock, WasmPollFd, WasmStat, WasmStatfs, WasmTimespec}; @@ -1610,7 +2096,15 @@ fn classify_compat_change(old: &Value, new: &Value) -> Result bool { - matches!(section, "host_adapter" | 
"syscall_arg_descriptors") + matches!( + section, + "host_adapter" + | "io_multiplexing" + | "process_snapshot" + | "syscall_arg_descriptors" + | "vfs_metadata" + | "wakeup_events" + ) } fn classify_additive_object_by_key( @@ -1818,6 +2312,85 @@ mod tests { "marshalled_structs": { "WasmStat": {"size": 96, "fields": []} }, + "process_snapshot": { + "count_offset": 0, + "count_size": 4, + "record_fixed_size": 36, + "record_fields": [ + {"name": "pid", "offset": 0, "size": 4, "type": "u32"} + ] + }, + "wakeup_events": { + "record_size": 5, + "fields": [ + {"name": "pipeIdx", "offset": 0, "size": 4, "type": "u32"}, + {"name": "wakeType", "offset": 4, "size": 1, "type": "u8"} + ], + "types": [ + {"name": "readable", "bit": 1}, + {"name": "writable", "bit": 2} + ] + }, + "io_multiplexing": { + "poll_events": [ + {"name": "POLLIN", "value": 1}, + {"name": "POLLPRI", "value": 2}, + {"name": "POLLOUT", "value": 4}, + {"name": "POLLERR", "value": 8}, + {"name": "POLLHUP", "value": 16}, + {"name": "POLLNVAL", "value": 32} + ], + "epoll_events": [ + {"name": "EPOLLIN", "value": 1}, + {"name": "EPOLLOUT", "value": 4}, + {"name": "EPOLLERR", "value": 8}, + {"name": "EPOLLHUP", "value": 16} + ], + "select": { + "fd_setsize": 1024, + "fd_set_bytes": 128 + } + }, + "vfs_metadata": { + "open_flags": [ + {"name": "O_RDONLY", "value": 0}, + {"name": "O_WRONLY", "value": 1}, + {"name": "O_RDWR", "value": 2} + ], + "at_flags": [ + {"name": "AT_FDCWD", "value": -100}, + {"name": "AT_SYMLINK_NOFOLLOW", "value": 256}, + {"name": "AT_REMOVEDIR", "value": 512}, + {"name": "AT_EMPTY_PATH", "value": 4096} + ], + "fd_flags": [ + {"name": "FD_CLOEXEC", "value": 1}, + {"name": "FD_CLOFORK", "value": 2} + ], + "fcntl_commands": [ + {"name": "F_GETFL", "value": 3}, + {"name": "F_SETFL", "value": 4} + ], + "access_modes": [ + {"name": "F_OK", "value": 0}, + {"name": "R_OK", "value": 4}, + {"name": "W_OK", "value": 2}, + {"name": "X_OK", "value": 1} + ], + "file_modes": [ + {"name": "S_IFMT", 
"value": 61440}, + {"name": "S_IFREG", "value": 32768} + ], + "dirent_types": [ + {"name": "DT_UNKNOWN", "value": 0}, + {"name": "DT_REG", "value": 8} + ], + "seek_whence": [ + {"name": "SEEK_SET", "value": 0}, + {"name": "SEEK_CUR", "value": 1}, + {"name": "SEEK_END", "value": 2} + ] + }, "syscalls": [ {"number": 1, "name": "Open"}, {"number": 2, "name": "Close"} @@ -1898,6 +2471,62 @@ mod tests { ); } + #[test] + fn adding_process_snapshot_section_is_compatible() { + let mut old = base_snapshot(); + old.as_object_mut().unwrap().remove("process_snapshot"); + let new = base_snapshot(); + + let report = classify_compat_change(&old, &new).unwrap(); + assert!(report.breaking.is_empty(), "{report:?}"); + assert_eq!( + report.additive, + vec!["added top-level section \"process_snapshot\""] + ); + } + + #[test] + fn adding_io_multiplexing_section_is_compatible() { + let mut old = base_snapshot(); + old.as_object_mut().unwrap().remove("io_multiplexing"); + let new = base_snapshot(); + + let report = classify_compat_change(&old, &new).unwrap(); + assert!(report.breaking.is_empty(), "{report:?}"); + assert_eq!( + report.additive, + vec!["added top-level section \"io_multiplexing\""] + ); + } + + #[test] + fn adding_wakeup_events_section_is_compatible() { + let mut old = base_snapshot(); + old.as_object_mut().unwrap().remove("wakeup_events"); + let new = base_snapshot(); + + let report = classify_compat_change(&old, &new).unwrap(); + assert!(report.breaking.is_empty(), "{report:?}"); + assert_eq!( + report.additive, + vec!["added top-level section \"wakeup_events\""] + ); + } + + #[test] + fn adding_vfs_metadata_section_is_compatible() { + let mut old = base_snapshot(); + old.as_object_mut().unwrap().remove("vfs_metadata"); + let new = base_snapshot(); + + let report = classify_compat_change(&old, &new).unwrap(); + assert!(report.breaking.is_empty(), "{report:?}"); + assert_eq!( + report.additive, + vec!["added top-level section \"vfs_metadata\""] + ); + } + #[test] fn 
changed_existing_export_is_breaking() { let old = base_snapshot();