Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 44 additions & 2 deletions crates/kernel/src/fork.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use crate::fd::{FdEntry, FdTable, OpenFileDescRef};
use crate::lock::LockTable;
use crate::memory::{MappedRegion, MemoryManager};
use crate::ofd::{FileType, OfdTable, OpenFileDesc};
use crate::process::{Process, ProcessState};
use crate::process::{Process, ProcessState, ShmMapping};
use crate::signal::{SignalAction, SignalHandler, SignalState};
use crate::socket::SocketTable;
use crate::terminal::{NCCS, TerminalState, WinSize};
Expand All @@ -38,6 +38,7 @@ const MAX_OFDS: u32 = 65536;
const MAX_ENV_VARS: u32 = 65536;
const MAX_ARGV: u32 = 65536;
const MAX_PATH_LEN: usize = 1048576; // 1 MiB
const MAX_SHM_MAPPINGS: usize = 4096;
const MAX_STRING_LEN: usize = 1048576; // 1 MiB

// ── Writer helper ───────────────────────────────────────────────────────────
Expand Down Expand Up @@ -534,6 +535,14 @@ pub fn serialize_fork_state(proc: &Process, buf: &mut [u8]) -> Result<usize, Err
}
}

// ── SysV shared-memory attachments (v7) ──
w.write_u32(proc.shm_mappings.len() as u32)?;
for mapping in &proc.shm_mappings {
w.write_u32(mapping.addr as u32)?;
w.write_u32(mapping.shmid as u32)?;
w.write_u32(mapping.size as u32)?;
}

// ── Patch total_size ──
let total = w.pos as u32;
w.patch_u32(total_size_offset, total);
Expand Down Expand Up @@ -927,6 +936,22 @@ pub fn deserialize_fork_state(buf: &[u8], child_pid: u32) -> Result<Process, Err
}
}

// ── SysV shared-memory attachments (v7) ──
let mut shm_mappings = Vec::new();
if r.remaining() >= 4 {
let count = r.read_u32()? as usize;
if count > MAX_SHM_MAPPINGS {
return Err(Errno::EINVAL);
}
shm_mappings = Vec::with_capacity(count);
for _ in 0..count {
let addr = r.read_u32()? as usize;
let shmid = r.read_u32()? as i32;
let size = r.read_u32()? as usize;
shm_mappings.push(ShmMapping { addr, shmid, size });
}
}

Ok(Process {
pid: child_pid,
ppid,
Expand Down Expand Up @@ -988,6 +1013,7 @@ pub fn deserialize_fork_state(buf: &[u8], child_pid: u32) -> Result<Process, Err
// registered as a host display target. fbDOOM doesn't fork
// mid-game; documented limitation in the design doc.
fb_binding: None,
shm_mappings,
fork_count: 0,
})
}
Expand Down Expand Up @@ -1380,6 +1406,7 @@ pub fn deserialize_exec_state(buf: &[u8], pid: u32) -> Result<Process, Errno> {
// exec wipes any prior framebuffer binding — the new program
// must open and mmap /dev/fb0 itself.
fb_binding: None,
shm_mappings: Vec::new(),
// The fork counter exists as a kernel-side regression guardrail.
// Resetting on exec keeps semantics simple: the next spawn-from-this-pid
// test starts from a clean slate. The plan's regression check inspects
Expand Down Expand Up @@ -1527,6 +1554,19 @@ mod tests {
}
}

#[test]
fn test_fork_state_preserves_shm_mappings() {
    // Parent holds exactly one SysV attachment before it is serialized.
    let mut parent = Process::new(1);
    parent.record_shm_mapping(0x20000, 17, 4096);

    // Round-trip the parent through the fork wire format into child pid 42.
    let mut scratch = vec![0u8; 64 * 1024];
    let len = serialize_fork_state(&parent, &mut scratch).unwrap();
    let child = deserialize_fork_state(&scratch[..len], 42).unwrap();

    // The child must see the same attachment at the same address.
    let inherited = child.shm_mapping_at(0x20000).unwrap();
    assert_eq!(inherited.shmid, 17);
    assert_eq!(inherited.size, 4096);
}

#[test]
fn test_buffer_too_small() {
let proc = Process::new(1);
Expand All @@ -1546,7 +1586,8 @@ mod tests {

#[test]
fn test_exec_roundtrip_default_process() {
let proc = Process::new(1);
let mut proc = Process::new(1);
proc.record_shm_mapping(0x20000, 17, 4096);
let mut buf = vec![0u8; 64 * 1024];
let written = serialize_exec_state(&proc, &mut buf).unwrap();
assert!(written > 12);
Expand All @@ -1556,6 +1597,7 @@ mod tests {
assert_eq!(restored.pid, 1);
assert_eq!(restored.ppid, 0); // default ppid
assert_eq!(restored.signals.pending, 0);
assert!(restored.shm_mappings.is_empty());
}

#[test]
Expand Down
19 changes: 19 additions & 0 deletions crates/kernel/src/ipc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,17 @@ impl IpcTable {
Ok(seg.segsz)
}

/// Register an attachment that a child inherited through fork.
///
/// No permission check is performed: the parent already passed `shmat`,
/// and fork merely adds another process reference to the same segment.
/// On success the segment's attach count is incremented, `lpid` is set to
/// `pid`, and the attach time is refreshed.
///
/// # Errors
///
/// Returns `Errno::EINVAL` when no segment with id `shmid` exists.
pub fn shm_attach_inherited(&mut self, shmid: i32, pid: u32) -> Result<(), Errno> {
    let Some(segment) = self.shm_segments.get_mut(&shmid) else {
        return Err(Errno::EINVAL);
    };

    segment.nattch += 1;
    segment.lpid = pid as i32;
    segment.atime = crate::current_time_secs();
    Ok(())
}

/// Read a chunk of shared memory segment data into a buffer.
/// Returns bytes written.
pub fn shm_read_chunk(&self, shmid: i32, offset: u32, buf: &mut [u8]) -> Result<u32, Errno> {
Expand Down Expand Up @@ -1446,9 +1457,17 @@ mod tests {
assert_eq!(info.nattch, 1);
assert_eq!(info.lpid, 42);

t.shm_attach_inherited(id, 43).unwrap();
let info = t.shmctl(id, IPC_STAT, 1, 0, 0).unwrap().unwrap();
assert_eq!(info.nattch, 2);
assert_eq!(info.lpid, 43);

// Detach
t.shmdt(id, 42).unwrap();
let info = t.shmctl(id, IPC_STAT, 1, 0, 0).unwrap().unwrap();
assert_eq!(info.nattch, 1);
t.shmdt(id, 43).unwrap();
let info = t.shmctl(id, IPC_STAT, 1, 0, 0).unwrap().unwrap();
assert_eq!(info.nattch, 0);
}

Expand Down
59 changes: 59 additions & 0 deletions crates/kernel/src/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,14 @@ pub struct FbBinding {
pub fmt: u32,
}

/// Per-process SysV shared-memory attachment.
///
/// One record ties a process address to the segment attached there, so the
/// attachment can later be found or removed by address alone.
/// NOTE(review): the fork-state serializer writes all three fields as `u32`,
/// so values outside that range would not survive a fork round-trip.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ShmMapping {
/// Process address of the attachment; this is the lookup key.
pub addr: usize,
/// Identifier of the attached SysV segment.
pub shmid: i32,
/// Size of the attachment (presumably in bytes; confirm against callers).
pub size: usize,
}

/// Per-thread state within a process.
#[derive(Debug, Clone)]
pub struct ThreadInfo {
Expand Down Expand Up @@ -401,6 +409,9 @@ pub struct Process {
/// Live mmap of `/dev/fb0`, if any. `Some` between successful
/// `mmap` and the matching `munmap`/process-exit/exec.
pub fb_binding: Option<FbBinding>,
/// SysV shared-memory attachments keyed by the process virtual address
/// returned from `shmat`.
pub shm_mappings: Vec<ShmMapping>,
/// Counts how many times this process has called fork() (parent side, on success).
/// Read-only from outside the kernel via `kernel_get_fork_count`.
/// Used as a regression guardrail by the spawn test suite to confirm
Expand Down Expand Up @@ -485,6 +496,7 @@ impl Process {
procfs_bufs: Vec::new(),
has_exec: false,
fb_binding: None,
shm_mappings: Vec::new(),
fork_count: 0,
}
}
Expand Down Expand Up @@ -565,6 +577,26 @@ impl Process {
self.threads.iter_mut().find(|t| t.tid == tid)
}

/// Store the SysV shared-memory attachment living at `addr`.
///
/// If an attachment is already recorded for `addr` it is overwritten in
/// place; otherwise a new entry is appended. At most one entry per address
/// is therefore kept.
pub fn record_shm_mapping(&mut self, addr: usize, shmid: i32, size: usize) {
    let entry = ShmMapping { addr, shmid, size };
    match self.shm_mappings.iter().position(|m| m.addr == addr) {
        Some(slot) => self.shm_mappings[slot] = entry,
        None => self.shm_mappings.push(entry),
    }
}

/// Return a copy of the SysV shared-memory attachment recorded at `addr`,
/// or `None` when nothing is attached at that process address.
pub fn shm_mapping_at(&self, addr: usize) -> Option<ShmMapping> {
    let found = self.shm_mappings.iter().find(|m| m.addr == addr);
    found.copied()
}

/// Forget the SysV shared-memory attachment recorded at `addr` and hand it
/// back to the caller. Returns `None` when no attachment exists there.
///
/// The entry is taken out with `swap_remove`, so the relative order of the
/// remaining attachments is not preserved.
pub fn remove_shm_mapping(&mut self, addr: usize) -> Option<ShmMapping> {
    self.shm_mappings
        .iter()
        .position(|m| m.addr == addr)
        .map(|slot| self.shm_mappings.swap_remove(slot))
}

/// True if `tid` names the process's main thread. The main thread's TID
/// equals the process PID (Linux convention) and is not tracked in
/// [`Process::threads`]; per-thread signal state for the main thread lives
Expand Down Expand Up @@ -941,6 +973,33 @@ mod tests {
assert_eq!(proc.fork_count(), 0);
}

#[test]
fn shm_mapping_bookkeeping_is_keyed_by_process_addr() {
    const ADDR: usize = 0x20000;
    let mut proc = Process::new(1);

    // First attachment at ADDR is stored verbatim.
    let first = ShmMapping {
        addr: ADDR,
        shmid: 7,
        size: 4096,
    };
    proc.record_shm_mapping(first.addr, first.shmid, first.size);
    assert_eq!(proc.shm_mapping_at(ADDR), Some(first));

    // Recording again at the same address replaces rather than appends.
    let replacement = ShmMapping {
        addr: ADDR,
        shmid: 8,
        size: 8192,
    };
    proc.record_shm_mapping(replacement.addr, replacement.shmid, replacement.size);
    assert_eq!(proc.shm_mappings.len(), 1);

    // Removal yields the replacement and leaves nothing behind.
    assert_eq!(proc.remove_shm_mapping(ADDR), Some(replacement));
    assert_eq!(proc.shm_mapping_at(ADDR), None);
}

#[test]
fn spawn_child_basic_inherits_cwd_and_returns_pid() {
use crate::process_table::ProcessTable;
Expand Down
12 changes: 12 additions & 0 deletions crates/kernel/src/process_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@ fn bump_inherited_resource_refcounts(child: &Process) {
}
}
}

let ipc = unsafe { crate::ipc::global_ipc_table() };
for mapping in &child.shm_mappings {
let _ = ipc.shm_attach_inherited(mapping.shmid, child.pid);
}
}

/// Build the fork-only `fork_pipe_replay` table: a list of (read_fd,
Expand Down Expand Up @@ -410,6 +415,13 @@ impl ProcessTable {
let pshared = unsafe { crate::pshared::global_pshared_table() };
pshared.cleanup_process(pid);

// Drop SysV shared-memory attachments that were still live when the
// process exited or was reaped.
let ipc = unsafe { crate::ipc::global_ipc_table() };
for mapping in &proc.shm_mappings {
let _ = ipc.shmdt(mapping.shmid, pid);
}

if retain_limbo_leader && proc.pgid == pid && self.group_has_member(pid) {
self.processes.insert(pid, Self::limbo_process_from(&proc));
}
Expand Down
50 changes: 49 additions & 1 deletion crates/kernel/src/wasm_api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2121,6 +2121,10 @@ pub extern "C" fn kernel_exec_setup(pid: u32) -> i32 {
match crate::fork::deserialize_exec_state(&buf[..written], pid) {
Ok(new_proc) => {
table.get_mut(pid).map(|p| {
let ipc = unsafe { crate::ipc::global_ipc_table() };
for mapping in &p.shm_mappings {
let _ = ipc.shmdt(mapping.shmid, pid);
}
*p = new_proc;
p.has_exec = true;
});
Expand Down Expand Up @@ -3204,7 +3208,7 @@ fn dispatch_channel_syscall(nr: u32, args: &[i64; 6]) -> i32 {
}
// SYS_SHMAT (345), SYS_SHMDT (346): intercepted by host for process memory management
345 => kernel_ipc_shmat(a1, a2, a3),
346 => kernel_ipc_shmdt(a1),
346 => kernel_ipc_shmdt_addr(a1 as usize),
347 => {
// SYS_SHMCTL: (shmid, cmd, buf_ptr)
let ipc = unsafe { crate::ipc::global_ipc_table() };
Expand Down Expand Up @@ -4057,6 +4061,34 @@ pub extern "C" fn kernel_ipc_shmat(shmid: i32, _shmaddr: i32, flags: i32) -> i32
}
}

/// Remember which SysV segment the host-managed mmap placed at `addr` in the
/// current process.
///
/// An existing record for the same address is replaced. Always returns `0`;
/// `shmid` is stored as given and is not validated here.
#[unsafe(no_mangle)]
pub extern "C" fn kernel_ipc_shm_record_mapping(addr: usize, shmid: i32, size: u32) -> i32 {
    let len = size as usize;
    // NOTE(review): soundness relies on `get_process()`'s contract, which is
    // defined outside this view.
    let (_guard, current) = unsafe { get_process() };
    current.record_shm_mapping(addr, shmid, len);
    0
}

/// Resolve the SysV shared-memory attachment recorded at `addr` in the
/// current process.
///
/// On success writes an 8-byte little-endian record `{ i32 shmid, u32 size }`
/// to `out_ptr` and returns `0`. Returns `-EFAULT` when `out_ptr` is null and
/// `-EINVAL` when no attachment is recorded at `addr`.
#[unsafe(no_mangle)]
pub extern "C" fn kernel_ipc_shm_lookup_mapping(addr: usize, out_ptr: *mut u8) -> i32 {
    if out_ptr.is_null() {
        return -(Errno::EFAULT as i32);
    }

    let (_guard, current) = unsafe { get_process() };
    let mapping = match current.shm_mapping_at(addr) {
        Some(found) => found,
        None => return -(Errno::EINVAL as i32),
    };

    // Assemble the record locally, then publish it with a single copy.
    let mut record = [0u8; 8];
    record[..4].copy_from_slice(&mapping.shmid.to_le_bytes());
    record[4..].copy_from_slice(&(mapping.size as u32).to_le_bytes());

    // SAFETY: only null-ness is checked above; the caller must supply at
    // least 8 writable bytes at `out_ptr`.
    let dest = unsafe { core::slice::from_raw_parts_mut(out_ptr, record.len()) };
    dest.copy_from_slice(&record);
    0
}

/// Detach from shared memory segment.
/// Host should call kernel_ipc_shm_write_chunk first to sync data back.
#[unsafe(no_mangle)]
Expand All @@ -4069,6 +4101,22 @@ pub extern "C" fn kernel_ipc_shmdt(shmid: i32) -> i32 {
}
}

/// Detach the current process from the shared-memory segment attached at
/// `addr`.
///
/// The host is expected to call `kernel_ipc_shm_lookup_mapping` and sync the
/// data back before invoking this. The per-process record is removed first;
/// the segment-level detach is then performed with the recorded `shmid`.
///
/// Returns `0` on success, `-EINVAL` when nothing is recorded at `addr`, or
/// the negated errno reported by the segment-level detach.
#[unsafe(no_mangle)]
pub extern "C" fn kernel_ipc_shmdt_addr(addr: usize) -> i32 {
    // Acquisition order (IPC table, then process) is kept as-is on purpose.
    let ipc = unsafe { crate::ipc::global_ipc_table() };
    let (_guard, current) = unsafe { get_process() };

    let detached = match current.remove_shm_mapping(addr) {
        Some(mapping) => mapping,
        None => return -(Errno::EINVAL as i32),
    };

    if let Err(err) = ipc.shmdt(detached.shmid, current.pid) {
        return -(err as i32);
    }
    0
}

/// Read a chunk of shared memory segment data into scratch area.
/// Returns bytes written to out_ptr.
#[unsafe(no_mangle)]
Expand Down
4 changes: 2 additions & 2 deletions docs/plans/2026-05-20-rust-owned-host-logic-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ path.
| Done / PR #534 | Rust-owned syscall marshalling descriptors | `crates/shared::host_abi` owns simple pointer-argument descriptors; `dump-abi` generates `SYSCALL_ARGS`; TS host keeps memory copies but reads generated descriptors. | The old TS `SYSCALL_ARGS` table and syscall-number size switches are gone. `poll`/`ppoll`, SysV message prefix, `semop`, and `msgrcv` copy-back adjustments are metadata fields. Nested-pointer syscalls (`readv`/`writev`/preadv/pwritev) stay on dedicated TS paths. | Shared unit tests for descriptor ordering/high-risk sizes/nested-pointer exclusion; xtask ABI tests; `bash scripts/check-abi-version.sh`; generated ABI vitest; host build; kernel lib tests. |
| Done / PR #534 follow-up | Extended host-visible syscall numbers and names | Add Rust/shared metadata for ABI-visible syscall numbers still hardcoded in host TS but not currently in `shared::Syscall`, such as `getrandom`, `clone`, `futex`, `ppoll`, `pselect6`, epoll, `exit_group`, `waitid`, `msync`, preadv/pwritev, mqueue, SysV IPC, `sched_yield`, `fallocate`, timers, and `thread_cancel`. Generate TS bindings, logging names, and snapshot coverage. | Host TS no longer defines literal syscall numbers for this set, and syscall trace names are generated from Rust-owned metadata. Existing `HOST_INTERCEPTED_SYSCALLS` remains separate for fork/exec/spawn because those are caught before normal dispatch. Public behavior unchanged. | Rust metadata uniqueness tests; xtask compatibility tests; `bash scripts/check-abi-version.sh update` + check; generated ABI vitest; host build; kernel lib tests. |
| Done / stacked PR | Rust-defined host adapter manifest | Add a compact Rust-defined manifest describing ABI version, required host adapter protocol version, required/optional exports, worker protocol features, and channel metadata. JS validates it during kernel boot. | Boot fails earlier with clear errors when the host/kernel contract is incompatible. No Worker creation or Wasm instantiation moves out of JS. | Rust manifest serialization tests; ABI snapshot check; vitest boot validation cases; Node/browser worker-entry smoke if boot code changes. |
| In progress / stacked PR | Process lifecycle cleanup consolidation | Rust `ProcessTable` now owns parent lookup, wait-target matching, wait-status derivation, host-crash zombie marking, authorized child reaping, and thread-exit clear-tid metadata. TS keeps blocked waiter queues, Worker/memory cleanup, and the actual clear-tid memory write/futex wake because the ctid pointer names process memory. Remaining audit: thread channel/Worker allocation and free-list lifecycle, host timer cancellation, TCP listener target policy, and shared-memory mapping cleanup. | Kernel owns process lifecycle invariants that do not require Worker identity; JS owns Worker termination, memory objects, crash observation, and platform callbacks. | ProcessTable unit tests; fork/exec/spawn/clone/wait tests; crash/trap tests; browser parity smoke when worker entries change. |
| Planned | IPC/resource cleanup in Rust | Move remaining pure SysV IPC and POSIX mqueue lifetime/cleanup state into Rust-owned process cleanup paths. | `remove_process()` owns IPC cleanup; JS only wakes or schedules blocked channels when host primitives are involved. | SysV IPC and mqueue Rust tests plus host integration/e2e coverage for blocking and cleanup. |
| In progress / stacked PR | Process lifecycle cleanup consolidation | Rust `ProcessTable` now owns parent lookup, wait-target matching, wait-status derivation, host-crash zombie marking, authorized child reaping, thread-exit clear-tid metadata, and SysV shared-memory attachment metadata. TS keeps blocked waiter queues, Worker/memory cleanup, and process-memory writes/futex wakeups because those pointers name guest memory. Remaining audit: thread channel/Worker allocation and free-list lifecycle, host timer cancellation, and TCP listener target policy. | Kernel owns process lifecycle invariants that do not require Worker identity; JS owns Worker termination, memory objects, crash observation, and platform callbacks. | ProcessTable unit tests; fork/exec/spawn/clone/wait tests; crash/trap tests; browser parity smoke when worker entries change. |
| In progress / stacked PR | IPC/resource cleanup in Rust | Rust `Process` now records `shmat` address -> segment metadata, inherits it through fork, clears it across exec setup, and detaches live mappings from `remove_process()`. TS still copies bytes between guest memory and kernel SysV segments because only the host can address guest `Memory`. | `remove_process()` owns IPC attachment cleanup; JS only handles guest-memory transfer and host primitive wake/schedule work. | SysV IPC and mqueue Rust tests plus host integration/e2e coverage for blocking and cleanup. |
| Planned | Readiness metadata improvements | Replace broad host inference with kernel-emitted readiness events for pipe/socket/poll/select cases where the kernel already knows state changes. | JS still owns timers/retry queues/`Atomics.waitAsync`, but readiness decisions are less inferred from syscall numbers. No extra Wasm round trip per syscall. | Pipe/socket/poll/select/ppoll/pselect tests; browser bridge smoke for affected wake paths; performance comparison before removing broad fallback logic. |
| Planned | VFS policy split | Keep backend I/O, OPFS/IndexedDB/fetch, Node `fs`, and lazy archive materialization in JS. Move permission and policy decisions into Rust where process uid/gid/umask/fd context is authoritative. | Guest-visible policy is enforced in Rust; host adapters only perform platform operations requested through a checked contract. | VFS unit tests, uid/gid/permission tests, host-fs metadata tests, default mount tests, Node/browser parity tests. |
| Planned | Procfs/process snapshot schema metadata | Generate binary process snapshot schema/constants consumed by TS UI decoding, or replace TS decoding with a Rust-exported stable formatter if that does not add hot-path cost. | TS no longer hand-decodes undocumented offsets for kernel process snapshot data. Procfs text formatting remains Rust-owned. | Rust procfs/process snapshot tests; generated ABI vitest; UI/kernel-host tests that consume snapshots. |
Expand Down
Loading
Loading