From 32c9a762af12dcdaef65abe2dc0f3c728a587670 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 1 May 2026 13:23:14 -0700 Subject: [PATCH] policy_fn: extend execve argv freeze to peer processes (#27) Signed-off-by: Cong Wang --- crates/sandlock-core/src/lib.rs | 2 +- crates/sandlock-core/src/policy_fn.rs | 19 +- crates/sandlock-core/src/sandbox_freeze.rs | 374 +++++++++++++++++++++ crates/sandlock-core/src/seccomp/notif.rs | 61 ++-- crates/sandlock-core/src/sibling_freeze.rs | 182 ---------- 5 files changed, 429 insertions(+), 209 deletions(-) create mode 100644 crates/sandlock-core/src/sandbox_freeze.rs delete mode 100644 crates/sandlock-core/src/sibling_freeze.rs diff --git a/crates/sandlock-core/src/lib.rs b/crates/sandlock-core/src/lib.rs index 9a25967..5274152 100644 --- a/crates/sandlock-core/src/lib.rs +++ b/crates/sandlock-core/src/lib.rs @@ -15,7 +15,7 @@ pub(crate) mod random; pub(crate) mod time; pub(crate) mod cow; pub(crate) mod checkpoint; -pub(crate) mod sibling_freeze; +pub(crate) mod sandbox_freeze; pub mod netlink; pub(crate) mod procfs; pub(crate) mod port_remap; diff --git a/crates/sandlock-core/src/policy_fn.rs b/crates/sandlock-core/src/policy_fn.rs index 4759b76..2e5f0aa 100644 --- a/crates/sandlock-core/src/policy_fn.rs +++ b/crates/sandlock-core/src/policy_fn.rs @@ -64,11 +64,15 @@ pub enum SyscallCategory { /// /// `argv` *is* exposed for `execve`/`execveat` and is TOCTOU-safe by /// construction: before the supervisor returns `Continue` for an -/// execve, it `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every sibling thread -/// of the calling tid so the kernel's post-Continue re-read sees the -/// same memory the supervisor inspected. Siblings are killed by the -/// kernel during execve's `de_thread` step anyway, so the pause has -/// no observable cost. See `crate::sibling_freeze`. 
+/// execve, it `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every task in the +/// sandbox — both sibling threads of the calling tid (same TGID, share +/// `mm_struct`) and peer threads in other TGIDs that may alias argv +/// pages via `MAP_SHARED` mappings or share `mm_struct` via +/// `clone(CLONE_VM)`. The kernel's post-Continue re-read therefore +/// sees the same memory the supervisor inspected. Siblings are killed +/// by the kernel during execve's `de_thread` step; peer threads are +/// detached after `NOTIF_SEND` and resume normally. See +/// `crate::sandbox_freeze`. /// /// Network fields (`host`, `port`) are TOCTOU-safe because the /// supervisor performs `connect`/`sendto`/`bind` on-behalf via @@ -89,8 +93,9 @@ pub struct SyscallEvent { pub port: Option, /// Size argument (for mmap, brk). pub size: Option, - /// Command arguments for execve/execveat. TOCTOU-safe: sibling - /// threads are frozen before the kernel re-reads. + /// Command arguments for execve/execveat. TOCTOU-safe: every task + /// in the sandbox (caller's siblings and peer processes) is frozen + /// before the kernel re-reads argv from child memory. pub argv: Option>, /// Whether the supervisor denied this syscall. pub denied: bool, diff --git a/crates/sandlock-core/src/sandbox_freeze.rs b/crates/sandlock-core/src/sandbox_freeze.rs new file mode 100644 index 0000000..d08fe1f --- /dev/null +++ b/crates/sandlock-core/src/sandbox_freeze.rs @@ -0,0 +1,374 @@ +//! Freeze sandbox threads of an execve caller before returning Continue. +//! +//! # Why +//! +//! Per `seccomp_unotify(2)`, after the supervisor responds with +//! `Continue`, the kernel re-reads the syscall's user-memory pointers +//! before executing the syscall. For execve, that means the kernel +//! re-reads `pathname` and the argv array from child memory. Any task +//! that can write to that memory in the window between the supervisor's +//! inspection and the kernel's re-read can defeat the decision +//! 
`policy_fn` made on the values it saw. +//! +//! Two distinct task classes can write that memory: +//! 1. Sibling threads of the calling tid (same TGID; share `mm_struct` +//! by definition). +//! 2. Peer processes in other TGIDs that alias the same pages via +//! `MAP_SHARED` mappings (memfd, SysV shm, shared file mmap), or +//! that share the calling task's `mm_struct` via +//! `clone(CLONE_VM)` without `CLONE_THREAD`. +//! +//! `freeze_sandbox_for_execve` closes both classes. It enumerates every +//! TGID tracked in `ProcessIndex` (the canonical sandbox membership +//! set), walks `/proc/<tgid>/task` per TGID, and `PTRACE_SEIZE` + +//! `PTRACE_INTERRUPT` every TID. Together with the supervisor's +//! sequential notification dispatch (which prevents new clone/fork +//! notifications from being processed while the freeze is in flight), +//! every entity that could mutate argv is paused before the kernel +//! re-reads. +//! +//! # Sibling vs peer cleanup +//! +//! Sibling threads (same TGID as the caller) are killed by the kernel +//! during execve's `de_thread` step, so the supervisor never has to +//! detach them — their ptrace state is reaped along with the threads. +//! +//! Peer threads (different TGID) survive execve. The supervisor must +//! `PTRACE_DETACH` them after `NOTIF_SEND` so they can resume normal +//! execution. The freeze function returns the peer TID list for that +//! purpose; siblings are not returned because they need no follow-up. +//! +//! # Failure modes (strict) +//! +//! The freeze is an invariant: if the supervisor exposed argv to +//! `policy_fn` and the callback returned Allow, the kernel must re-read +//! the same memory the supervisor inspected. We refuse to silently +//! degrade — if the freeze cannot be established, the supervisor +//! denies the execve with `EPERM` rather than letting it proceed +//! without TOCTOU protection. +//! +//! - `PTRACE_SEIZE` returns `ESRCH` for a sibling that exited between +//! enumeration and seize.
Treated as success: there is no thread to +//! race. +//! - Any other ptrace failure (YAMA `ptrace_scope` >= 2 outside the +//! parent chain, another tracer attached, kernel resource limits) +//! produces an error; siblings already frozen during the partial +//! attempt are detached so they resume normally; the caller fails +//! the syscall closed. + +use std::collections::HashSet; +use std::fs; +use std::io; + +/// Read the `State:` field from `/proc//status`. Returns the +/// single-character state code (`R`, `S`, `D`, `T`, `t`, `Z`, `X`) +/// or `None` if the file or line is unreadable. +fn read_task_state(tid: i32) -> Option { + let status = fs::read_to_string(format!("/proc/{}/status", tid)).ok()?; + let line = status.lines().find(|l| l.starts_with("State:"))?; + // Format is "State:\t ()" — find the first non-space + // character after the colon. + line.split_whitespace().nth(1).and_then(|s| s.chars().next()) +} + +/// `PTRACE_SEIZE` + `PTRACE_INTERRUPT` a single tid and wait for the +/// confirmed ptrace-stop. Returns `Ok(true)` if the tid is now +/// ptrace-stopped (and must be detached later), `Ok(false)` if the +/// tid does not need to be ptrace-attached (already exited, or held +/// in an uninterruptible kernel wait where it cannot mutate user +/// memory), or an error if ptrace refused. +/// +/// # Why we read `/proc//status` first +/// +/// A task in `TASK_UNINTERRUPTIBLE` (`State: D`) — most commonly the +/// vfork parent of the execve caller, suspended in `kernel_clone` +/// until its child execs — cannot enter ptrace-stop until its +/// kernel wait clears. For vfork specifically, the wait won't clear +/// until we send Continue, but we can't send Continue while we're +/// blocked in `waitpid` for that exact task. Naively waitpid'ing +/// would deadlock the supervisor. +/// +/// Such tasks also don't *need* to be ptrace-attached: they can't +/// run user code while in uninterruptible wait, and therefore can't +/// mutate argv. 
The kernel is already holding them for us. We skip +/// the seize entirely and return `Ok(false)` so the caller does not +/// add them to the detach list. +/// +/// On a partial-progress failure (PTRACE_SEIZE succeeded but +/// PTRACE_INTERRUPT did not), the function detaches itself before +/// returning so the caller doesn't have to track partial state. +fn seize_and_interrupt(tid: i32) -> io::Result { + // Skip tasks the kernel is already holding for us. See doc above. + if read_task_state(tid) == Some('D') { + return Ok(false); + } + + let ret = unsafe { + libc::ptrace(libc::PTRACE_SEIZE as libc::c_uint, tid, 0, 0) + }; + if ret < 0 { + let err = io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::ESRCH) { + return Ok(false); // already exited — nothing to freeze + } + return Err(err); + } + // PTRACE_SEIZE succeeded; from here, any error path must DETACH + // before returning so we don't leave the task traced-but-running. + + let ret = unsafe { + libc::ptrace(libc::PTRACE_INTERRUPT as libc::c_uint, tid, 0, 0) + }; + if ret < 0 { + let err = io::Error::last_os_error(); + let _ = unsafe { libc::ptrace(libc::PTRACE_DETACH, tid, 0, 0) }; + if err.raw_os_error() == Some(libc::ESRCH) { + return Ok(false); + } + return Err(err); + } + + // Wait for the confirmed ptrace-stop. The task was not in + // uninterruptible wait when we checked, so PTRACE_INTERRUPT + // delivers within microseconds. `__WALL` is needed because + // siblings are threads (not children of the supervisor in the + // traditional fork sense) and waitpid(2) by default ignores them. + let mut status: i32 = 0; + let _ = unsafe { libc::waitpid(tid, &mut status, libc::__WALL) }; + Ok(true) +} + +/// Detach a previously-frozen task. Used to roll back partial +/// progress when a later task refuses to be frozen, and to release +/// peer tasks after the kernel has re-read execve argv. 
+fn detach(tid: i32) { + let _ = unsafe { libc::ptrace(libc::PTRACE_DETACH, tid, 0, 0) }; +} + +/// Enumerate every TID in a TGID via `/proc//task/`. Linux +/// resolves `/proc//task` to the same directory, so this +/// works whether `tgid` is the leader's PID or any TID in the group. +fn list_threads_of_tgid(tgid: i32) -> io::Result> { + let dir = fs::read_dir(format!("/proc/{}/task", tgid))?; + let mut tids = Vec::new(); + for entry in dir { + let entry = match entry { + Ok(e) => e, + Err(_) => continue, + }; + let name = entry.file_name(); + let name_str = match name.to_str() { + Some(s) => s, + None => continue, + }; + if let Ok(tid) = name_str.parse::() { + tids.push(tid); + } + } + Ok(tids) +} + +/// Read the TGID containing `tid` from `/proc//status`. +fn read_tgid_of_tid(tid: i32) -> io::Result { + let status = fs::read_to_string(format!("/proc/{}/status", tid))?; + for line in status.lines() { + if let Some(rest) = line.strip_prefix("Tgid:") { + return rest.trim().parse().map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("parse Tgid: {}", e), + ) + }); + } + } + Err(io::Error::new( + io::ErrorKind::InvalidData, + "no Tgid: line in /proc//status", + )) +} + +/// Outcome of a sandbox-wide freeze. The supervisor must call +/// `detach_peers(&outcome.peer_tids)` after `NOTIF_SEND` to let the +/// peer processes resume. +#[derive(Debug, Default)] +pub(crate) struct SandboxFreeze { + /// TIDs in *other* TGIDs that were ptrace-stopped. These survive + /// execve and must be detached so they can resume normal + /// execution. Siblings of `caller_tid` are deliberately not in + /// this list — execve's `de_thread` kills them and the kernel + /// reaps their ptrace state automatically. + pub peer_tids: Vec, +} + +/// Freeze every sandbox thread that could mutate execve argv before +/// the kernel re-reads it. 
+/// +/// Walks every TGID in `processes` (and defensively the caller's own +/// TGID), enumerates each TGID's threads via `/proc//task/`, +/// and `PTRACE_SEIZE` + `PTRACE_INTERRUPT` every TID except +/// `caller_tid`. Sibling threads of `caller_tid` and peer threads in +/// other TGIDs are both covered. +/// +/// Strict semantics: if any task refuses to be frozen, every +/// already-frozen task is detached and the error is propagated. The +/// caller is expected to deny the execve with `EPERM`, preserving the +/// invariant that exposed argv is always TOCTOU-safe. +/// +/// On success, returns the list of *peer* TIDs that survive execve and +/// must be detached after `NOTIF_SEND`. Sibling TIDs are not returned +/// because they die in `de_thread`. +pub(crate) fn freeze_sandbox_for_execve( + processes: &crate::seccomp::state::ProcessIndex, + caller_tid: i32, +) -> io::Result { + let caller_tgid = read_tgid_of_tid(caller_tid)?; + + // ProcessIndex is the canonical sandbox membership set. The + // supervisor's `register_child_if_new` runs before per-syscall + // handlers, so the caller's TGID is guaranteed to be present. + let tgids: HashSet = processes.pids_snapshot(); + + let mut sibling_tids: Vec = Vec::new(); + let mut peer_tids: Vec = Vec::new(); + + for tgid in &tgids { + // /proc//task may disappear if the TGID exited between + // snapshot and walk — that's fine, no threads to freeze. + let tids = match list_threads_of_tgid(*tgid) { + Ok(t) => t, + Err(_) => continue, + }; + for tid in tids { + if tid == caller_tid { + continue; + } + match seize_and_interrupt(tid) { + Ok(true) => { + if *tgid == caller_tgid { + sibling_tids.push(tid); + } else { + peer_tids.push(tid); + } + } + Ok(false) => continue, // already exited — fine + Err(e) => { + // Roll back: detach every task we already froze + // (siblings + peers) so they resume normally. 
+ for t in &sibling_tids { + detach(*t); + } + for t in &peer_tids { + detach(*t); + } + return Err(e); + } + } + } + } + + Ok(SandboxFreeze { peer_tids }) +} + +/// Detach peer TIDs after the kernel has re-read execve argv. Errors +/// are ignored: a peer that already exited returns ESRCH, which is +/// harmless. +pub(crate) fn detach_peers(peer_tids: &[i32]) { + for tid in peer_tids { + detach(*tid); + } +} + +/// Helper called from the dispatch hot path. Returns true if the +/// notification is for an execve-class syscall whose Continue response +/// requires freezing siblings. +pub(crate) fn requires_freeze_on_continue(syscall_nr: i64) -> bool { + syscall_nr == libc::SYS_execve || syscall_nr == libc::SYS_execveat +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::seccomp::state::ProcessIndex; + + #[test] + fn list_threads_of_tgid_includes_self() { + // Our own /proc/self/task always exists and always contains + // at least our own tid. + let our_tid = unsafe { libc::syscall(libc::SYS_gettid) } as i32; + let tids = list_threads_of_tgid(our_tid).unwrap(); + assert!(tids.contains(&our_tid)); + } + + #[test] + fn requires_freeze_only_for_exec() { + assert!(requires_freeze_on_continue(libc::SYS_execve)); + assert!(requires_freeze_on_continue(libc::SYS_execveat)); + assert!(!requires_freeze_on_continue(libc::SYS_openat)); + assert!(!requires_freeze_on_continue(libc::SYS_connect)); + } + + /// Regression test for the cross-process TOCTOU concern raised on + /// issue #27 (Changaco): a peer process in the sandbox — different + /// TGID, possibly aliasing argv pages via shared memory — must also + /// be frozen before the kernel re-reads execve argv. Sibling-thread + /// freeze alone (`freeze_siblings_for_execve`) does not cover this. + /// + /// This test registers a peer process in `ProcessIndex` and verifies + /// that `freeze_sandbox_for_execve` puts it in ptrace-stop, the same + /// way `freeze_siblings_for_execve` does for siblings. 
+ #[test] + fn freeze_sandbox_includes_peer_process() { + use std::process::{Command, Stdio}; + + let mut peer = Command::new("/bin/sleep") + .arg("60") + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("spawn peer sleep"); + let peer_pid = peer.id() as i32; + + // Give the peer a moment to actually be running. + std::thread::sleep(std::time::Duration::from_millis(50)); + + // Register the peer in a fresh ProcessIndex (mirrors what the + // supervisor's clone/fork notification handler would do). + let processes = ProcessIndex::new(); + processes + .register(peer_pid) + .expect("register peer in ProcessIndex"); + + let our_tid = unsafe { libc::syscall(libc::SYS_gettid) } as i32; + + let outcome = freeze_sandbox_for_execve(&processes, our_tid) + .expect("freeze_sandbox_for_execve"); + + // Peer's TID is its own TGID (single-threaded sleep), and it's + // a different TGID from ours, so it should be in peer_tids. + assert!( + outcome.peer_tids.contains(&peer_pid), + "peer pid {} should be in peer_tids: {:?}", + peer_pid, + outcome.peer_tids + ); + + // Verify the peer is actually ptrace-stopped via /proc. + let status = std::fs::read_to_string(format!("/proc/{}/status", peer_pid)) + .expect("read peer status"); + let state_line = status + .lines() + .find(|l| l.starts_with("State:")) + .expect("State: line"); + assert!( + state_line.contains("t (tracing stop)") || state_line.contains("T (stopped)"), + "peer should be ptrace-stopped, got: {}", + state_line + ); + + // Cleanup: detach the peer so it can resume and be killed. + detach_peers(&outcome.peer_tids); + let _ = peer.kill(); + let _ = peer.wait(); + } +} diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index da170f2..b99bbd2 100644 --- a/crates/sandlock-core/src/seccomp/notif.rs +++ b/crates/sandlock-core/src/seccomp/notif.rs @@ -781,8 +781,9 @@ async fn emit_policy_event( // in static Landlock rules. 
// // argv IS extracted for execve/execveat: the supervisor freezes - // sibling threads before returning Continue (sibling_freeze module), - // so the post-Continue re-read sees the same memory we read here. + // every task in the sandbox (siblings + peers) before returning + // Continue (sandbox_freeze module), so the post-Continue re-read + // sees the same memory we read here. // // Network fields are TOCTOU-safe because connect/sendto/bind are // performed on-behalf via pidfd_getfd; the kernel never re-reads @@ -916,35 +917,57 @@ async fn handle_notification( } } - // TOCTOU-close for execve (issue #27): freeze sibling threads of - // the calling tid before the kernel re-reads pathname/argv from - // child memory. Cheap because the kernel's de_thread step in - // execve kills the siblings anyway — we're just stopping them - // moments earlier, closing the race window for the supervisor's - // argv inspection in policy_fn. + // TOCTOU-close for execve (issue #27): freeze every sandbox task + // that could mutate argv before the kernel re-reads it after + // Continue. This covers two distinct writer classes: + // 1. Sibling threads of the calling tid (same TGID, share mm). + // 2. Peer processes in other TGIDs that alias argv pages via + // MAP_SHARED mappings or share mm via clone(CLONE_VM). + // Sibling-thread freeze alone closed (1) but not (2), as raised + // by Changaco on issue #27. // - // Only relevant when we're sending Continue: a denial response - // (Errno) means the kernel never re-reads, so no freeze needed. + // Only relevant when sending Continue: a denial (Errno) means the + // kernel never re-reads, so no freeze is needed. // - // Strict on failure: if we cannot freeze the siblings, we cannot + // Strict on failure: if we cannot establish the freeze, we cannot // uphold the argv-safety invariant, so we deny the execve with // EPERM rather than letting it through unprotected. 
+ // + // Sibling threads die in execve's de_thread; the kernel reaps + // their ptrace state. Peer threads survive — we detach them after + // NOTIF_SEND so they resume normally. let nr = notif.data.nr as i64; + let mut peer_tids_to_detach: Vec = Vec::new(); if matches!(action, NotifAction::Continue) - && crate::sibling_freeze::requires_freeze_on_continue(nr) + && crate::sandbox_freeze::requires_freeze_on_continue(nr) { - if let Err(e) = crate::sibling_freeze::freeze_siblings_for_execve(notif.pid as i32) { - eprintln!( - "sandlock: argv-safety freeze failed for pid {}: {} \ - — denying execve to preserve TOCTOU invariant", - notif.pid, e - ); - action = NotifAction::Errno(libc::EPERM); + match crate::sandbox_freeze::freeze_sandbox_for_execve( + &ctx.processes, + notif.pid as i32, + ) { + Ok(outcome) => { + peer_tids_to_detach = outcome.peer_tids; + } + Err(e) => { + eprintln!( + "sandlock: argv-safety freeze failed for pid {}: {} \ + — denying execve to preserve TOCTOU invariant", + notif.pid, e + ); + action = NotifAction::Errno(libc::EPERM); + } } } // Ignore error — child may have exited between recv and response. let _ = send_response(fd, notif.id, action); + + // Detach peer processes after NOTIF_SEND so they resume. Siblings + // of the caller's TGID are intentionally not detached: they die in + // execve's de_thread and the kernel reaps the ptrace state. + if !peer_tids_to_detach.is_empty() { + crate::sandbox_freeze::detach_peers(&peer_tids_to_detach); + } } // ============================================================ diff --git a/crates/sandlock-core/src/sibling_freeze.rs b/crates/sandlock-core/src/sibling_freeze.rs deleted file mode 100644 index 42365c0..0000000 --- a/crates/sandlock-core/src/sibling_freeze.rs +++ /dev/null @@ -1,182 +0,0 @@ -//! Freeze sibling threads of an execve caller before returning Continue. -//! -//! # Why -//! -//! Per `seccomp_unotify(2)`, after the supervisor responds with -//! 
`Continue`, the kernel re-reads the syscall's user-memory pointers -//! before executing the syscall. For execve, that means the kernel -//! re-reads `pathname` and the argv array from child memory. A racing -//! sibling thread of the calling tid can mutate that memory in the -//! window between the supervisor's response and the kernel's re-read, -//! defeating any decision policy_fn made on the values it inspected. -//! -//! This module closes the window for execve specifically. Before the -//! supervisor returns Continue, every sibling tid is `PTRACE_SEIZE`d -//! and `PTRACE_INTERRUPT`ed (which puts it in group-stop). The kernel -//! re-reads while no sibling is running. Then the supervisor releases -//! its hold on the seccomp notification. -//! -//! # Why this is essentially free for execve -//! -//! `execve(2)` already terminates all sibling threads as part of -//! `de_thread`. Freezing them moments earlier doesn't change observable -//! behavior — the kernel kills them anyway. We don't need to detach -//! explicitly; the siblings die with the rest of the thread group -//! during execve, and ptrace records associated with them are reaped -//! by the kernel. -//! -//! # Failure modes (strict) -//! -//! The freeze is an invariant: if the supervisor exposed argv to -//! `policy_fn` and the callback returned Allow, the kernel must re-read -//! the same memory the supervisor inspected. We refuse to silently -//! degrade — if the freeze cannot be established, the supervisor -//! denies the execve with `EPERM` rather than letting it proceed -//! without TOCTOU protection. -//! -//! - `PTRACE_SEIZE` returns `ESRCH` for a sibling that exited between -//! enumeration and seize. Treated as success: there is no thread to -//! race. -//! - Any other ptrace failure (YAMA `ptrace_scope` >= 2 outside the -//! parent chain, another tracer attached, kernel resource limits) -//! produces an error; siblings already frozen during the partial -//! 
attempt are detached so they resume normally; the caller fails -//! the syscall closed. - -use std::fs; -use std::io; - -/// Enumerate sibling tids of `caller_tid` from `/proc//task/`. -/// `caller_tid` is excluded from the result. -fn list_siblings(caller_tid: i32) -> io::Result> { - let dir = fs::read_dir(format!("/proc/{}/task", caller_tid))?; - let mut tids = Vec::new(); - for entry in dir { - let entry = match entry { - Ok(e) => e, - Err(_) => continue, - }; - let name = entry.file_name(); - let name_str = match name.to_str() { - Some(s) => s, - None => continue, - }; - let tid: i32 = match name_str.parse() { - Ok(t) => t, - Err(_) => continue, - }; - if tid != caller_tid { - tids.push(tid); - } - } - Ok(tids) -} - -/// `PTRACE_SEIZE` + `PTRACE_INTERRUPT` a single tid and wait for the -/// resulting group-stop. Returns `Ok(true)` if the tid is now stopped, -/// `Ok(false)` if the tid had already exited (ESRCH; nothing to do), -/// or an error if ptrace refused. -/// -/// On a partial-progress failure (PTRACE_SEIZE succeeded but -/// PTRACE_INTERRUPT did not), the function detaches itself before -/// returning so the caller doesn't have to track partial state. -fn seize_and_interrupt(tid: i32) -> io::Result { - let ret = unsafe { - libc::ptrace(libc::PTRACE_SEIZE as libc::c_uint, tid, 0, 0) - }; - if ret < 0 { - let err = io::Error::last_os_error(); - if err.raw_os_error() == Some(libc::ESRCH) { - return Ok(false); // already exited — nothing to freeze - } - return Err(err); - } - // PTRACE_SEIZE succeeded; from here, any error path must DETACH - // before returning so we don't leave the sibling traced-but-running. - - let ret = unsafe { - libc::ptrace(libc::PTRACE_INTERRUPT as libc::c_uint, tid, 0, 0) - }; - if ret < 0 { - let err = io::Error::last_os_error(); - let _ = unsafe { libc::ptrace(libc::PTRACE_DETACH, tid, 0, 0) }; - if err.raw_os_error() == Some(libc::ESRCH) { - return Ok(false); - } - return Err(err); - } - - // Wait for the ptrace-stop. 
WNOHANG would race; we want to block - // until the sibling is genuinely stopped. __WALL is needed because - // siblings are threads (not children of the supervisor in the - // traditional fork sense) and waitpid(2) by default ignores them. - let mut status: i32 = 0; - let _ = unsafe { libc::waitpid(tid, &mut status, libc::__WALL) }; - Ok(true) -} - -/// Detach a previously-frozen sibling. Used to roll back partial -/// progress when a later sibling refuses to be frozen. -fn detach(tid: i32) { - let _ = unsafe { libc::ptrace(libc::PTRACE_DETACH, tid, 0, 0) }; -} - -/// Freeze all sibling threads of `caller_tid`. -/// -/// Strict semantics: if any sibling refuses to be frozen, all -/// successfully-frozen siblings are detached (so they resume normally) -/// and the error is propagated. The caller is expected to deny the -/// execve with EPERM, preserving the invariant that exposed argv is -/// always TOCTOU-safe. -/// -/// On success, returns the number of siblings frozen. The supervisor -/// does not actively detach on the success path — siblings die during -/// execve's `de_thread`, and the kernel reaps the ptrace state. -pub(crate) fn freeze_siblings_for_execve(caller_tid: i32) -> io::Result { - let siblings = list_siblings(caller_tid)?; - let mut frozen: Vec = Vec::with_capacity(siblings.len()); - for tid in siblings { - match seize_and_interrupt(tid) { - Ok(true) => frozen.push(tid), - Ok(false) => continue, // already exited — fine - Err(e) => { - // Roll back: detach everything we already froze so they - // resume normally, then fail. - for ftid in &frozen { - detach(*ftid); - } - return Err(e); - } - } - } - Ok(frozen.len()) -} - -/// Helper called from the dispatch hot path. Returns true if the -/// notification is for an execve-class syscall whose Continue response -/// requires freezing siblings. 
-pub(crate) fn requires_freeze_on_continue(syscall_nr: i64) -> bool { - syscall_nr == libc::SYS_execve || syscall_nr == libc::SYS_execveat -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn list_siblings_excludes_self() { - // Our own /proc/self/task always exists; just check we don't - // see our own tid in the list. - let our_tid = unsafe { libc::syscall(libc::SYS_gettid) } as i32; - let siblings = list_siblings(our_tid).unwrap(); - assert!(!siblings.contains(&our_tid)); - } - - #[test] - fn requires_freeze_only_for_exec() { - assert!(requires_freeze_on_continue(libc::SYS_execve)); - assert!(requires_freeze_on_continue(libc::SYS_execveat)); - assert!(!requires_freeze_on_continue(libc::SYS_openat)); - assert!(!requires_freeze_on_continue(libc::SYS_connect)); - } -}