diff --git a/api/src/syscall/mod.rs b/api/src/syscall/mod.rs index a0e0e251..da1ff60a 100644 --- a/api/src/syscall/mod.rs +++ b/api/src/syscall/mod.rs @@ -428,6 +428,11 @@ pub fn handle_syscall(uctx: &mut UserContext) { uctx.arg3(), uctx.arg4(), ), + Sysno::clone3 => sys_clone3( + uctx, + uctx.arg0() as _, // args_ptr + uctx.arg1() as _, // args_size + ), #[cfg(target_arch = "x86_64")] Sysno::fork => sys_fork(uctx), Sysno::exit => sys_exit(uctx.arg0() as _), diff --git a/api/src/syscall/task/clone.rs b/api/src/syscall/task/clone.rs index 06b50e2c..c11f7922 100644 --- a/api/src/syscall/task/clone.rs +++ b/api/src/syscall/task/clone.rs @@ -3,229 +3,313 @@ use alloc::sync::Arc; use axerrno::{AxError, AxResult}; use axfs::FS_CONTEXT; use axhal::uspace::UserContext; -use axtask::{AxTaskExt, current, spawn_task}; +use axtask::{current, spawn_task, AxTaskExt}; use bitflags::bitflags; use kspin::SpinNoIrq; use linux_raw_sys::general::*; use starry_core::{ mm::copy_from_kernel, - task::{AsThread, ProcessData, Thread, add_task_to_table}, + task::{add_task_to_table, AsThread, ProcessData, Thread}, }; use starry_process::Pid; use starry_signal::Signo; use starry_vm::VmMutPtr; use crate::{ - file::{FD_TABLE, FileLike, PidFd}, + file::{FileLike, PidFd, FD_TABLE}, task::new_user_task, }; bitflags! { - /// Options for use with [`sys_clone`]. + /// Options for use with [`sys_clone`] and [`sys_clone3`]. #[derive(Debug, Clone, Copy, Default)] - struct CloneFlags: u32 { - /// The calling process and the child process run in the same - /// memory space. - const VM = CLONE_VM; - /// The caller and the child process share the same filesystem - /// information. - const FS = CLONE_FS; - /// The calling process and the child process share the same file - /// descriptor table. - const FILES = CLONE_FILES; - /// The calling process and the child process share the same table - /// of signal handlers. - const SIGHAND = CLONE_SIGHAND; + pub struct CloneFlags: u64 { + /// The calling process and the child process run in the same memory space. + const VM = CLONE_VM as u64; + /// The caller and the child process share the same filesystem information. + const FS = CLONE_FS as u64; + /// The calling process and the child process share the same file descriptor table. + const FILES = CLONE_FILES as u64; + /// The calling process and the child process share the same table of signal handlers. + const SIGHAND = CLONE_SIGHAND as u64; /// Sets pidfd to the child process's PID file descriptor. - const PIDFD = CLONE_PIDFD; - /// If the calling process is being traced, then trace the child - /// also. - const PTRACE = CLONE_PTRACE; - /// The execution of the calling process is suspended until the - /// child releases its virtual memory resources via a call to - /// execve(2) or _exit(2) (as with vfork(2)). - const VFORK = CLONE_VFORK; - /// The parent of the new child (as returned by getppid(2)) - /// will be the same as that of the calling process. - const PARENT = CLONE_PARENT; - /// The child is placed in the same thread group as the calling - /// process. - const THREAD = CLONE_THREAD; + const PIDFD = CLONE_PIDFD as u64; + /// If the calling process is being traced, then trace the child also. + const PTRACE = CLONE_PTRACE as u64; + /// The execution of the calling process is suspended until the child releases + /// its virtual memory resources via a call to execve(2) or _exit(2) (as with vfork(2)). + const VFORK = CLONE_VFORK as u64; + /// The parent of the new child (as returned by getppid(2)) will be the same + /// as that of the calling process. + const PARENT = CLONE_PARENT as u64; + /// The child is placed in the same thread group as the calling process. + const THREAD = CLONE_THREAD as u64; /// The cloned child is started in a new mount namespace. - const NEWNS = CLONE_NEWNS; - /// The child and the calling process share a single list of System - /// V semaphore adjustment values - const SYSVSEM = CLONE_SYSVSEM; + const NEWNS = CLONE_NEWNS as u64; + /// The child and the calling process share a single list of System V + /// semaphore adjustment values. + const SYSVSEM = CLONE_SYSVSEM as u64; /// The TLS (Thread Local Storage) descriptor is set to tls. - const SETTLS = CLONE_SETTLS; + const SETTLS = CLONE_SETTLS as u64; /// Store the child thread ID in the parent's memory. - const PARENT_SETTID = CLONE_PARENT_SETTID; - /// Clear (zero) the child thread ID in child memory when the child - /// exits, and do a wakeup on the futex at that address. - const CHILD_CLEARTID = CLONE_CHILD_CLEARTID; - /// A tracing process cannot force `CLONE_PTRACE` on this child - /// process. - const UNTRACED = CLONE_UNTRACED; + const PARENT_SETTID = CLONE_PARENT_SETTID as u64; + /// Clear (zero) the child thread ID in child memory when the child exits, + /// and do a wakeup on the futex at that address. + const CHILD_CLEARTID = CLONE_CHILD_CLEARTID as u64; + /// A tracing process cannot force `CLONE_PTRACE` on this child process. + const UNTRACED = CLONE_UNTRACED as u64; /// Store the child thread ID in the child's memory. - const CHILD_SETTID = CLONE_CHILD_SETTID; + const CHILD_SETTID = CLONE_CHILD_SETTID as u64; /// Create the process in a new cgroup namespace. - const NEWCGROUP = CLONE_NEWCGROUP; + const NEWCGROUP = CLONE_NEWCGROUP as u64; /// Create the process in a new UTS namespace. - const NEWUTS = CLONE_NEWUTS; + const NEWUTS = CLONE_NEWUTS as u64; /// Create the process in a new IPC namespace. - const NEWIPC = CLONE_NEWIPC; + const NEWIPC = CLONE_NEWIPC as u64; /// Create the process in a new user namespace. - const NEWUSER = CLONE_NEWUSER; + const NEWUSER = CLONE_NEWUSER as u64; /// Create the process in a new PID namespace. - const NEWPID = CLONE_NEWPID; + const NEWPID = CLONE_NEWPID as u64; /// Create the process in a new network namespace. - const NEWNET = CLONE_NEWNET; + const NEWNET = CLONE_NEWNET as u64; /// The new process shares an I/O context with the calling process. - const IO = CLONE_IO; + const IO = CLONE_IO as u64; + /// Clear signal handlers on clone (since Linux 5.5). + const CLEAR_SIGHAND = 0x100000000u64; + /// Clone into specific cgroup (since Linux 5.7). + const INTO_CGROUP = 0x200000000u64; } } -pub fn sys_clone( - uctx: &UserContext, - flags: u32, - stack: usize, - parent_tid: usize, - #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] child_tid: usize, - tls: usize, - #[cfg(not(any(target_arch = "x86_64", target_arch = "loongarch64")))] child_tid: usize, -) -> AxResult { - const FLAG_MASK: u32 = 0xff; - let exit_signal = flags & FLAG_MASK; - let mut flags = CloneFlags::from_bits_truncate(flags & !FLAG_MASK); - if flags.contains(CloneFlags::VFORK) { - debug!("sys_clone: CLONE_VFORK slow path"); - flags.remove(CloneFlags::VM); - } +/// Unified arguments for clone/clone3/fork/vfork. +#[derive(Debug, Clone, Copy, Default)] +pub struct CloneArgs { + pub flags: CloneFlags, + pub exit_signal: u64, + pub stack: usize, + pub tls: usize, + pub parent_tid: usize, + pub child_tid: usize, + pub pidfd: usize, +} - debug!( - "sys_clone <= flags: {flags:?}, exit_signal: {exit_signal}, stack: {stack:#x}, ptid: \ - {parent_tid:#x}, ctid: {child_tid:#x}, tls: {tls:#x}" - ); +impl CloneArgs { + fn validate(&self) -> AxResult<()> { + let Self { + flags, + exit_signal, + .. + } = self; - if exit_signal != 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) { - return Err(AxError::InvalidInput); - } - if flags.contains(CloneFlags::THREAD) && !flags.contains(CloneFlags::VM | CloneFlags::SIGHAND) { - return Err(AxError::InvalidInput); - } - if flags.contains(CloneFlags::PIDFD | CloneFlags::PARENT_SETTID) { - return Err(AxError::InvalidInput); - } - let exit_signal = Signo::from_repr(exit_signal as u8); + if *exit_signal > 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) { + return Err(AxError::InvalidInput); + } + if flags.contains(CloneFlags::THREAD) + && !flags.contains(CloneFlags::VM | CloneFlags::SIGHAND) + { + return Err(AxError::InvalidInput); + } + if flags.contains(CloneFlags::SIGHAND) && !flags.contains(CloneFlags::VM) { + return Err(AxError::InvalidInput); + } + if flags.contains(CloneFlags::VFORK) && flags.contains(CloneFlags::THREAD) { + return Err(AxError::InvalidInput); + } + if *exit_signal >= 64 { + return Err(AxError::InvalidInput); + } - let mut new_uctx = *uctx; - if stack != 0 { - new_uctx.set_sp(stack); - } - if flags.contains(CloneFlags::SETTLS) { - new_uctx.set_tls(tls); - } - new_uctx.set_retval(0); + let namespace_flags = CloneFlags::NEWNS + | CloneFlags::NEWIPC + | CloneFlags::NEWNET + | CloneFlags::NEWPID + | CloneFlags::NEWUSER + | CloneFlags::NEWUTS + | CloneFlags::NEWCGROUP; - let set_child_tid = if flags.contains(CloneFlags::CHILD_SETTID) { - child_tid - } else { - 0 - }; + if flags.intersects(namespace_flags) { + warn!("sys_clone/sys_clone3: namespace flags detected, stub support only"); + } - let curr = current(); - let old_proc_data = &curr.as_thread().proc_data; + Ok(()) + } - let mut new_task = new_user_task(&curr.name(), new_uctx, set_child_tid); + pub fn do_clone(self, uctx: &UserContext) -> AxResult { + self.validate()?; - let tid = new_task.id().as_u64() as Pid; - if flags.contains(CloneFlags::PARENT_SETTID) { - (parent_tid as *mut Pid).vm_write(tid).ok(); - } + let Self { + mut flags, + exit_signal, + stack, + tls, + parent_tid, + child_tid, + pidfd, + } = self; - let new_proc_data = if flags.contains(CloneFlags::THREAD) { - new_task - .ctx_mut() - .set_page_table_root(old_proc_data.aspace.lock().page_table_root()); - old_proc_data.clone() - } else { - let proc = if flags.contains(CloneFlags::PARENT) { - old_proc_data.proc.parent().ok_or(AxError::InvalidInput)? - } else { - old_proc_data.proc.clone() + if flags.contains(CloneFlags::VFORK) { + debug!("do_clone: CLONE_VFORK slow path"); + flags.remove(CloneFlags::VM); } - .fork(tid); - let aspace = if flags.contains(CloneFlags::VM) { - old_proc_data.aspace.clone() + debug!( + "do_clone <= flags: {:?}, exit_signal: {}, stack: {:#x}, tls: {:#x}", + flags, exit_signal, stack, tls + ); + + let exit_signal = if exit_signal > 0 { + Signo::from_repr(exit_signal as u8) } else { - let mut aspace = old_proc_data.aspace.lock(); - let aspace = aspace.try_clone()?; - copy_from_kernel(&mut aspace.lock())?; - aspace + None }; - new_task - .ctx_mut() - .set_page_table_root(aspace.lock().page_table_root()); - let signal_actions = if flags.contains(CloneFlags::SIGHAND) { - old_proc_data.signal.actions.clone() + let mut new_uctx = *uctx; + if stack != 0 { + new_uctx.set_sp(stack); + } + if flags.contains(CloneFlags::SETTLS) { + new_uctx.set_tls(tls); + } + new_uctx.set_retval(0); + + let set_child_tid = if flags.contains(CloneFlags::CHILD_SETTID) { + child_tid } else { - Arc::new(SpinNoIrq::new(old_proc_data.signal.actions.lock().clone())) + 0 }; - let proc_data = ProcessData::new( - proc, - old_proc_data.exe_path.read().clone(), - old_proc_data.cmdline.read().clone(), - aspace, - signal_actions, - exit_signal, - ); - proc_data.set_umask(old_proc_data.umask()); - // Inherit heap pointers from parent to ensure child's heap state is consistent after fork - proc_data.set_heap_top(old_proc_data.get_heap_top()); - { - let mut scope = proc_data.scope.write(); - if flags.contains(CloneFlags::FILES) { - FD_TABLE.scope_mut(&mut scope).clone_from(&FD_TABLE); + let curr = current(); + let old_proc_data = &curr.as_thread().proc_data; + + let mut new_task = new_user_task(&curr.name(), new_uctx, set_child_tid); + + let tid = new_task.id().as_u64() as Pid; + if flags.contains(CloneFlags::PARENT_SETTID) && parent_tid != 0 { + (parent_tid as *mut Pid).vm_write(tid).ok(); + } + + let new_proc_data = if flags.contains(CloneFlags::THREAD) { + new_task + .ctx_mut() + .set_page_table_root(old_proc_data.aspace.lock().page_table_root()); + old_proc_data.clone() + } else { + let proc = if flags.contains(CloneFlags::PARENT) { + old_proc_data.proc.parent().ok_or(AxError::InvalidInput)? } else { - FD_TABLE - .scope_mut(&mut scope) - .write() - .clone_from(&FD_TABLE.read()); + old_proc_data.proc.clone() } + .fork(tid); + + let aspace = if flags.contains(CloneFlags::VM) { + old_proc_data.aspace.clone() + } else { + let mut aspace = old_proc_data.aspace.lock(); + let aspace = aspace.try_clone()?; + copy_from_kernel(&mut aspace.lock())?; + aspace + }; + new_task + .ctx_mut() + .set_page_table_root(aspace.lock().page_table_root()); - if flags.contains(CloneFlags::FS) { - FS_CONTEXT.scope_mut(&mut scope).clone_from(&FS_CONTEXT); + let signal_actions = if flags.contains(CloneFlags::SIGHAND) { + old_proc_data.signal.actions.clone() + } else if flags.contains(CloneFlags::CLEAR_SIGHAND) { + Arc::new(SpinNoIrq::new(Default::default())) } else { - FS_CONTEXT - .scope_mut(&mut scope) - .lock() - .clone_from(&FS_CONTEXT.lock()); + Arc::new(SpinNoIrq::new(old_proc_data.signal.actions.lock().clone())) + }; + + let proc_data = ProcessData::new( + proc, + old_proc_data.exe_path.read().clone(), + old_proc_data.cmdline.read().clone(), + aspace, + signal_actions, + exit_signal, + ); + proc_data.set_umask(old_proc_data.umask()); + proc_data.set_heap_top(old_proc_data.get_heap_top()); + + { + let mut scope = proc_data.scope.write(); + if flags.contains(CloneFlags::FILES) { + FD_TABLE.scope_mut(&mut scope).clone_from(&FD_TABLE); + } else { + FD_TABLE + .scope_mut(&mut scope) + .write() + .clone_from(&FD_TABLE.read()); + } + + if flags.contains(CloneFlags::FS) { + FS_CONTEXT.scope_mut(&mut scope).clone_from(&FS_CONTEXT); + } else { + FS_CONTEXT + .scope_mut(&mut scope) + .lock() + .clone_from(&FS_CONTEXT.lock()); + } } + + proc_data + }; + + new_proc_data.proc.add_thread(tid); + + if flags.contains(CloneFlags::PIDFD) && pidfd != 0 { + let pidfd_obj = PidFd::new(&new_proc_data); + let fd = pidfd_obj.add_to_fd_table(true)?; + (pidfd as *mut i32).vm_write(fd)?; } - proc_data - }; + let thr = Thread::new(tid, new_proc_data); + if flags.contains(CloneFlags::CHILD_CLEARTID) && child_tid != 0 { + thr.set_clear_child_tid(child_tid); + } + *new_task.task_ext_mut() = Some(unsafe { AxTaskExt::from_impl(thr) }); - new_proc_data.proc.add_thread(tid); + let task = spawn_task(new_task); + add_task_to_table(&task); - if flags.contains(CloneFlags::PIDFD) { - let pidfd = PidFd::new(&new_proc_data); - (parent_tid as *mut i32).vm_write(pidfd.add_to_fd_table(true)?)?; + Ok(tid as _) } +} + +pub fn sys_clone( + uctx: &UserContext, + flags: u32, + stack: usize, + parent_tid: usize, + #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] child_tid: usize, + tls: usize, + #[cfg(not(any(target_arch = "x86_64", target_arch = "loongarch64")))] child_tid: usize, +) -> AxResult { + const FLAG_MASK: u32 = 0xff; + let clone_flags = CloneFlags::from_bits_truncate((flags & !FLAG_MASK) as u64); + let exit_signal = (flags & FLAG_MASK) as u64; - let thr = Thread::new(tid, new_proc_data); - if flags.contains(CloneFlags::CHILD_CLEARTID) { - thr.set_clear_child_tid(child_tid); + if clone_flags.contains(CloneFlags::PIDFD | CloneFlags::PARENT_SETTID) { + return Err(AxError::InvalidInput); } - *new_task.task_ext_mut() = Some(unsafe { AxTaskExt::from_impl(thr) }); - let task = spawn_task(new_task); - add_task_to_table(&task); + let args = CloneArgs { + flags: clone_flags, + exit_signal, + stack, + tls, + parent_tid, + child_tid, + // In sys_clone, parent_tid is reused for pidfd when CLONE_PIDFD is set + pidfd: if clone_flags.contains(CloneFlags::PIDFD) { + parent_tid + } else { + 0 + }, + }; - Ok(tid as _) + args.do_clone(uctx) } #[cfg(target_arch = "x86_64")] diff --git a/api/src/syscall/task/clone3.rs b/api/src/syscall/task/clone3.rs new file mode 100644 index 00000000..29248c58 --- /dev/null +++ b/api/src/syscall/task/clone3.rs @@ -0,0 +1,79 @@ +use axerrno::{AxError, AxResult}; +use axhal::uspace::UserContext; +use starry_vm::VmPtr; + +use super::clone::{CloneArgs, CloneFlags}; + +/// Structure passed to clone3() system call. +#[repr(C)] +#[derive(Debug, Clone, Copy, Default, bytemuck::Zeroable, bytemuck::AnyBitPattern)] +pub struct Clone3Args { + pub flags: u64, + pub pidfd: u64, + pub child_tid: u64, + pub parent_tid: u64, + pub exit_signal: u64, + pub stack: u64, + pub stack_size: u64, + pub tls: u64, + pub set_tid: u64, + pub set_tid_size: u64, + pub cgroup: u64, +} + +const MIN_CLONE_ARGS_SIZE: usize = core::mem::size_of::() * 8; + +impl TryFrom for CloneArgs { + type Error = axerrno::AxError; + + fn try_from(args: Clone3Args) -> AxResult { + if args.set_tid != 0 || args.set_tid_size != 0 { + warn!("sys_clone3: set_tid/set_tid_size not supported, ignoring"); + } + if args.cgroup != 0 { + warn!("sys_clone3: cgroup parameter not supported, ignoring"); + } + + let flags = CloneFlags::from_bits_truncate(args.flags); + + let stack = if args.stack > 0 { + if args.stack_size > 0 { + (args.stack + args.stack_size) as usize + } else { + args.stack as usize + } + } else { + 0 + }; + + Ok(CloneArgs { + flags, + exit_signal: args.exit_signal, + stack, + tls: args.tls as usize, + parent_tid: args.parent_tid as usize, + child_tid: args.child_tid as usize, + pidfd: args.pidfd as usize, + }) + } +} + +pub fn sys_clone3(uctx: &UserContext, args: *const Clone3Args, size: usize) -> AxResult { + debug!("sys_clone3 <= args: {args:p}, size: {size}"); + + if size < MIN_CLONE_ARGS_SIZE { + warn!("sys_clone3: size {size} too small, minimum is {MIN_CLONE_ARGS_SIZE}"); + return Err(AxError::InvalidInput); + } + + if size > core::mem::size_of::() { + debug!("sys_clone3: size {size} larger than expected, using known fields only"); + } + + let clone3_args = args.vm_read()?; + + debug!("sys_clone3: args = {clone3_args:?}"); + + let clone_args = CloneArgs::try_from(clone3_args)?; + clone_args.do_clone(uctx) +} diff --git a/api/src/syscall/task/mod.rs b/api/src/syscall/task/mod.rs index a6e77afc..2143a0e8 100644 --- a/api/src/syscall/task/mod.rs +++ b/api/src/syscall/task/mod.rs @@ -1,4 +1,5 @@ mod clone; +mod clone3; mod ctl; mod execve; mod exit; @@ -7,4 +8,6 @@ mod schedule; mod thread; mod wait; -pub use self::{clone::*, ctl::*, execve::*, exit::*, job::*, schedule::*, thread::*, wait::*}; +pub use self::{ + clone::*, clone3::*, ctl::*, execve::*, exit::*, job::*, schedule::*, thread::*, wait::*, +};