From 060607a797065e94fdcbc93241e61bd40fa80fe9 Mon Sep 17 00:00:00 2001 From: Ryan Breen Date: Fri, 6 Feb 2026 07:23:26 -0500 Subject: [PATCH 1/2] fix(arm64): fix fork crash and enable telnet over TCP - Fix walk_mapped_pages() to iterate L0[0-512] on ARM64 (was 0-256, an x86_64 assumption that skipped stack pages at L0[511]) - Unify stack constants in exec_process ARM64 paths to use USER_STACK_REGION_START instead of hardcoded 0x0000_FFFF_FF01_0000 - Add DATA_ABORT diagnostic: print TTBR0_EL1 for process identification - Add fork diagnostic: print child stack mapping and L4 frame address - Simplify telnetd to use direct socket I/O (skip PTY for now) - Add EOF handling in init_shell for non-PID-1 instances (telnet sessions) - Add TCP debug tracing via serial_println for connection debugging - Enable port forwarding (2323) in run.sh for ARM64 Co-Authored-By: Claude Opus 4.6 --- kernel/src/arch_impl/aarch64/exception.rs | 14 +- kernel/src/memory/process_memory.rs | 8 +- kernel/src/net/ipv4.rs | 4 + kernel/src/net/tcp.rs | 19 +++ kernel/src/process/manager.rs | 38 +++-- run.sh | 2 +- userspace/bin/services/telnetd.rs | 192 +++------------------- userspace/examples/init_shell.rs | 14 +- 8 files changed, 97 insertions(+), 194 deletions(-) diff --git a/kernel/src/arch_impl/aarch64/exception.rs b/kernel/src/arch_impl/aarch64/exception.rs index 0d20c1e0..452b1167 100644 --- a/kernel/src/arch_impl/aarch64/exception.rs +++ b/kernel/src/arch_impl/aarch64/exception.rs @@ -88,16 +88,22 @@ pub extern "C" fn handle_sync_exception(frame: *mut Aarch64ExceptionFrame, esr: // Check if from userspace (EL0) - SPSR[3:0] indicates source EL let from_el0 = (frame_ref.spsr & 0xF) == 0; + let ttbr0: u64; + unsafe { + core::arch::asm!("mrs {}, ttbr0_el1", out(reg) ttbr0, options(nomem, nostack)); + } + crate::serial_println!( + "[DATA_ABORT] TTBR0_EL1={:#x}, FAR={:#x}, ESR={:#x}", + ttbr0, + far, + esr + ); if from_el0 { // From userspace - terminate the process with SIGSEGV crate::serial_println!("[exception] Terminating userspace process with SIGSEGV"); // Get current TTBR0 to find the process - let ttbr0: u64; - unsafe { - core::arch::asm!("mrs {}, ttbr0_el1", out(reg) ttbr0, options(nomem, nostack)); - } let page_table_phys = ttbr0 & !0xFFFF_0000_0000_0FFF; // Find and terminate the process diff --git a/kernel/src/memory/process_memory.rs b/kernel/src/memory/process_memory.rs index ee0da36a..cb579699 100644 --- a/kernel/src/memory/process_memory.rs +++ b/kernel/src/memory/process_memory.rs @@ -1019,8 +1019,12 @@ impl ProcessPageTable { let l4_virt = phys_offset + self.level_4_frame.start_address().as_u64(); let l4_table = &*(l4_virt.as_ptr() as *const PageTable); - // Walk L4 entries 0-255 (userspace only, 256-511 is kernel) - for l4_idx in 0..256u64 { + // Walk userspace L4 entries (x86_64: 0-255; ARM64 TTBR0 uses 0-511) + #[cfg(target_arch = "x86_64")] + let l4_range = 0..256u64; + #[cfg(target_arch = "aarch64")] + let l4_range = 0..512u64; + for l4_idx in l4_range { let l4_entry = &l4_table[l4_idx as usize]; if l4_entry.is_unused() || !l4_entry.flags().contains(PageTableFlags::PRESENT) { continue; diff --git a/kernel/src/net/ipv4.rs b/kernel/src/net/ipv4.rs index d1dc55a6..9d5672be 100644 --- a/kernel/src/net/ipv4.rs +++ b/kernel/src/net/ipv4.rs @@ -173,6 +173,10 @@ pub fn handle_ipv4(eth_frame: &EthernetFrame, ip: &Ipv4Packet) { } } PROTOCOL_TCP => { + crate::serial_println!("[IPv4] TCP packet from {}.{}.{}.{} -> {}.{}.{}.{} ({} bytes)", + ip.src_ip[0], ip.src_ip[1], ip.src_ip[2], ip.src_ip[3], + ip.dst_ip[0], ip.dst_ip[1], ip.dst_ip[2], ip.dst_ip[3], + ip.payload.len()); super::tcp::handle_tcp(ip, ip.payload); } PROTOCOL_UDP => { diff --git a/kernel/src/net/tcp.rs b/kernel/src/net/tcp.rs index 61f94bda..b5920795 100644 --- a/kernel/src/net/tcp.rs +++ b/kernel/src/net/tcp.rs @@ -417,10 +417,16 @@ pub fn handle_tcp(ip: &Ipv4Packet, data: &[u8]) { Some(h) => h, None => { log::warn!("TCP: Failed to parse header"); + crate::serial_println!("[TCP] Failed to parse header from {}.{}.{}.{}", ip.src_ip[0], ip.src_ip[1], ip.src_ip[2], ip.src_ip[3]); return; } }; + crate::serial_println!("[TCP] RX {}.{}.{}.{}:{} -> port {} flags={:?} seq={} ack={} len={}", + ip.src_ip[0], ip.src_ip[1], ip.src_ip[2], ip.src_ip[3], + header.src_port, header.dst_port, header.flags, + header.seq_num, header.ack_num, payload.len()); + log::debug!( "TCP: Received {} bytes from {}.{}.{}.{}:{} -> port {} seq={} ack={} flags={:?}", payload.len(), @@ -452,7 +458,9 @@ pub fn handle_tcp(ip: &Ipv4Packet, data: &[u8]) { // No existing connection - check for listening socket { let mut listeners = TCP_LISTENERS.lock(); + crate::serial_println!("[TCP] Checking listeners for port {} (have {} listeners)", header.dst_port, listeners.len()); if let Some(listener) = listeners.get_mut(&header.dst_port) { + crate::serial_println!("[TCP] Found listener on port {}", header.dst_port); if header.flags.syn && !header.flags.ack { // SYN received on listening socket - add to pending queue handle_syn_for_listener(listener, ip.src_ip, &header, &config); @@ -483,6 +491,7 @@ pub fn handle_tcp(ip: &Ipv4Packet, data: &[u8]) { } // No connection and no listener - send RST + crate::serial_println!("[TCP] No socket for port {}, sending RST", header.dst_port); log::debug!("TCP: No socket for port {}, sending RST", header.dst_port); send_rst(&config, ip.src_ip, &header); } @@ -738,7 +747,12 @@ fn handle_syn_for_listener( header: &TcpHeader, config: &super::NetConfig, ) { + crate::serial_println!("[TCP] handle_syn_for_listener: port {} from {}.{}.{}.{}:{} (pending={}/{})", + header.dst_port, src_ip[0], src_ip[1], src_ip[2], src_ip[3], header.src_port, + listener.pending.len(), listener.backlog); + if listener.pending.len() >= listener.backlog { + crate::serial_println!("[TCP] Backlog full, sending RST for port {}", header.dst_port); log::warn!("TCP: Backlog full, sending RST for port {}", header.dst_port); // Send RST to tell client the connection was refused send_rst(config, src_ip, header); @@ -758,6 +772,7 @@ fn handle_syn_for_listener( recv_next: header.seq_num.wrapping_add(1), // +1 for SYN }); + crate::serial_println!("[TCP] SYN received on port {}, sending SYN+ACK (ISN={})", header.dst_port, send_isn); log::debug!("TCP: SYN received, sending SYN+ACK"); // Send SYN+ACK @@ -824,7 +839,10 @@ pub fn send_tcp_packet( payload, ); + crate::serial_println!("[TCP] TX -> {}.{}.{}.{}:{} from port {} flags={:?} seq={} ack={}", + dst_ip[0], dst_ip[1], dst_ip[2], dst_ip[3], dst_port, src_port, flags, seq_num, ack_num); if let Err(e) = super::send_ipv4(dst_ip, PROTOCOL_TCP, &packet) { + crate::serial_println!("[TCP] Failed to send packet: {}", e); log::warn!("TCP: Failed to send packet: {}", e); } } @@ -907,6 +925,7 @@ pub fn tcp_listen( ref_count: core::sync::atomic::AtomicUsize::new(1), }); + crate::serial_println!("[TCP] Listening on port {} (backlog={}, pid={:?})", local_port, backlog, owner_pid); log::info!("TCP: Listening on port {}", local_port); Ok(()) diff --git a/kernel/src/process/manager.rs b/kernel/src/process/manager.rs index 50518114..5dba7c1a 100644 --- a/kernel/src/process/manager.rs +++ b/kernel/src/process/manager.rs @@ -1433,6 +1433,12 @@ impl ProcessManager { log::error!("ARM64 fork: Failed to map user stack: {}", e); "Failed to map user stack in child's page table" })?; + crate::serial_println!( + "ARM64 fork: child stack mapped {:#x}-{:#x} in child page table (L4 frame={:#x})", + child_stack_bottom.as_u64(), + child_stack_top.as_u64(), + child_page_table_ref.level_4_frame().start_address().as_u64() + ); // Allocate a globally unique thread ID for the child's main thread let child_thread_id = crate::task::thread::allocate_thread_id(); @@ -2775,6 +2781,7 @@ impl ProcessManager { program_name: Option<&str>, argv: &[&[u8]], ) -> Result<(u64, u64), &'static str> { + use crate::arch_impl::aarch64::constants::USER_STACK_REGION_START; use crate::memory::arch_stub::{Page, PageTableFlags, Size4KiB}; log::info!( @@ -2829,12 +2836,12 @@ impl ProcessManager { log::warn!("ARM64: Failed to unmap old user data pages: {}", e); } + let user_stack_top = USER_STACK_REGION_START; + { const STACK_SIZE: usize = 64 * 1024; - // ARM64 canonical user address (not x86_64's 0x7FFF_FF01_0000) - const STACK_TOP: u64 = 0x0000_FFFF_FF01_0000; - let unmap_bottom = VirtAddr::new(STACK_TOP - STACK_SIZE as u64); - let unmap_top = VirtAddr::new(STACK_TOP); + let unmap_bottom = VirtAddr::new(user_stack_top - STACK_SIZE as u64); + let unmap_top = VirtAddr::new(user_stack_top); if let Err(e) = new_page_table.unmap_user_pages(unmap_bottom, unmap_top) { log::warn!("ARM64: Failed to unmap old stack pages: {}", e); } @@ -2850,11 +2857,9 @@ impl ProcessManager { ); const USER_STACK_SIZE: usize = 64 * 1024; - // ARM64 canonical user address (not x86_64's 0x7FFF_FF01_0000) - const USER_STACK_TOP: u64 = 0x0000_FFFF_FF01_0000; - let stack_bottom = VirtAddr::new(USER_STACK_TOP - USER_STACK_SIZE as u64); - let stack_top = VirtAddr::new(USER_STACK_TOP); + let stack_bottom = VirtAddr::new(user_stack_top - USER_STACK_SIZE as u64); + let stack_top = VirtAddr::new(user_stack_top); log::info!("exec_process_with_argv [ARM64]: Mapping stack pages into new process page table"); let start_page = Page::::containing_address(stack_bottom); @@ -2873,7 +2878,7 @@ impl ProcessManager { )?; } - let initial_rsp = self.setup_argv_on_stack(&new_page_table, USER_STACK_TOP, argv)?; + let initial_rsp = self.setup_argv_on_stack(&new_page_table, user_stack_top, argv)?; log::info!( "exec_process_with_argv [ARM64]: argc/argv set up on stack, SP_EL0={:#x}", @@ -2987,6 +2992,7 @@ impl ProcessManager { elf_data: &[u8], program_name: Option<&str>, ) -> Result { + use crate::arch_impl::aarch64::constants::USER_STACK_REGION_START; use crate::memory::arch_stub::{Page, PageTableFlags, Size4KiB}; log::info!( @@ -3049,12 +3055,12 @@ impl ProcessManager { } // Unmap the stack region before mapping new stack pages + let user_stack_top = USER_STACK_REGION_START; + { const STACK_SIZE: usize = 64 * 1024; // 64KB stack - // ARM64 canonical user address (not x86_64's 0x7FFF_FF01_0000) - const STACK_TOP: u64 = 0x0000_FFFF_FF01_0000; - let unmap_bottom = VirtAddr::new(STACK_TOP - STACK_SIZE as u64); - let unmap_top = VirtAddr::new(STACK_TOP); + let unmap_bottom = VirtAddr::new(user_stack_top - STACK_SIZE as u64); + let unmap_top = VirtAddr::new(user_stack_top); if let Err(e) = new_page_table.unmap_user_pages(unmap_bottom, unmap_top) { log::warn!("ARM64: Failed to unmap old stack pages: {}", e); } @@ -3074,12 +3080,10 @@ impl ProcessManager { // Allocate and map stack directly into the new process page table const USER_STACK_SIZE: usize = 64 * 1024; // 64KB stack - // ARM64 canonical user address (not x86_64's 0x7FFF_FF01_0000) - const USER_STACK_TOP: u64 = 0x0000_FFFF_FF01_0000; // Calculate stack range - let stack_bottom = VirtAddr::new(USER_STACK_TOP - USER_STACK_SIZE as u64); - let stack_top = VirtAddr::new(USER_STACK_TOP); + let stack_bottom = VirtAddr::new(user_stack_top - USER_STACK_SIZE as u64); + let stack_top = VirtAddr::new(user_stack_top); // Map stack pages into the NEW process page table log::info!("exec_process [ARM64]: Mapping stack pages into new process page table"); diff --git a/run.sh b/run.sh index 4975d5ed..e653a5bd 100755 --- a/run.sh +++ b/run.sh @@ -183,7 +183,7 @@ if [ "$ARCH" = "arm64" ]; then $DISPLAY_OPTS \ $DISK_OPTS \ -device virtio-net-device,netdev=net0 \ - -netdev user,id=net0 \ + -netdev user,id=net0,hostfwd=tcp::2323-:2323 \ -serial mon:stdio \ -no-reboot \ & diff --git a/userspace/bin/services/telnetd.rs b/userspace/bin/services/telnetd.rs index 84ce7f57..642bdd40 100644 --- a/userspace/bin/services/telnetd.rs +++ b/userspace/bin/services/telnetd.rs @@ -1,214 +1,72 @@ -//! Telnet server using PTY infrastructure +//! Telnet server - spawns a shell per connection //! //! - Binds to port 2323 -//! - Accepts a single TCP connection -//! - Creates a PTY pair and forks a shell attached to the slave -//! - Relays data between socket and PTY master using poll() +//! - Accepts TCP connections +//! - Forks a child with stdin/stdout/stderr redirected to the socket +//! - Child execs init_shell which communicates directly over TCP #![no_std] #![no_main] use core::panic::PanicInfo; -use libbreenix::fs; -use libbreenix::io::{self, close, dup2, poll, PollFd}; -use libbreenix::io::poll_events::{POLLERR, POLLHUP, POLLIN, POLLNVAL}; +use libbreenix::io::{self, close, dup2}; use libbreenix::process; -use libbreenix::pty; -use libbreenix::signal; use libbreenix::socket::{accept, bind, listen, socket, SockAddrIn, AF_INET, SOCK_STREAM}; use libbreenix::types::fd; const TELNET_PORT: u16 = 2323; -const EAGAIN: i64 = -11; const SHELL_PATH: &[u8] = b"/bin/init_shell\0"; -/// Write all data to fd, handling partial writes -fn write_all(file: u64, buf: &[u8]) -> i64 { - let mut offset = 0usize; - while offset < buf.len() { - let n = io::write(file, &buf[offset..]); - if n <= 0 { - return n; - } - offset += n as usize; - } - offset as i64 -} - -/// Open the PTY slave device given the path buffer from openpty() -fn open_slave(slave_path: &[u8; 32]) -> Option { - // Find the null terminator to get path length - let path_bytes = pty::slave_path_bytes(slave_path); - - // Create a null-terminated string for open() - // Path format is "/dev/pts/N" where N is 0-255, so max 12 chars + null - let mut path_buf = [0u8; 16]; - let path_len = path_bytes.len().min(path_buf.len() - 1); - path_buf[..path_len].copy_from_slice(&path_bytes[..path_len]); - path_buf[path_len] = 0; - - // Convert to &str for open() - let path_str = match core::str::from_utf8(&path_buf[..path_len + 1]) { - Ok(s) => s, - Err(_) => return None, - }; - - match fs::open(path_str, fs::O_RDWR) { - Ok(fd) => Some(fd), - Err(_) => None, - } -} - -/// Terminate the child process gracefully -fn terminate_child(pid: i32) { - // Try SIGTERM first - let _ = signal::kill(pid, signal::SIGTERM); - - // Wait a bit for graceful shutdown - for _ in 0..100 { - let status = process::waitpid(pid, core::ptr::null_mut(), process::WNOHANG); - if status == pid as i64 { - return; - } - process::yield_now(); - } - - // Force kill if still alive - let _ = signal::kill(pid, signal::SIGKILL); - - // Wait for termination - for _ in 0..100 { - let status = process::waitpid(pid, core::ptr::null_mut(), process::WNOHANG); - if status == pid as i64 { - return; - } - process::yield_now(); - } -} - -/// Handle a single telnet connection +/// Handle a single telnet connection by forking a shell fn handle_connection(client_fd: i32) { - // Create PTY pair - let (master_fd, slave_path) = match pty::openpty() { - Ok(pair) => pair, - Err(_) => { - io::print("TELNETD_ERROR: openpty failed\n"); - close(client_fd as u64); - return; - } - }; - - io::print("TELNETD_PTY_CREATED\n"); - // Fork child process for shell let pid = process::fork(); if pid == 0 { - // Child process: set up PTY slave as stdin/stdout/stderr and exec shell - close(master_fd as u64); - close(client_fd as u64); - - // Open the PTY slave - let slave_fd = match open_slave(&slave_path) { - Some(fd) => fd, - None => { - io::print("TELNETD_ERROR: open slave failed\n"); - process::exit(1); - } - }; + // Child process: redirect stdin/stdout/stderr to the TCP socket + let cfd = client_fd as u64; - // Redirect stdin/stdout/stderr to PTY slave - if dup2(slave_fd, fd::STDIN) < 0 { + // Redirect stdin/stdout/stderr to the TCP socket + if dup2(cfd, fd::STDIN) < 0 { process::exit(1); } - if dup2(slave_fd, fd::STDOUT) < 0 { + if dup2(cfd, fd::STDOUT) < 0 { process::exit(1); } - if dup2(slave_fd, fd::STDERR) < 0 { + if dup2(cfd, fd::STDERR) < 0 { process::exit(1); } - close(slave_fd); + + // Close the original client_fd (now duplicated to 0/1/2) + if client_fd as u64 > fd::STDERR { + close(cfd); + } // Execute shell let argv: [*const u8; 2] = [SHELL_PATH.as_ptr(), core::ptr::null()]; let _ = process::execv(SHELL_PATH, argv.as_ptr()); // If exec fails + io::print("TELNETD_ERROR: exec failed\n"); process::exit(127); } if pid < 0 { io::print("TELNETD_ERROR: fork failed\n"); - close(master_fd as u64); close(client_fd as u64); return; } io::print("TELNETD_SHELL_FORKED\n"); - // Parent: relay data between socket and PTY master - let mut fds = [ - PollFd::new(client_fd, POLLIN), - PollFd::new(master_fd, POLLIN), - ]; - - let mut buf = [0u8; 1024]; - loop { - // Clear revents before polling - for pfd in &mut fds { - pfd.revents = 0; - } - - let ready = poll(&mut fds, 1000); // 1 second timeout - - if ready < 0 { - break; - } - - // Check for errors on client socket - if fds[0].revents & (POLLERR | POLLHUP | POLLNVAL) != 0 { - break; - } - - // Check for errors on PTY master - if fds[1].revents & (POLLERR | POLLHUP | POLLNVAL) != 0 { - break; - } - - // Read from client, write to PTY master - if fds[0].revents & POLLIN != 0 { - let n = io::read(client_fd as u64, &mut buf); - if n > 0 { - if write_all(master_fd as u64, &buf[..n as usize]) < 0 { - break; - } - } else if n == 0 { - break; // Client disconnected - } else if n != EAGAIN { - break; - } - } - - // Read from PTY master, write to client - if fds[1].revents & POLLIN != 0 { - let n = io::read(master_fd as u64, &mut buf); - if n > 0 { - if write_all(client_fd as u64, &buf[..n as usize]) < 0 { - break; - } - } else if n == 0 { - break; // PTY closed - } else if n != EAGAIN { - break; - } - } - } + // Parent: close client fd (child owns it now) and return to accept more + close(client_fd as u64); - io::print("TELNETD_RELAY_DONE\n"); + // Wait for child to finish (blocking) + let mut status: i32 = 0; + let _ = process::waitpid(pid as i32, &mut status as *mut i32, 0); - close(master_fd as u64); - close(client_fd as u64); - terminate_child(pid as i32); + io::print("TELNETD_SESSION_ENDED\n"); } #[no_mangle] @@ -245,7 +103,6 @@ pub extern "C" fn _start() -> ! { Ok(client_fd) => { io::print("TELNETD_CONNECTED\n"); handle_connection(client_fd); - // After connection closes, continue accepting io::print("TELNETD_LISTENING\n"); } Err(_) => { @@ -255,7 +112,6 @@ pub extern "C" fn _start() -> ! { } } - // Unreachable in daemon mode, but keep for completeness #[allow(unreachable_code)] { io::print("TELNETD_SHUTDOWN\n"); diff --git a/userspace/examples/init_shell.rs b/userspace/examples/init_shell.rs index 10623614..b9a932dc 100644 --- a/userspace/examples/init_shell.rs +++ b/userspace/examples/init_shell.rs @@ -22,7 +22,7 @@ use core::sync::atomic::{AtomicBool, Ordering}; use libbreenix::fs::{access, open, O_DIRECTORY, O_RDONLY, O_WRONLY, X_OK}; use libbreenix::io::{close, dup2, pipe, print, println, read, write}; use libbreenix::process::{ - chdir, exec, execv, fork, getcwd, getpgrp, setpgid, waitpid, wexitstatus, wifexited, + chdir, exec, execv, fork, getcwd, getpgrp, getpid, setpgid, waitpid, wexitstatus, wifexited, wifsignaled, wifstopped, yield_now, WNOHANG, WUNTRACED, }; use libbreenix::signal::{kill, sigaction, Sigaction, SIGCHLD, SIGCONT, SIGINT}; @@ -1370,12 +1370,22 @@ fn read_line() -> Option<&'static str> { let mut c = [0u8; 1]; let n = read(STDIN, &mut c); - if n == -EAGAIN || n == 0 { + if n == -EAGAIN { // No data available - yield and retry yield_now(); continue; } + if n == 0 { + // EOF on stdin - if we're not PID 1 (e.g., running over telnet), + // exit gracefully. PID 1 (init) must never exit. + if getpid() != 1 { + libbreenix::process::exit(0); + } + yield_now(); + continue; + } + if n < 0 { // Other error (possibly EINTR from signal) - yield and retry yield_now(); From 3f0101366636828745fb22fe6112856c7e239e44 Mon Sep 17 00:00:00 2001 From: Ryan Breen Date: Fri, 6 Feb 2026 10:15:13 -0500 Subject: [PATCH 2/2] fix(arm64): fix SP_EL1 corruption on IRQ context switch to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IRQ return path (boot.S) restores SP from user_rsp_scratch before ERET: `mov sp, [user_rsp_scratch]; eret`. When a context switch routes through restore_userspace_context_arm64 or setup_first_userspace_entry_arm64, neither function updated user_rsp_scratch. This left SP_EL1 pointing to the *previous* thread's kernel stack after ERET to EL0. When the next timer interrupt fired from EL0, the IRQ handler's `sub sp, sp, #272` allocated its exception frame on the wrong kernel stack, corrupting memory — including other threads' SVC exception frames. This caused ELR fields to be overwritten with 0, leading to instruction aborts at address 0x0 when returning to userspace. Fix: set user_rsp_scratch = kernel_stack_top in both functions, ensuring SP_EL1 is correct for the next interrupt from EL0. Additional changes in this commit: - Add PROCESS tracing provider (0x06xx) with lock-free trace events for fork, exec, CoW faults, data aborts, stack mapping, and process exit - Replace serial_println! with lock-free trace events in CoW fault handler, data abort handler, fork/exec paths, and stack mapping - Add FORK_TOTAL, EXEC_TOTAL, COW_FAULT_TOTAL counters - Fix fork child x0=0: ARM64 syscall0 uses lateout("x0") so x0 is undefined at SVC entry; forked child must explicitly get x0=0 - Improve instruction abort handler: identify crashing process via TTBR0, terminate with SIGSEGV instead of hanging - Improve init_shell panic handler to print file:line location Co-Authored-By: Claude Opus 4.6 --- .../src/arch_impl/aarch64/context_switch.rs | 16 +- kernel/src/arch_impl/aarch64/exception.rs | 108 ++++--- kernel/src/memory/process_memory.rs | 61 +--- kernel/src/process/manager.rs | 24 +- kernel/src/tracing/core.rs | 12 + kernel/src/tracing/output.rs | 23 ++ kernel/src/tracing/providers/counters.rs | 82 ++++- kernel/src/tracing/providers/mod.rs | 10 +- kernel/src/tracing/providers/process.rs | 288 ++++++++++++++++++ userspace/examples/init_shell.rs | 27 +- 10 files changed, 535 insertions(+), 116 deletions(-) create mode 100644 kernel/src/tracing/providers/process.rs diff --git a/kernel/src/arch_impl/aarch64/context_switch.rs b/kernel/src/arch_impl/aarch64/context_switch.rs index b5d0518e..111e930d 100644 --- a/kernel/src/arch_impl/aarch64/context_switch.rs +++ b/kernel/src/arch_impl/aarch64/context_switch.rs @@ -580,6 +580,15 @@ fn restore_userspace_context_arm64(thread_id: u64, frame: &mut Aarch64ExceptionF // Switch TTBR0 if needed for different address space switch_ttbr0_if_needed(thread_id); + + // CRITICAL: Set user_rsp_scratch to this thread's kernel stack top. + // The IRQ return path (boot.S) uses `mov sp, [user_rsp_scratch]` before ERET. + // Without this, SP_EL1 retains the switching-out thread's stack pointer, + // causing the next IRQ from EL0 to allocate its exception frame on the + // wrong kernel stack — corrupting memory and other threads' SVC frames. + unsafe { + Aarch64PerCpu::set_user_rsp_scratch(Aarch64PerCpu::kernel_stack_top()); + } } /// Set up exception frame for first entry to userspace. @@ -644,7 +653,12 @@ fn setup_first_userspace_entry_arm64(thread_id: u64, frame: &mut Aarch64Exceptio // Switch TTBR0 for this thread's address space switch_ttbr0_if_needed(thread_id); - // NOTE: No logging - context switch path must be lock-free + // CRITICAL: Set user_rsp_scratch to this thread's kernel stack top. + // Same as restore_userspace_context_arm64 — the IRQ return path uses + // user_rsp_scratch for SP after ERET. Without this, SP_EL1 is wrong. + unsafe { + Aarch64PerCpu::set_user_rsp_scratch(Aarch64PerCpu::kernel_stack_top()); + } } /// Switch TTBR0_EL1 if the thread requires a different address space. diff --git a/kernel/src/arch_impl/aarch64/exception.rs b/kernel/src/arch_impl/aarch64/exception.rs index 452b1167..deb78dec 100644 --- a/kernel/src/arch_impl/aarch64/exception.rs +++ b/kernel/src/arch_impl/aarch64/exception.rs @@ -81,10 +81,7 @@ pub extern "C" fn handle_sync_exception(frame: *mut Aarch64ExceptionFrame, esr: // Not a CoW fault or couldn't be handled let frame_ref = unsafe { &mut *frame }; - crate::serial_println!("[exception] Data abort at address {:#x}", far); - crate::serial_println!(" ELR: {:#x}, ESR: {:#x}", frame_ref.elr, esr); - crate::serial_println!(" ISS: {:#x} (WnR={}, DFSC={:#x})", - iss, (iss >> 6) & 1, iss & 0x3F); + let dfsc = (iss & 0x3F) as u16; // Check if from userspace (EL0) - SPSR[3:0] indicates source EL let from_el0 = (frame_ref.spsr & 0xF) == 0; @@ -92,31 +89,27 @@ pub extern "C" fn handle_sync_exception(frame: *mut Aarch64ExceptionFrame, esr: unsafe { core::arch::asm!("mrs {}, ttbr0_el1", out(reg) ttbr0, options(nomem, nostack)); } - crate::serial_println!( - "[DATA_ABORT] TTBR0_EL1={:#x}, FAR={:#x}, ESR={:#x}", - ttbr0, - far, - esr - ); + + // Lock-free trace: data abort event + crate::tracing::providers::process::trace_data_abort(0, dfsc); + + // One condensed serial_println for fatal crash visibility + crate::serial_println!("[DATA_ABORT] FAR={:#x} ESR={:#x} DFSC={:#x}", far, esr, dfsc); if from_el0 { // From userspace - terminate the process with SIGSEGV - crate::serial_println!("[exception] Terminating userspace process with SIGSEGV"); - // Get current TTBR0 to find the process let page_table_phys = ttbr0 & !0xFFFF_0000_0000_0FFF; // Find and terminate the process let mut terminated = false; crate::process::with_process_manager(|pm| { - if let Some((pid, process)) = pm.find_process_by_cr3_mut(page_table_phys) { - let name = process.name.clone(); - crate::serial_println!("[exception] Killing process {} (PID {}) due to data abort", - name, pid.as_u64()); + if let Some((pid, _process)) = pm.find_process_by_cr3_mut(page_table_phys) { + crate::tracing::providers::process::trace_process_exit(pid.as_u64() as u16, (-11i16) as u16); pm.exit_process(pid, -11); // SIGSEGV exit code terminated = true; } else { - crate::serial_println!("[exception] Could not find process with TTBR0={:#x}", page_table_phys); + // trace_data_abort already captured the fault } }); @@ -142,10 +135,51 @@ pub extern "C" fn handle_sync_exception(frame: *mut Aarch64ExceptionFrame, esr: } exception_class::INSTRUCTION_ABORT_LOWER | exception_class::INSTRUCTION_ABORT_SAME => { - let frame = unsafe { &*frame }; - crate::serial_println!("[exception] Instruction abort at address {:#x}", far); - crate::serial_println!(" ELR: {:#x}, ESR: {:#x}", frame.elr, esr); - // For now, hang + let frame_ref = unsafe { &mut *frame }; + let ifsc = (iss & 0x3F) as u16; + let from_el0 = (frame_ref.spsr & 0xF) == 0; + + let ttbr0: u64; + unsafe { + core::arch::asm!("mrs {}, ttbr0_el1", out(reg) ttbr0, options(nomem, nostack)); + } + + crate::serial_println!( + "[INSTRUCTION_ABORT] FAR={:#x} ELR={:#x} ESR={:#x} IFSC={:#x} TTBR0={:#x} from_el0={}", + far, frame_ref.elr, esr, ifsc, ttbr0, from_el0 + ); + + if from_el0 { + // From userspace - terminate the process with SIGSEGV + let page_table_phys = ttbr0 & !0xFFFF_0000_0000_0FFF; + + let mut terminated = false; + crate::process::with_process_manager(|pm| { + if let Some((pid, _process)) = pm.find_process_by_cr3_mut(page_table_phys) { + crate::serial_println!( + "[INSTRUCTION_ABORT] Terminating PID {} (SIGSEGV)", + pid.as_u64() + ); + crate::tracing::providers::process::trace_process_exit( + pid.as_u64() as u16, + (-11i16) as u16, + ); + pm.exit_process(pid, -11); // SIGSEGV + terminated = true; + } + }); + + if terminated { + crate::task::scheduler::set_need_resched(); + crate::task::scheduler::switch_to_idle(); + frame_ref.elr = + crate::arch_impl::aarch64::idle_loop_arm64 as *const () as u64; + frame_ref.spsr = 0x3c5; // EL1h, interrupts enabled + return; + } + } + + // From kernel or couldn't terminate - hang loop { unsafe { core::arch::asm!("wfi"); } } } @@ -515,12 +549,8 @@ fn handle_cow_fault_arm64(far: u64, iss: u32) -> bool { // Mask off ASID to get physical address let page_table_phys = ttbr0 & !0xFFFF_0000_0000_0FFF; - crate::serial_println!( - "[COW ARM64] fault at {:#x}, ttbr0={:#x}, pt_phys={:#x}", - far, - ttbr0, - page_table_phys - ); + // Lock-free trace: CoW fault entry (pid unknown yet, page index from far) + crate::tracing::providers::process::trace_cow_fault(0, (far >> 12) as u16); // Try to acquire process manager lock match crate::process::try_manager() { @@ -535,7 +565,6 @@ fn handle_cow_fault_arm64(far: u64, iss: u32) -> bool { let (_pid, process) = match pm.find_process_by_cr3_mut(page_table_phys) { Some(p) => p, None => { - crate::serial_println!("[COW] No process found for TTBR0"); return false; } }; @@ -551,23 +580,17 @@ fn handle_cow_fault_arm64(far: u64, iss: u32) -> bool { let (old_frame, old_flags) = match page_table.get_page_info(page) { Some(info) => info, None => { - crate::serial_println!("[COW] No page info for {:#x}", far); return false; } }; // Check if this is a CoW page if !is_cow_page(old_flags) { - crate::serial_println!("[COW] Not a CoW page"); return false; } - crate::serial_println!( - "[COW] Handling page {:#x}, frame={:#x}, shared={}", - far, - old_frame.start_address().as_u64(), - frame_is_shared(old_frame) - ); + // Lock-free trace: CoW handling with known PID + crate::tracing::providers::process::trace_cow_fault(_pid.as_u64() as u16, (far >> 12) as u16); // If we're the sole owner, just make it writable if !frame_is_shared(old_frame) { @@ -588,7 +611,7 @@ fn handle_cow_fault_arm64(far: u64, iss: u32) -> bool { ); } cow_stats::SOLE_OWNER_OPT.fetch_add(1, core::sync::atomic::Ordering::Relaxed); - crate::serial_println!("[COW] Made sole-owner page writable"); + crate::tracing::providers::process::trace_cow_copy(_pid.as_u64() as u16, (far >> 12) as u16); return true; } @@ -596,7 +619,6 @@ fn handle_cow_fault_arm64(far: u64, iss: u32) -> bool { let new_frame = match allocate_frame() { Some(f) => f, None => { - crate::serial_println!("[COW] Failed to allocate frame"); return false; } }; @@ -613,11 +635,9 @@ fn handle_cow_fault_arm64(far: u64, iss: u32) -> bool { // Unmap old page and map new one with write permissions let new_flags = make_private_flags(old_flags); if page_table.unmap_page(page).is_err() { - crate::serial_println!("[COW] Failed to unmap old page"); return false; } if page_table.map_page(page, new_frame, new_flags).is_err() { - crate::serial_println!("[COW] Failed to map new page"); return false; } @@ -638,17 +658,13 @@ fn handle_cow_fault_arm64(far: u64, iss: u32) -> bool { } cow_stats::PAGES_COPIED.fetch_add(1, core::sync::atomic::Ordering::Relaxed); - crate::serial_println!( - "[COW] Copied page from {:#x} to {:#x}", - old_frame.start_address().as_u64(), - new_frame.start_address().as_u64() - ); + crate::tracing::providers::process::trace_cow_copy(_pid.as_u64() as u16, (far >> 12) as u16); true } None => { cow_stats::DIRECT_PATH.fetch_add(1, core::sync::atomic::Ordering::Relaxed); - crate::serial_println!("[COW] Manager lock held, cannot handle"); + crate::tracing::providers::process::trace_cow_lock_fail(0); false } } diff --git a/kernel/src/memory/process_memory.rs b/kernel/src/memory/process_memory.rs index cb579699..d7ec5b30 100644 --- a/kernel/src/memory/process_memory.rs +++ b/kernel/src/memory/process_memory.rs @@ -383,7 +383,6 @@ impl ProcessPageTable { Some(frame) => { let frame_addr = frame.start_address().as_u64(); log::debug!("Successfully allocated frame: {:#x}", frame_addr); - crate::serial_println!("🔍 ProcessPageTable::new() allocated L4 frame={:#x}", frame_addr); // Check for problematic frames if frame_addr == 0x611000 { @@ -404,8 +403,6 @@ impl ProcessPageTable { "Allocated L4 frame: {:#x}", level_4_frame.start_address().as_u64() ); - crate::serial_println!("🔍 ProcessPageTable will use L4 frame={:#x}", level_4_frame.start_address().as_u64()); - // Get physical memory offset let phys_offset = crate::memory::physical_memory_offset(); @@ -2126,20 +2123,11 @@ pub fn map_user_stack_to_process( { // ARM64: user addresses live in TTBR0 and must be mapped directly into // the process page table, not copied from the kernel (TTBR1) mappings. - crate::serial_println!( - "map_user_stack_to_process [ARM64]: enter mapping {:#x} - {:#x}", - start_page.start_address().as_u64(), - end_page.start_address().as_u64() - ); let flags = PageTableFlags::PRESENT | PageTableFlags::WRITABLE | PageTableFlags::USER_ACCESSIBLE; for page in Page::range_inclusive(start_page, end_page) { - crate::serial_println!( - "map_user_stack_to_process [ARM64]: processing page {:#x}", - page.start_address().as_u64() - ); if let Some(existing_frame) = process_page_table.translate_page(page.start_address()) { let existing_frame = PhysFrame::::containing_address(existing_frame); log::trace!( @@ -2147,26 +2135,13 @@ pub fn map_user_stack_to_process( page.start_address().as_u64(), existing_frame.start_address().as_u64() ); - crate::serial_println!( - "map_user_stack_to_process [ARM64]: page already mapped to frame {:#x}", - existing_frame.start_address().as_u64() - ); mapped_pages += 1; continue; } let frame = match allocate_frame() { - Some(frame) => { - crate::serial_println!( - "map_user_stack_to_process [ARM64]: allocated frame {:#x}", - frame.start_address().as_u64() - ); - frame - } + Some(frame) => frame, None => { - crate::serial_println!( - "map_user_stack_to_process [ARM64]: allocate_frame failed (OOM)" - ); return Err("Out of memory for user stack"); } }; @@ -2178,11 +2153,6 @@ pub fn map_user_stack_to_process( page.start_address().as_u64(), frame.start_address().as_u64() ); - crate::serial_println!( - "map_user_stack_to_process [ARM64]: map_page ok {:#x} -> {:#x}", - page.start_address().as_u64(), - frame.start_address().as_u64() - ); } Err(e) => { log::error!( @@ -2190,12 +2160,6 @@ pub fn map_user_stack_to_process( page.start_address().as_u64(), e ); - crate::serial_println!( - "map_user_stack_to_process [ARM64]: map_page FAILED {:#x} -> {:#x}: {}", - page.start_address().as_u64(), - frame.start_address().as_u64(), - e - ); return Err("Failed to map user stack page"); } } @@ -2233,14 +2197,6 @@ pub fn map_user_stack_to_process_with_phys( let stack_size = user_stack_top.as_u64() - user_stack_bottom.as_u64(); let num_pages = stack_size / 4096; - crate::serial_println!( - "map_user_stack_to_process_with_phys: user {:#x}-{:#x}, phys {:#x}, {} pages", - user_stack_bottom.as_u64(), - user_stack_top.as_u64(), - phys_bottom, - num_pages - ); - let flags = PageTableFlags::PRESENT | PageTableFlags::WRITABLE | PageTableFlags::USER_ACCESSIBLE; @@ -2252,13 +2208,6 @@ pub fn map_user_stack_to_process_with_phys( let page = Page::::containing_address(user_vaddr); let frame = PhysFrame::::containing_address(phys_addr); - crate::serial_println!( - " page {}: user {:#x} -> phys {:#x}", - i, - user_vaddr.as_u64(), - phys_addr.as_u64() - ); - match process_page_table.map_page(page, frame, flags) { Ok(()) => { log::trace!( @@ -2268,8 +2217,8 @@ pub fn map_user_stack_to_process_with_phys( ); } Err(e) => { - crate::serial_println!( - " FAILED to map page {:#x} -> {:#x}: {}", + log::error!( + "Failed to map page {:#x} -> {:#x}: {}", user_vaddr.as_u64(), phys_addr.as_u64(), e @@ -2279,9 +2228,5 @@ pub fn map_user_stack_to_process_with_phys( } } - crate::serial_println!( - "map_user_stack_to_process_with_phys: mapped {} pages successfully", - num_pages - ); Ok(()) } diff --git a/kernel/src/process/manager.rs b/kernel/src/process/manager.rs index 5dba7c1a..81a0c2c2 100644 --- a/kernel/src/process/manager.rs +++ b/kernel/src/process/manager.rs @@ -1294,6 +1294,9 @@ impl ProcessManager { parent_context: crate::task::thread::CpuContext, mut child_page_table: Box, ) -> Result { + // Lock-free trace: fork entry + crate::tracing::providers::process::trace_fork_entry(parent_pid.as_u64() as u32); + // Get the parent process info let (parent_name, parent_entry_point, parent_pgid, parent_sid, parent_cwd, parent_thread_info, parent_heap_start, parent_heap_end) = { let parent = self @@ -1433,12 +1436,7 @@ impl ProcessManager { log::error!("ARM64 fork: Failed to map user stack: {}", e); "Failed to map user stack in child's page table" })?; - crate::serial_println!( - "ARM64 fork: child stack mapped {:#x}-{:#x} in child page table (L4 frame={:#x})", - child_stack_bottom.as_u64(), - child_stack_top.as_u64(), - child_page_table_ref.level_4_frame().start_address().as_u64() - ); + crate::tracing::providers::process::trace_stack_map(child_pid.as_u64() as u32); // Allocate a globally unique thread ID for the child's main thread let child_thread_id = crate::task::thread::allocate_thread_id(); @@ -1492,6 +1490,11 @@ impl ProcessManager { child_thread.context = parent_context.clone(); + // CRITICAL: Fork returns 0 to child. The parent_context captured x0 at + // SVC entry time, which is undefined (ARM64 syscall0 uses lateout("x0")). + // Without this, the child sees a random x0 and doesn't enter the child branch. + child_thread.context.x0 = 0; + // CRITICAL: Set has_started=true for forked children child_thread.has_started = true; @@ -1657,6 +1660,9 @@ impl ProcessManager { child_pid.as_u64() ); + // Lock-free trace: fork exit with child PID + crate::tracing::providers::process::trace_fork_exit(child_pid.as_u64() as u32); + Ok(child_pid) } @@ -2995,6 +3001,9 @@ impl ProcessManager { use crate::arch_impl::aarch64::constants::USER_STACK_REGION_START; use crate::memory::arch_stub::{Page, PageTableFlags, Size4KiB}; + // Lock-free trace: exec entry + crate::tracing::providers::process::trace_exec_entry(pid.as_u64() as u32); + log::info!( "exec_process [ARM64]: Replacing process {} with new program", pid.as_u64() @@ -3226,6 +3235,9 @@ impl ProcessManager { ); } + // Lock-free trace: exec exit + crate::tracing::providers::process::trace_exec_exit(pid.as_u64() as u32); + Ok(new_entry_point) } diff --git a/kernel/src/tracing/core.rs b/kernel/src/tracing/core.rs index aa3cea8d..e4c70615 100644 --- a/kernel/src/tracing/core.rs +++ b/kernel/src/tracing/core.rs @@ -163,6 +163,18 @@ impl TraceEventType { pub const LOCK_RELEASE: u16 = 0x0501; pub const LOCK_CONTEND: u16 = 0x0502; + // Process events (0x06xx) + pub const FORK_ENTRY: u16 = 0x0600; + pub const FORK_EXIT: u16 = 0x0601; + pub const EXEC_ENTRY: u16 = 0x0602; + pub const EXEC_EXIT: u16 = 0x0603; + pub const COW_FAULT: u16 = 0x0604; + pub const COW_COPY: u16 = 0x0605; + pub const STACK_MAP: u16 = 0x0606; + pub const DATA_ABORT: u16 = 0x0607; + pub const PROCESS_EXIT: u16 = 0x0608; + pub const COW_LOCK_FAIL: u16 = 0x0609; + // Debug markers (0xFFxx) pub const MARKER_A: u16 = 0xFF00; pub const MARKER_B: u16 = 0xFF01; diff --git a/kernel/src/tracing/output.rs b/kernel/src/tracing/output.rs index 342f53cf..e762ce16 100644 --- a/kernel/src/tracing/output.rs +++ b/kernel/src/tracing/output.rs @@ -281,6 +281,18 @@ pub fn event_type_name(event_type: u16) -> &'static str { TraceEventType::LOCK_RELEASE => "LOCK_RELEASE", TraceEventType::LOCK_CONTEND => "LOCK_CONTEND", + // Process events (0x06xx) + TraceEventType::FORK_ENTRY => "FORK_ENTRY", + TraceEventType::FORK_EXIT => "FORK_EXIT", + TraceEventType::EXEC_ENTRY => "EXEC_ENTRY", + TraceEventType::EXEC_EXIT => "EXEC_EXIT", + TraceEventType::COW_FAULT => "COW_FAULT", + TraceEventType::COW_COPY => "COW_COPY", + TraceEventType::STACK_MAP => "STACK_MAP", + TraceEventType::DATA_ABORT => "DATA_ABORT", + TraceEventType::PROCESS_EXIT => "PROCESS_EXIT", + TraceEventType::COW_LOCK_FAIL => "COW_LOCK_FAIL", + // Debug markers (0xFFxx) TraceEventType::MARKER_A => "MARKER_A", TraceEventType::MARKER_B => "MARKER_B", @@ -315,6 +327,16 @@ fn payload_description(event_type: u16) -> &'static str { TraceEventType::LOCK_ACQUIRE => "lock_id", TraceEventType::LOCK_RELEASE => "lock_id", TraceEventType::LOCK_CONTEND => "lock_id", + TraceEventType::FORK_ENTRY => "parent_pid", + TraceEventType::FORK_EXIT => "child_pid", + TraceEventType::EXEC_ENTRY => "pid", + TraceEventType::EXEC_EXIT => "pid", + TraceEventType::COW_FAULT => "pid<<16|page_idx", + TraceEventType::COW_COPY => "pid<<16|page_idx", + TraceEventType::STACK_MAP => "pid", + TraceEventType::DATA_ABORT => "pid<<16|dfsc", + TraceEventType::PROCESS_EXIT => "pid<<16|exit_code", + TraceEventType::COW_LOCK_FAIL => "pid", TraceEventType::MARKER_A => "user_data", TraceEventType::MARKER_B => "user_data", TraceEventType::MARKER_C => "user_data", @@ -719,6 +741,7 @@ pub fn dump_event_summary() { (0x03, "Syscall"), (0x04, "Memory"), (0x05, "Lock"), + (0x06, "Process"), (0xFF, "Marker/Debug"), ]; diff --git a/kernel/src/tracing/providers/counters.rs b/kernel/src/tracing/providers/counters.rs index 76a426a7..9db8371e 100644 --- a/kernel/src/tracing/providers/counters.rs +++ b/kernel/src/tracing/providers/counters.rs @@ -87,6 +87,42 @@ pub static TIMER_TICK_TOTAL: TraceCounter = TraceCounter::new( "Total timer tick interrupts", ); +/// Total fork operations across all CPUs. +/// +/// Incremented at fork entry, before the fork is performed. +/// Use `aggregate()` to get the total count across all CPUs. +/// +/// GDB: `print FORK_TOTAL` +#[no_mangle] +pub static FORK_TOTAL: TraceCounter = TraceCounter::new( + "FORK_TOTAL", + "Total fork operations", +); + +/// Total exec operations across all CPUs. +/// +/// Incremented at exec entry, before the exec is performed. +/// Use `aggregate()` to get the total count across all CPUs. +/// +/// GDB: `print EXEC_TOTAL` +#[no_mangle] +pub static EXEC_TOTAL: TraceCounter = TraceCounter::new( + "EXEC_TOTAL", + "Total exec operations", +); + +/// Total CoW fault operations across all CPUs. +/// +/// Incremented when a copy-on-write fault is triggered. +/// Use `aggregate()` to get the total count across all CPUs. +/// +/// GDB: `print COW_FAULT_TOTAL` +#[no_mangle] +pub static COW_FAULT_TOTAL: TraceCounter = TraceCounter::new( + "COW_FAULT_TOTAL", + "Total CoW fault operations", +); + // ============================================================================= // Initialization // ============================================================================= @@ -99,9 +135,12 @@ pub fn init() { register_counter(&IRQ_TOTAL); register_counter(&CTX_SWITCH_TOTAL); register_counter(&TIMER_TICK_TOTAL); + register_counter(&FORK_TOTAL); + register_counter(&EXEC_TOTAL); + register_counter(&COW_FAULT_TOTAL); log::info!( - "Tracing counters initialized: SYSCALL_TOTAL, IRQ_TOTAL, CTX_SWITCH_TOTAL, TIMER_TICK_TOTAL" + "Tracing counters initialized: SYSCALL_TOTAL, IRQ_TOTAL, CTX_SWITCH_TOTAL, TIMER_TICK_TOTAL, FORK_TOTAL, EXEC_TOTAL, COW_FAULT_TOTAL" ); } @@ -145,6 +184,33 @@ pub fn count_timer_tick() { TIMER_TICK_TOTAL.increment(); } +/// Increment the fork counter. +/// +/// This is an inline function for use in the fork path. +/// Compiles to a single atomic add instruction. +#[inline(always)] +pub fn count_fork() { + FORK_TOTAL.increment(); +} + +/// Increment the exec counter. +/// +/// This is an inline function for use in the exec path. +/// Compiles to a single atomic add instruction. +#[inline(always)] +pub fn count_exec() { + EXEC_TOTAL.increment(); +} + +/// Increment the CoW fault counter. +/// +/// This is an inline function for use in the CoW fault handler. +/// Compiles to a single atomic add instruction. +#[inline(always)] +pub fn count_cow_fault() { + COW_FAULT_TOTAL.increment(); +} + /// Get all counter values as a summary. /// /// Returns a tuple of (syscall_total, irq_total, ctx_switch_total, timer_tick_total). @@ -157,6 +223,17 @@ pub fn get_all_counters() -> (u64, u64, u64, u64) { ) } +/// Get process-related counter values. +/// +/// Returns a tuple of (fork_total, exec_total, cow_fault_total). +pub fn get_process_counters() -> (u64, u64, u64) { + ( + FORK_TOTAL.aggregate(), + EXEC_TOTAL.aggregate(), + COW_FAULT_TOTAL.aggregate(), + ) +} + /// Reset all built-in counters to zero. /// /// This is not atomic across counters - some increments may be @@ -166,4 +243,7 @@ pub fn reset_all() { IRQ_TOTAL.reset(); CTX_SWITCH_TOTAL.reset(); TIMER_TICK_TOTAL.reset(); + FORK_TOTAL.reset(); + EXEC_TOTAL.reset(); + COW_FAULT_TOTAL.reset(); } diff --git a/kernel/src/tracing/providers/mod.rs b/kernel/src/tracing/providers/mod.rs index 26713705..f0f1328b 100644 --- a/kernel/src/tracing/providers/mod.rs +++ b/kernel/src/tracing/providers/mod.rs @@ -28,11 +28,13 @@ pub mod counters; pub mod irq; +pub mod process; pub mod sched; pub mod syscall; // Re-export providers for convenient access pub use irq::IRQ_PROVIDER; +pub use process::PROCESS_PROVIDER; pub use sched::SCHED_PROVIDER; pub use syscall::SYSCALL_PROVIDER; @@ -47,13 +49,15 @@ pub fn init() { syscall::init(); sched::init(); irq::init(); + process::init(); counters::init(); log::info!( - "Tracing providers initialized: syscall={:#x}, sched={:#x}, irq={:#x}", + "Tracing providers initialized: syscall={:#x}, sched={:#x}, irq={:#x}, process={:#x}", syscall::PROVIDER_ID, sched::PROVIDER_ID, - irq::PROVIDER_ID + irq::PROVIDER_ID, + process::PROVIDER_ID ); } @@ -63,6 +67,7 @@ pub fn enable_all() { SYSCALL_PROVIDER.enable_all(); SCHED_PROVIDER.enable_all(); IRQ_PROVIDER.enable_all(); + PROCESS_PROVIDER.enable_all(); } /// Disable all built-in providers. @@ -71,4 +76,5 @@ pub fn disable_all() { SYSCALL_PROVIDER.disable_all(); SCHED_PROVIDER.disable_all(); IRQ_PROVIDER.disable_all(); + PROCESS_PROVIDER.disable_all(); } diff --git a/kernel/src/tracing/providers/process.rs b/kernel/src/tracing/providers/process.rs new file mode 100644 index 00000000..85c76bc5 --- /dev/null +++ b/kernel/src/tracing/providers/process.rs @@ -0,0 +1,288 @@ +//! Process lifecycle trace provider. +//! +//! This provider traces process fork, exec, CoW fault, stack mapping, +//! and data abort events for debugging process management. +//! +//! # Event Types +//! +//! - `FORK_ENTRY` (0x0600): Fork started, payload = parent_pid +//! - `FORK_EXIT` (0x0601): Fork complete, payload = child_pid (0 on failure) +//! - `EXEC_ENTRY` (0x0602): Exec started, payload = pid +//! - `EXEC_EXIT` (0x0603): Exec complete, payload = pid (0 on failure) +//! - `COW_FAULT` (0x0604): CoW fault triggered, payload = packed(pid, page_idx) +//! - `COW_COPY` (0x0605): CoW page copied, payload = packed(pid, page_idx) +//! - `STACK_MAP` (0x0606): Stack mapped for process, payload = pid +//! - `DATA_ABORT` (0x0607): Data abort from EL0, payload = packed(pid, dfsc) +//! - `PROCESS_EXIT` (0x0608): Process exiting, payload = packed(pid, exit_code) +//! - `COW_LOCK_FAIL` (0x0609): CoW handler couldn't acquire manager lock, payload = pid +//! +//! # Usage +//! +//! ```rust,ignore +//! use kernel::tracing::providers::process::{PROCESS_PROVIDER, FORK_ENTRY}; +//! use kernel::trace_event; +//! +//! // Enable all process tracing +//! PROCESS_PROVIDER.enable_all(); +//! +//! // In fork path: +//! trace_event!(PROCESS_PROVIDER, FORK_ENTRY, parent_pid as u32); +//! ``` + +use crate::tracing::provider::{register_provider, TraceProvider}; +use crate::tracing::providers::counters::{COW_FAULT_TOTAL, EXEC_TOTAL, FORK_TOTAL}; +use core::sync::atomic::AtomicU64; + +/// Provider ID for process events (0x06xx range). +pub const PROVIDER_ID: u8 = 0x06; + +/// Process trace provider. +/// +/// GDB: `print PROCESS_PROVIDER` +#[no_mangle] +pub static PROCESS_PROVIDER: TraceProvider = TraceProvider { + name: "process", + id: PROVIDER_ID, + enabled: AtomicU64::new(0), +}; + +// ============================================================================= +// Probe Definitions +// ============================================================================= + +/// Probe ID for fork entry. +pub const PROBE_FORK_ENTRY: u8 = 0x00; + +/// Probe ID for fork exit. +pub const PROBE_FORK_EXIT: u8 = 0x01; + +/// Probe ID for exec entry. +pub const PROBE_EXEC_ENTRY: u8 = 0x02; + +/// Probe ID for exec exit. +pub const PROBE_EXEC_EXIT: u8 = 0x03; + +/// Probe ID for CoW fault. +pub const PROBE_COW_FAULT: u8 = 0x04; + +/// Probe ID for CoW page copy. +pub const PROBE_COW_COPY: u8 = 0x05; + +/// Probe ID for stack mapping. +pub const PROBE_STACK_MAP: u8 = 0x06; + +/// Probe ID for data abort from EL0. +pub const PROBE_DATA_ABORT: u8 = 0x07; + +/// Probe ID for process exit. +pub const PROBE_PROCESS_EXIT: u8 = 0x08; + +/// Probe ID for CoW lock acquisition failure. +pub const PROBE_COW_LOCK_FAIL: u8 = 0x09; + +/// Event type for fork entry. +/// Payload: parent_pid. +pub const FORK_ENTRY: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_FORK_ENTRY as u16); + +/// Event type for fork exit. +/// Payload: child_pid (0 on failure). +pub const FORK_EXIT: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_FORK_EXIT as u16); + +/// Event type for exec entry. +/// Payload: pid. +pub const EXEC_ENTRY: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_EXEC_ENTRY as u16); + +/// Event type for exec exit. +/// Payload: pid (0 on failure). +pub const EXEC_EXIT: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_EXEC_EXIT as u16); + +/// Event type for CoW fault. +/// Payload: packed(pid, page_idx). +pub const COW_FAULT: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_COW_FAULT as u16); + +/// Event type for CoW page copy. +/// Payload: packed(pid, page_idx). +pub const COW_COPY: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_COW_COPY as u16); + +/// Event type for stack mapping. +/// Payload: pid. +pub const STACK_MAP: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_STACK_MAP as u16); + +/// Event type for data abort from EL0. +/// Payload: packed(pid, dfsc). +pub const DATA_ABORT: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_DATA_ABORT as u16); + +/// Event type for process exit. +/// Payload: packed(pid, exit_code). +pub const PROCESS_EXIT: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_PROCESS_EXIT as u16); + +/// Event type for CoW lock acquisition failure. +/// Payload: pid. +pub const COW_LOCK_FAIL: u16 = ((PROVIDER_ID as u16) << 8) | (PROBE_COW_LOCK_FAIL as u16); + +// ============================================================================= +// Initialization +// ============================================================================= + +/// Initialize the process provider. +/// +/// Registers the provider with the global registry. +pub fn init() { + register_provider(&PROCESS_PROVIDER); +} + +// ============================================================================= +// Inline Tracing Functions +// ============================================================================= + +/// Trace fork entry (inline for minimal overhead). +/// +/// Also increments the FORK_TOTAL counter (single atomic add, always runs). +/// +/// # Parameters +/// +/// - `parent_pid`: The PID of the forking parent process +#[inline(always)] +#[allow(dead_code)] +pub fn trace_fork_entry(parent_pid: u32) { + // Always increment the counter (single atomic add, ~3 cycles) + FORK_TOTAL.increment(); + + // Only record trace event if tracing is enabled + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event(FORK_ENTRY, 0, parent_pid); + } +} + +/// Trace fork exit (inline for minimal overhead). +/// +/// # Parameters +/// +/// - `child_pid`: The PID of the newly created child process (0 on failure) +#[inline(always)] +#[allow(dead_code)] +pub fn trace_fork_exit(child_pid: u32) { + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event(FORK_EXIT, 0, child_pid); + } +} + +/// Trace exec entry (inline for minimal overhead). +/// +/// Also increments the EXEC_TOTAL counter (single atomic add, always runs). +/// +/// # Parameters +/// +/// - `pid`: The PID of the process calling exec +#[inline(always)] +#[allow(dead_code)] +pub fn trace_exec_entry(pid: u32) { + // Always increment the counter (single atomic add, ~3 cycles) + EXEC_TOTAL.increment(); + + // Only record trace event if tracing is enabled + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event(EXEC_ENTRY, 0, pid); + } +} + +/// Trace exec exit (inline for minimal overhead). +/// +/// # Parameters +/// +/// - `pid`: The PID of the process (0 on failure) +#[inline(always)] +#[allow(dead_code)] +pub fn trace_exec_exit(pid: u32) { + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event(EXEC_EXIT, 0, pid); + } +} + +/// Trace CoW fault (inline for minimal overhead). +/// +/// Also increments the COW_FAULT_TOTAL counter (single atomic add, always runs). +/// +/// # Parameters +/// +/// - `pid`: The PID of the faulting process +/// - `page_idx`: The page index that triggered the fault +#[inline(always)] +#[allow(dead_code)] +pub fn trace_cow_fault(pid: u16, page_idx: u16) { + // Always increment the counter (single atomic add, ~3 cycles) + COW_FAULT_TOTAL.increment(); + + // Only record trace event if tracing is enabled + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event_2(COW_FAULT, pid, page_idx); + } +} + +/// Trace CoW page copy (inline for minimal overhead). +/// +/// # Parameters +/// +/// - `pid`: The PID of the process +/// - `page_idx`: The page index being copied +#[inline(always)] +#[allow(dead_code)] +pub fn trace_cow_copy(pid: u16, page_idx: u16) { + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event_2(COW_COPY, pid, page_idx); + } +} + +/// Trace stack mapping (inline for minimal overhead). +/// +/// # Parameters +/// +/// - `pid`: The PID of the process whose stack was mapped +#[inline(always)] +#[allow(dead_code)] +pub fn trace_stack_map(pid: u32) { + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event(STACK_MAP, 0, pid); + } +} + +/// Trace data abort from EL0 (inline for minimal overhead). +/// +/// # Parameters +/// +/// - `pid`: The PID of the faulting process +/// - `dfsc`: The Data Fault Status Code +#[inline(always)] +#[allow(dead_code)] +pub fn trace_data_abort(pid: u16, dfsc: u16) { + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event_2(DATA_ABORT, pid, dfsc); + } +} + +/// Trace process exit (inline for minimal overhead). +/// +/// # Parameters +/// +/// - `pid`: The PID of the exiting process +/// - `exit_code`: The exit code of the process +#[inline(always)] +#[allow(dead_code)] +pub fn trace_process_exit(pid: u16, exit_code: u16) { + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event_2(PROCESS_EXIT, pid, exit_code); + } +} + +/// Trace CoW lock acquisition failure (inline for minimal overhead). +/// +/// # Parameters +/// +/// - `pid`: The PID of the process that failed to acquire the lock +#[inline(always)] +#[allow(dead_code)] +pub fn trace_cow_lock_fail(pid: u32) { + if PROCESS_PROVIDER.is_enabled() && crate::tracing::is_enabled() { + crate::tracing::record_event(COW_LOCK_FAIL, 0, pid); + } +} diff --git a/userspace/examples/init_shell.rs b/userspace/examples/init_shell.rs index b9a932dc..4e2395af 100644 --- a/userspace/examples/init_shell.rs +++ b/userspace/examples/init_shell.rs @@ -1970,8 +1970,31 @@ pub extern "C" fn _start() -> ! { } #[panic_handler] -fn panic(_info: &PanicInfo) -> ! { - println("PANIC in init shell!"); +fn panic(info: &PanicInfo) -> ! { + print("PANIC in init shell!"); + if let Some(location) = info.location() { + print(" at "); + print(location.file()); + print(":"); + // Print line number as decimal + let mut line = location.line(); + if line == 0 { + print("0"); + } else { + let mut buf = [0u8; 10]; + let mut i = 10; + while line > 0 { + i -= 1; + buf[i] = b'0' + (line % 10) as u8; + line /= 10; + } + // Safety: buf[i..10] contains valid ASCII digits + if let Ok(s) = core::str::from_utf8(&buf[i..10]) { + print(s); + } + } + } + println(""); // Init cannot exit, so just loop forever loop { core::hint::spin_loop();