From d84d155c1c4291c1cd92b5d0bac0df70011f11fd Mon Sep 17 00:00:00 2001
From: Ryan Breen
Date: Fri, 6 Feb 2026 18:20:32 -0500
Subject: [PATCH] feat(arm64): add init system, fix OOM with proper allocator, fix TTBR0 on syscall resume

Implement ARM64 init system (/sbin/init) that spawns telnetd and init_shell,
replacing direct init_shell boot.

Fix four critical bugs discovered during testing:

1. Heap exhaustion (OOM): Replace bump allocator with linked_list_allocator.
   The bump allocator never reclaimed freed memory, exhausting the 32MB heap
   within seconds of boot as temporary Vec/String/BTreeMap allocations
   permanently consumed space. Also remove unnecessary Box::leak calls in exec
   syscall paths (ARM64 and x86_64) that leaked entire ELF binaries.

2. TTBR0 stale after blocking syscall: When a userspace thread blocked in a
   syscall (e.g., read() on stdin) was context-switched back in,
   setup_kernel_thread_return_arm64 did not restore the process's page table.
   TTBR0 retained the previously-running process's value, causing instruction
   abort permission faults when the thread returned to EL0 with the wrong
   address space.

3. Register clobber fixes in ERET paths: Use per-CPU eret_scratch field to
   save/restore registers across SP switches in both syscall and IRQ return
   paths, preventing x0/x1 corruption (syscall) and x16 corruption (IRQ).

4. TTBR0 restoration after exec: Update saved_process_cr3 after exec switches
   to the new program's page table, preventing the assembly ERET path from
   restoring the old (freed) page table.

Co-Authored-By: Claude Opus 4.6
---
 kernel/Cargo.toml                             |   3 +-
 kernel/src/arch_impl/aarch64/boot.S           |  46 ++++--
 kernel/src/arch_impl/aarch64/constants.rs     |   4 +
 .../src/arch_impl/aarch64/context_switch.rs   |  18 ++-
 kernel/src/arch_impl/aarch64/syscall_entry.S  | 132 ++++++++++++------
 kernel/src/arch_impl/aarch64/syscall_entry.rs |  15 +-
 kernel/src/main_aarch64.rs                    |   6 +-
 kernel/src/memory/heap.rs                     | 104 ++------------
 kernel/src/per_cpu_aarch64.rs                 |  11 +-
 kernel/src/process/manager.rs                 |  30 ++++
 kernel/src/syscall/handlers.rs                |  37 +++--
 kernel/src/syscall/io.rs                      |  12 --
 run.sh                                        |  35 +++++
 scripts/create_ext2_disk.sh                   |   4 +-
 userspace/bin/services/init.rs                | 114 +++++++++++++++
 userspace/tests/Cargo.toml                    |   4 +
 userspace/tests/build-aarch64.sh              |   1 +
 userspace/tests/build.sh                      |   2 +
 18 files changed, 388 insertions(+), 190 deletions(-)
 create mode 100644 userspace/bin/services/init.rs

diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml
index af7bd725..44fbf8b7 100644
--- a/kernel/Cargo.toml
+++ b/kernel/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [lints.rust]
-unexpected_cfgs = { level = "warn", check-cfg = ["cfg(never)"] }
+unexpected_cfgs = { level = "warn", check-cfg = ["cfg(never)", "cfg(feature, values(\"particle_animation\"))"] }
 
 [[bin]]
 name = "kernel"
@@ -49,6 +49,7 @@ log = { version = "0.4.17", default-features = false }
 spin = "0.9.8"
 crossbeam-queue = { version = "0.3", default-features = false, features = ["alloc"] }
 futures-util = { version = "0.3.17", default-features = false, features = ["alloc"] }
+linked_list_allocator = "0.10"
 noto-sans-mono-bitmap = { version = "0.3", default-features = false, features = ["size_16", "regular", "unicode-basic-latin", "unicode-specials"] }
 
 [target.'cfg(target_arch = "x86_64")'.dependencies]
diff --git a/kernel/src/arch_impl/aarch64/boot.S b/kernel/src/arch_impl/aarch64/boot.S
index ea330569..43459a40 100644
--- a/kernel/src/arch_impl/aarch64/boot.S
+++ b/kernel/src/arch_impl/aarch64/boot.S
@@ -212,7 +212,7 @@ curr_el_spx_serror:
 //
Lower EL using AArch64 (user mode) .balign 0x80 lower_el_aarch64_sync: - b sync_exception_handler + b lower_el_sync_dispatch .balign 0x80 lower_el_aarch64_irq: b irq_handler @@ -347,6 +347,25 @@ zero_table_loop: .section .text +/* + * Dispatch handler for synchronous exceptions from EL0 (userspace). + * + * SVCs (syscalls) are routed to syscall_entry_from_el0 in syscall_entry.S, + * which has proper interrupt masking, reschedule checks, TTBR0 handling, + * and PREEMPT_ACTIVE management. + * + * All other sync exceptions (page faults, etc.) go to the generic + * sync_exception_handler which passes ESR/FAR to the Rust handler. + * + * Uses x16/x17 as scratch (intra-procedure call scratch registers per ABI). + */ +lower_el_sync_dispatch: + mrs x16, esr_el1 + lsr x17, x16, #26 // Extract EC field (bits [31:26]) + cmp x17, #0x15 // EC 0x15 = SVC instruction from AArch64 + b.eq syscall_entry_from_el0 + b sync_exception_handler + sync_exception_handler: // Save all registers sub sp, sp, #272 // 33 registers × 8 bytes + 8 padding @@ -452,6 +471,7 @@ irq_handler: // Restore all general-purpose registers from the exception frame // x0 is particularly critical - it contains the fork() return value! + // NOTE: x16 is restored LATER via per-CPU scratch (see below) ldp x0, x1, [sp, #0] ldp x2, x3, [sp, #16] ldp x4, x5, [sp, #32] @@ -460,7 +480,8 @@ irq_handler: ldp x10, x11, [sp, #80] ldp x12, x13, [sp, #96] ldp x14, x15, [sp, #112] - ldp x16, x17, [sp, #128] + // Skip x16 here - will be restored via per-CPU scratch after SP switch + ldr x17, [sp, #136] // Restore x17 only ldp x18, x19, [sp, #144] ldp x20, x21, [sp, #160] ldp x22, x23, [sp, #176] @@ -469,13 +490,22 @@ irq_handler: ldp x28, x29, [sp, #224] ldr x30, [sp, #240] - // Load new SP from user_rsp_scratch - // For userspace returns: this is sp+272 (just popping the exception frame) - // For kernel thread switches: this may be a different thread's saved SP - // Use x16 as scratch - it's an intra-procedure scratch register in ARM64 ABI + // Save frame.x16 to per-CPU ERET scratch (offset 96), using x16/x17 as scratch + mrs x16, tpidr_el1 // x16 = percpu base + ldr x17, [sp, #128] // x17 = frame.x16 (temp) + str x17, [x16, #96] // percpu.eret_scratch = frame.x16 + + // Re-restore x17 from frame (was used as temp above) + ldr x17, [sp, #136] // x17 = frame.x17 (final value) + + // Set SP from user_rsp_scratch (offset 40) + ldr x16, [x16, #40] // x16 = user_rsp_scratch + mov sp, x16 // SP = correct stack top + + // Restore x16 from per-CPU ERET scratch mrs x16, tpidr_el1 - ldr x16, [x16, #40] - mov sp, x16 + ldr x16, [x16, #96] // x16 = saved frame.x16 + eret unhandled_exception: diff --git a/kernel/src/arch_impl/aarch64/constants.rs b/kernel/src/arch_impl/aarch64/constants.rs index c93860be..f95cdc18 100644 --- a/kernel/src/arch_impl/aarch64/constants.rs +++ b/kernel/src/arch_impl/aarch64/constants.rs @@ -133,6 +133,10 @@ pub const PERCPU_SAVED_PROCESS_CR3_OFFSET: usize = 80; /// Offset of exception_cleanup_context flag in PerCpuData. pub const PERCPU_EXCEPTION_CLEANUP_CONTEXT_OFFSET: usize = 88; +/// Offset of scratch register save area in PerCpuData. +/// Used by assembly ERET paths to save/restore one register across SP switches. 
+pub const PERCPU_ERET_SCRATCH_OFFSET: usize = 96; + // ============================================================================ // Preempt Count Bit Layout (Linux-compatible) // ============================================================================ diff --git a/kernel/src/arch_impl/aarch64/context_switch.rs b/kernel/src/arch_impl/aarch64/context_switch.rs index 111e930d..a3103d38 100644 --- a/kernel/src/arch_impl/aarch64/context_switch.rs +++ b/kernel/src/arch_impl/aarch64/context_switch.rs @@ -23,10 +23,12 @@ use crate::task::thread::{CpuContext, ThreadPrivilege, ThreadState}; #[inline(always)] #[allow(dead_code)] fn raw_uart_char(c: u8) { - // QEMU virt machine UART base address - const UART_BASE: u64 = 0x0900_0000; + // QEMU virt machine UART via HHDM (TTBR1-mapped, safe during context switch) + // Physical 0x0900_0000 is mapped at HHDM_BASE + 0x0900_0000 + const HHDM_BASE: u64 = 0xFFFF_0000_0000_0000; + const UART_VIRT: u64 = HHDM_BASE + 0x0900_0000; unsafe { - let ptr = UART_BASE as *mut u8; + let ptr = UART_VIRT as *mut u8; core::ptr::write_volatile(ptr, c); } } @@ -328,6 +330,16 @@ fn switch_to_thread_arm64(thread_id: u64, frame: &mut Aarch64ExceptionFrame) { // Kernel threads and userspace threads blocked in syscall both need // kernel context restoration (they're running in kernel mode) setup_kernel_thread_return_arm64(thread_id, frame); + + // CRITICAL: For userspace threads blocked in syscall, set up TTBR0 so + // the correct process page table is active when the syscall completes + // and returns to userspace. Without this, TTBR0 retains the previously- + // running process's page table, causing instruction aborts when the + // thread returns to EL0 with the wrong address space. + if is_blocked_in_syscall && !is_kernel_thread { + set_next_ttbr0_for_thread(thread_id); + switch_ttbr0_if_needed(thread_id); + } } else { restore_userspace_context_arm64(thread_id, frame); } diff --git a/kernel/src/arch_impl/aarch64/syscall_entry.S b/kernel/src/arch_impl/aarch64/syscall_entry.S index f2ffd919..9f20b743 100644 --- a/kernel/src/arch_impl/aarch64/syscall_entry.S +++ b/kernel/src/arch_impl/aarch64/syscall_entry.S @@ -66,36 +66,62 @@ syscall_entry_from_el0: * ARM64 uses SP_EL1 for kernel exceptions, but we need to * ensure we're using a proper kernel stack for this thread. * - * Read kernel_stack_top from per-CPU data via TPIDR_EL1. + * CRITICAL: We must save x9/x10 BEFORE using them as scratch + * for the per-CPU lookup. Otherwise userspace gets kernel + * addresses leaked into x9/x10 on syscall return. + * + * Strategy: temporarily push x9/x10 to the current SP_EL1 stack, + * do the per-CPU lookup, then either: + * - No switch: pop x9/x10 and save normally + * - Switch: copy stashed x9/x10 from old stack into new frame */ - mrs x9, tpidr_el1 /* x9 = per-CPU base */ - cbz x9, .Lno_percpu_switch /* Skip if not initialized */ + stp x9, x10, [sp, #-16]! /* Stash user x9/x10 on current stack */ - /* Read kernel_stack_top from offset 16 */ + mrs x9, tpidr_el1 /* x9 = per-CPU base */ + cbz x9, .Lno_stack_switch ldr x10, [x9, #16] /* x10 = kernel_stack_top */ - cbz x10, .Lno_percpu_switch /* Skip if not set */ - - /* Use the kernel stack */ - mov sp, x10 + cbz x10, .Lno_stack_switch -.Lno_percpu_switch: /* - * Allocate exception frame on stack. - * Frame size: 272 bytes (34 * 8, 16-byte aligned) - * This matches Aarch64ExceptionFrame layout. + * Stack switch needed: x10 = new kernel stack top. + * User x9/x10 are stashed at [sp] on the OLD stack. 
+ * Save old SP so we can copy them into the new frame. + */ + mov x9, sp /* x9 = old SP (stash location) */ + mov sp, x10 /* Switch to per-CPU kernel stack */ + sub sp, sp, #272 /* Allocate exception frame */ + + /* Copy real user x9/x10 from old stack stash into frame */ + ldr x10, [x9] /* x10 = user x9 */ + str x10, [sp, #72] /* frame.x9 = user x9 */ + ldr x10, [x9, #8] /* x10 = user x10 */ + str x10, [sp, #80] /* frame.x10 = user x10 */ + /* x8, x11 were NOT clobbered in this path */ + str x8, [sp, #64] /* frame.x8 = syscall number */ + str x11, [sp, #88] /* frame.x11 = user x11 */ + b .Lsave_common + +.Lno_stack_switch: + /* + * No stack switch needed. Pop stashed x9/x10 (restoring + * their real user values) and save normally. */ - sub sp, sp, #272 + ldp x9, x10, [sp], #16 /* Pop user x9/x10 from stash */ + sub sp, sp, #272 /* Allocate exception frame */ + stp x8, x9, [sp, #64] /* frame.x8 = syscall number, frame.x9 */ + stp x10, x11, [sp, #80] /* frame.x10, frame.x11 */ +.Lsave_common: /* - * Save all general-purpose registers x0-x29. - * STP stores two 64-bit registers per instruction. + * Save all other general-purpose registers. + * x8-x11 are already saved by the path-specific code above. + * All other registers are unclobbered in both paths. */ stp x0, x1, [sp, #0] stp x2, x3, [sp, #16] stp x4, x5, [sp, #32] stp x6, x7, [sp, #48] - stp x8, x9, [sp, #64] /* x8 = syscall number */ - stp x10, x11, [sp, #80] + /* x8-x11 already saved above */ stp x12, x13, [sp, #96] stp x14, x15, [sp, #112] stp x16, x17, [sp, #128] @@ -175,6 +201,18 @@ syscall_entry_from_el0: str w10, [x9, #32] .Lskip_preempt_set: + /* + * Pre-set user_rsp_scratch = sp + 272 (the return SP if no context switch). + * If a context switch occurs, the Rust code will overwrite this with the + * new thread's SP. This mirrors the IRQ handler's approach in boot.S. + * PERCPU_USER_RSP_SCRATCH_OFFSET = 40 + */ + mrs x9, tpidr_el1 + cbz x9, .Lskip_rsp_scratch_set + add x10, sp, #272 + str x10, [x9, #40] +.Lskip_rsp_scratch_set: + /* * Check if rescheduling is needed before returning to userspace. * Pass frame pointer for potential context switch. @@ -285,37 +323,45 @@ syscall_entry_from_el0: .Lno_ttbr_switch: /* - * Trace: about to return to userspace. - * Call trace function BEFORE restoring x0/x1 (it will clobber them). - * Pass ELR and SPSR for debugging if needed. - */ - mrs x0, elr_el1 - mrs x1, spsr_el1 - bl trace_eret_to_el0 - - /* Now restore x0/x1 from the frame (frame still valid on kernel stack) */ - ldp x0, x1, [sp, #0] - - /* Deallocate frame */ - add sp, sp, #272 - /* - * Clear PREEMPT_ACTIVE now that registers are restored. - * Without this, PREEMPT_ACTIVE persists and blocks scheduling. + * Clear PREEMPT_ACTIVE before restoring x0/x1. + * Use x0/x1 as scratch (they haven't been restored yet). + * Must clear BEFORE ERET so next IRQ can do context switches. */ - mrs x9, tpidr_el1 - cbz x9, .Lskip_preempt_final_clear - ldr w10, [x9, #32] - bic w10, w10, #0x10000000 /* Clear bit 28 */ - str w10, [x9, #32] + mrs x0, tpidr_el1 + cbz x0, .Lskip_preempt_final_clear + ldr w1, [x0, #32] + bic w1, w1, #0x10000000 /* Clear bit 28 */ + str w1, [x0, #32] .Lskip_preempt_final_clear: /* - * Return to userspace via ERET. - * ERET will: - * - Restore PSTATE from SPSR_EL1 - * - Jump to ELR_EL1 - * - Switch to EL0 + * Restore x0/x1 and switch SP for ERET. 
+ * + * Challenge: we need to set SP = user_rsp_scratch AND restore x0/x1 + * from the frame, but after changing SP the frame may not be addressable + * (if a context switch moved us to a different kernel stack). + * + * Solution: save frame.x0 to per-CPU scratch (offset 96), restore x1 + * from the frame, switch SP, then restore x0 from per-CPU scratch. + * + * x0 = tpidr_el1 from PREEMPT_ACTIVE clear above. */ + + /* Save frame.x0 to per-CPU ERET scratch */ + ldr x1, [sp, #0] /* x1 = frame.x0 */ + str x1, [x0, #96] /* percpu.eret_scratch = frame.x0 */ + + /* Restore x1 from frame (final value) */ + ldr x1, [sp, #8] /* x1 = frame.x1 */ + + /* Set SP from user_rsp_scratch */ + ldr x0, [x0, #40] /* x0 = user_rsp_scratch */ + mov sp, x0 /* SP = correct kernel stack top */ + + /* Restore x0 from per-CPU ERET scratch */ + mrs x0, tpidr_el1 + ldr x0, [x0, #96] /* x0 = saved frame.x0 */ + eret /* Should never reach here - debug marker */ diff --git a/kernel/src/arch_impl/aarch64/syscall_entry.rs b/kernel/src/arch_impl/aarch64/syscall_entry.rs index b52d060a..e97a825c 100644 --- a/kernel/src/arch_impl/aarch64/syscall_entry.rs +++ b/kernel/src/arch_impl/aarch64/syscall_entry.rs @@ -1165,9 +1165,7 @@ fn sys_exec_aarch64( // Trace: ELF file loaded from filesystem super::trace::trace_exec(b'L'); - let boxed_slice = elf_vec.into_boxed_slice(); - let elf_data = Box::leak(boxed_slice) as &'static [u8]; - let leaked_name: &'static str = Box::leak(program_name.into_boxed_str()); + let elf_data = elf_vec.as_slice(); let current_pid = { let manager_guard = crate::process::manager(); @@ -1202,7 +1200,7 @@ fn sys_exec_aarch64( // Trace: calling exec_process_with_argv (process manager) super::trace::trace_exec(b'M'); - match manager.exec_process_with_argv(current_pid, elf_data, Some(leaked_name), &argv_slices) { + match manager.exec_process_with_argv(current_pid, elf_data, Some(&program_name), &argv_slices) { Ok((new_entry_point, new_rsp)) => { // Trace: exec_process_with_argv succeeded super::trace::trace_exec(b'S'); @@ -1277,6 +1275,15 @@ fn sys_exec_aarch64( } // Trace: TTBR0 page table switched super::trace::trace_exec(b'P'); + + // CRITICAL: Update saved_process_cr3 so the assembly ERET + // path doesn't restore the OLD (now-freed) page table. + // Without this, the .Lrestore_saved_ttbr path in syscall_entry.S + // switches TTBR0 back to the pre-exec page table, which has + // been deallocated by exec_process_with_argv. + unsafe { + Aarch64PerCpu::set_saved_process_cr3(new_ttbr0); + } } } diff --git a/kernel/src/main_aarch64.rs b/kernel/src/main_aarch64.rs index aa43a5a9..13453472 100644 --- a/kernel/src/main_aarch64.rs +++ b/kernel/src/main_aarch64.rs @@ -421,11 +421,11 @@ pub extern "C" fn kernel_main() -> ! 
{ // Try to load and run userspace init_shell from ext2 or test disk if device_count > 0 { boot_raw_char(b'2'); // Inside if - serial_println!("[boot] Loading userspace init_shell from ext2..."); + serial_println!("[boot] Loading userspace init from ext2..."); boot_raw_char(b'3'); // After serial_println - match run_userspace_from_ext2("/bin/init_shell") { + match run_userspace_from_ext2("/sbin/init") { Err(e) => { - serial_println!("[boot] Failed to load init_shell from ext2: {}", e); + serial_println!("[boot] Failed to load init from ext2: {}", e); serial_println!("[boot] Loading userspace init_shell from test disk..."); match kernel::boot::test_disk::run_userspace_from_disk("init_shell") { Err(e) => { diff --git a/kernel/src/memory/heap.rs b/kernel/src/memory/heap.rs index 1726f8fe..51a3b59a 100644 --- a/kernel/src/memory/heap.rs +++ b/kernel/src/memory/heap.rs @@ -1,4 +1,4 @@ -use spin::Mutex; +use linked_list_allocator::LockedHeap; #[cfg(target_arch = "x86_64")] use x86_64::structures::paging::{Mapper, OffsetPageTable, Page, PageTableFlags, Size4KiB}; #[cfg(target_arch = "x86_64")] @@ -17,95 +17,22 @@ pub const HEAP_START: u64 = 0x_4444_4444_0000; // Heap must be placed AFTER the frame allocator to avoid collision! pub const HEAP_START: u64 = crate::arch_impl::aarch64::constants::HHDM_BASE + 0x5000_0000; -/// Heap size of 4 MiB. +/// Heap size: 32 MiB. /// -/// This size was chosen to support concurrent process tests which require: -/// - Multiple child processes (4+) running simultaneously after fork() -/// - Each process needs: fd table (~6KB), pipe buffers (4KB each), ProcessInfo struct, -/// Thread structs, page tables, and kernel stack allocations -/// - Total per-process overhead is approximately 50-100KB depending on fd usage -/// -/// IMPORTANT: We use a bump allocator which only reclaims memory when ALL allocations -/// are freed. This means memory fragmentation is effectively permanent during a test run. -/// The 4 MiB size provides sufficient headroom for: -/// - Boot initialization allocations (~500KB) +/// This provides sufficient headroom for: +/// - Boot initialization allocations /// - Running 10+ concurrent processes with full fd tables -/// - Pipe buffers for IPC testing -/// - Safety margin for test variations -/// -/// Reduced sizes (1-2 MiB) caused OOM during concurrent fork/pipe tests. -/// Increased from 1 MiB based on empirical testing of pipe_concurrent_test scenarios. -/// Increased from 4 MiB to 32 MiB to accommodate ext2 filesystem operations which -/// allocate Vec buffers that aren't freed by the bump allocator. -/// The test suite runs 43+ processes, each needing kernel stacks (64KB), page tables, -/// file descriptor tables, etc. The bump allocator never reclaims until ALL allocations -/// are freed, so memory accumulates across the entire test run. 
+/// - ext2 filesystem operations
+/// - Network stack buffers
 pub const HEAP_SIZE: u64 = 32 * 1024 * 1024;
 
-/// A simple bump allocator
-struct BumpAllocator {
-    heap_start: u64,
-    heap_end: u64,
-    next: u64,
-    allocations: usize,
-}
-
-impl BumpAllocator {
-    /// Creates a new bump allocator
-    pub const fn new() -> Self {
-        Self {
-            heap_start: 0,
-            heap_end: 0,
-            next: 0,
-            allocations: 0,
-        }
-    }
-
-    /// Initializes the bump allocator with the given heap bounds
-    pub unsafe fn init(&mut self, heap_start: u64, heap_size: u64) {
-        self.heap_start = heap_start;
-        self.heap_end = heap_start + heap_size;
-        self.next = heap_start;
-    }
-}
-
-/// Wrapper for the global allocator
-pub struct GlobalAllocator(Mutex<BumpAllocator>);
-
-unsafe impl core::alloc::GlobalAlloc for GlobalAllocator {
-    unsafe fn alloc(&self, layout: core::alloc::Layout) -> *mut u8 {
-        let mut allocator = self.0.lock();
-
-        // Align the start address
-        let alloc_start = align_up(allocator.next, layout.align() as u64);
-        let alloc_end = match alloc_start.checked_add(layout.size() as u64) {
-            Some(end) => end,
-            None => return core::ptr::null_mut(),
-        };
-
-        if alloc_end > allocator.heap_end {
-            core::ptr::null_mut() // out of memory
-        } else {
-            allocator.next = alloc_end;
-            allocator.allocations += 1;
-            alloc_start as *mut u8
-        }
-    }
-
-    unsafe fn dealloc(&self, _ptr: *mut u8, _layout: core::alloc::Layout) {
-        let mut allocator = self.0.lock();
-
-        allocator.allocations -= 1;
-        if allocator.allocations == 0 {
-            allocator.next = allocator.heap_start;
-        }
-    }
-}
-
-/// Global allocator instance
-/// Defined for all architectures
+/// Global allocator instance using a proper free-list allocator.
+///
+/// Unlike the previous bump allocator, linked_list_allocator properly
+/// reclaims freed memory, preventing heap exhaustion from temporary
+/// allocations (Vec clones, BTreeMap nodes, etc.).
 #[global_allocator]
-static ALLOCATOR: GlobalAllocator = GlobalAllocator(Mutex::new(BumpAllocator::new()));
+static ALLOCATOR: LockedHeap = LockedHeap::empty();
 
 /// Initialize the heap allocator
 pub fn init(mapper: &OffsetPageTable<'static>) -> Result<(), &'static str> {
@@ -159,7 +86,7 @@ pub fn init(mapper: &OffsetPageTable<'static>) -> Result<(), &'static str> {
     // Initialize the allocator
     unsafe {
-        ALLOCATOR.0.lock().init(HEAP_START, HEAP_SIZE);
+        ALLOCATOR.lock().init(HEAP_START as *mut u8, HEAP_SIZE as usize);
     }
 
     log::info!(
@@ -171,11 +98,6 @@ pub fn init(mapper: &OffsetPageTable<'static>) -> Result<(), &'static str> {
 
     Ok(())
 }
 
-/// Align the given address upwards to the given alignment
-fn align_up(addr: u64, align: u64) -> u64 {
-    (addr + align - 1) & !(align - 1)
-}
-
 /// Handle allocation errors
 #[alloc_error_handler]
 fn alloc_error_handler(layout: core::alloc::Layout) -> ! {
diff --git a/kernel/src/per_cpu_aarch64.rs b/kernel/src/per_cpu_aarch64.rs
index 2fb2259f..d05fe617 100644
--- a/kernel/src/per_cpu_aarch64.rs
+++ b/kernel/src/per_cpu_aarch64.rs
@@ -48,8 +48,13 @@ pub struct PerCpuData {
     pub saved_process_ttbr0: u64,
     /// Exception cleanup context flag (offset 88)
     pub exception_cleanup_context: u8,
+    /// Padding to align eret_scratch to 8 bytes
+    _pad3a: [u8; 7],
+    /// Scratch register save area for ERET paths (offset 96)
+    /// Used by assembly to save one register across SP switches during ERET.
+    pub eret_scratch: u64,
     /// Padding to match x86_64 layout
-    _pad3: [u8; 103],
+    _pad3: [u8; 88],
 }
 
 const _: () = assert!(core::mem::size_of::<PerCpuData>() == 192, "PerCpuData must be 192 bytes");
@@ -73,7 +78,9 @@ impl PerCpuData {
             kernel_ttbr0: 0,
             saved_process_ttbr0: 0,
             exception_cleanup_context: 0,
-            _pad3: [0; 103],
+            _pad3a: [0; 7],
+            eret_scratch: 0,
+            _pad3: [0; 88],
         }
     }
 }
diff --git a/kernel/src/process/manager.rs b/kernel/src/process/manager.rs
index 81a0c2c2..6c17f678 100644
--- a/kernel/src/process/manager.rs
+++ b/kernel/src/process/manager.rs
@@ -2946,6 +2946,22 @@ impl ProcessManager {
                 "exec_process_with_argv [ARM64]: Updated thread {} context for new program",
                 thread_id
             );
+
+            // CRITICAL: Sync updated context to the scheduler's copy of this thread.
+            // The process manager and scheduler maintain SEPARATE Thread objects (cloned
+            // at process creation). Without this sync, the scheduler would restore stale
+            // context (e.g., elr_el1=0) on the next context switch, causing ELR=0x0 crashes.
+            let ctx = thread.context.clone();
+            let st = thread.stack_top;
+            let sb = thread.stack_bottom;
+            let kst = thread.kernel_stack_top;
+            crate::task::scheduler::with_thread_mut(thread_id, |sched_thread| {
+                sched_thread.context = ctx;
+                sched_thread.stack_top = st;
+                sched_thread.stack_bottom = sb;
+                sched_thread.kernel_stack_top = kst;
+                sched_thread.state = crate::task::thread::ThreadState::Ready;
+            });
         }
 
         if is_current_process {
@@ -3198,6 +3214,20 @@ impl ProcessManager {
                 "exec_process [ARM64]: Updated thread {} context for new program",
                 thread_id
             );
+
+            // CRITICAL: Sync updated context to the scheduler's copy of this thread.
+            // See exec_process_with_argv for detailed explanation of the dual-storage issue.
+            let ctx = thread.context.clone();
+            let st = thread.stack_top;
+            let sb = thread.stack_bottom;
+            let kst = thread.kernel_stack_top;
+            crate::task::scheduler::with_thread_mut(thread_id, |sched_thread| {
+                sched_thread.context = ctx;
+                sched_thread.stack_top = st;
+                sched_thread.stack_bottom = sb;
+                sched_thread.kernel_stack_top = kst;
+                sched_thread.state = crate::task::thread::ThreadState::Ready;
+            });
         }
 
         log::info!(
diff --git a/kernel/src/syscall/handlers.rs b/kernel/src/syscall/handlers.rs
index 9e0ad99d..9cf6b6dd 100644
--- a/kernel/src/syscall/handlers.rs
+++ b/kernel/src/syscall/handlers.rs
@@ -1432,7 +1432,10 @@ pub fn sys_exec_with_frame(
     // Load the program by name from the test disk
     // We need both the ELF data and the program name for exec_process
-    let (elf_data, exec_program_name): (&'static [u8], Option<&'static str>) = if program_name_ptr != 0 {
+    // Owned data must live long enough for exec_process to borrow
+    let mut _elf_vec_storage: Option<Vec<u8>> = None;
+    let mut _name_storage: Option<alloc::string::String> = None;
+    let (elf_data, exec_program_name): (&[u8], Option<&str>) = if program_name_ptr != 0 {
         // Read the program name from userspace
         log::info!("sys_exec: Reading program name from userspace");
 
@@ -1469,13 +1472,9 @@
         {
             // Load the binary from the test disk by name
             let elf_vec = crate::userspace_test::get_test_binary(program_name);
-            // Leak the vector to get a static slice (needed for exec_process)
-            let boxed_slice = elf_vec.into_boxed_slice();
-            let elf_data = Box::leak(boxed_slice) as &'static [u8];
-            // Also leak the program name so we can pass it to exec_process
-            let name_string = alloc::string::String::from(program_name);
-            let leaked_name: &'static str = Box::leak(name_string.into_boxed_str());
-            (elf_data, Some(leaked_name))
+            _elf_vec_storage =
+                Some(elf_vec);
+            _name_storage = Some(alloc::string::String::from(program_name));
+            (_elf_vec_storage.as_ref().unwrap().as_slice(), Some(_name_storage.as_ref().unwrap().as_str()))
         }
         #[cfg(not(feature = "testing"))]
         {
@@ -1804,10 +1803,7 @@ pub fn sys_execv_with_frame(
             }
         }
     };
-    let boxed_slice = elf_vec.into_boxed_slice();
-    let elf_data = Box::leak(boxed_slice) as &'static [u8];
-    let name_string = alloc::string::String::from(program_name);
-    let leaked_name: &'static str = Box::leak(name_string.into_boxed_str());
+    let elf_data = elf_vec.as_slice();
 
     // Find current process
     let current_pid = {
@@ -1839,7 +1835,7 @@
     Cpu::without_interrupts(|| {
         let mut manager_guard = crate::process::manager();
         if let Some(ref mut manager) = *manager_guard {
-            match manager.exec_process_with_argv(current_pid, elf_data, Some(leaked_name), &argv_slices) {
+            match manager.exec_process_with_argv(current_pid, elf_data, Some(program_name), &argv_slices) {
                 Ok((new_entry_point, new_rsp)) => {
                     log::info!(
                         "sys_execv: Successfully replaced process address space, entry={:#x}, rsp={:#x}",
@@ -1949,7 +1945,10 @@ pub fn sys_exec(program_name_ptr: u64, elf_data_ptr: u64) -> SyscallResult {
     // Load the program by name from the test disk
     // In a real implementation, this would come from the filesystem
     // We need both the ELF data and the program name for exec_process
-    let (_elf_data, _exec_program_name): (&'static [u8], Option<&'static str>) = if program_name_ptr != 0 {
+    // Owned data must live long enough for exec_process to borrow
+    let mut _elf_vec_storage2: Option<Vec<u8>> = None;
+    let mut _name_storage2: Option<alloc::string::String> = None;
+    let (_elf_data, _exec_program_name): (&[u8], Option<&str>) = if program_name_ptr != 0 {
         // Read the program name from userspace
         log::info!("sys_exec: Reading program name from userspace");
 
@@ -1978,13 +1977,9 @@ pub fn sys_exec(program_name_ptr: u64, elf_data_ptr: u64) -> SyscallResult {
         {
             // Load the binary from the test disk by name
             let elf_vec = crate::userspace_test::get_test_binary(program_name);
-            // Leak the vector to get a static slice (needed for exec_process)
-            let boxed_slice = elf_vec.into_boxed_slice();
-            let elf_data = Box::leak(boxed_slice) as &'static [u8];
-            // Also leak the program name so we can pass it to exec_process
-            let name_string = alloc::string::String::from(program_name);
-            let leaked_name: &'static str = Box::leak(name_string.into_boxed_str());
-            (elf_data, Some(leaked_name))
+            _elf_vec_storage2 = Some(elf_vec);
+            _name_storage2 = Some(alloc::string::String::from(program_name));
+            (_elf_vec_storage2.as_ref().unwrap().as_slice(), Some(_name_storage2.as_ref().unwrap().as_str()))
         }
         #[cfg(not(feature = "testing"))]
         {
diff --git a/kernel/src/syscall/io.rs b/kernel/src/syscall/io.rs
index 9dc1b5fd..c111b304 100644
--- a/kernel/src/syscall/io.rs
+++ b/kernel/src/syscall/io.rs
@@ -9,18 +9,6 @@ use super::SyscallResult;
 use alloc::vec::Vec;
 use crate::syscall::userptr::validate_user_buffer;
 
-/// Raw serial debug output - write a string without locks or allocations.
-/// Safe to call from any context including interrupt handlers and syscalls.
-#[allow(dead_code)] // Debug utility, kept for future use
-#[inline(always)]
-fn raw_serial_str(s: &[u8]) {
-    let base = crate::memory::physical_memory_offset().as_u64();
-    let addr = (base + 0x0900_0000) as *mut u32;
-    for &c in s {
-        unsafe { core::ptr::write_volatile(addr, c as u32); }
-    }
-}
-
 /// Copy a byte buffer from userspace.
 fn copy_from_user_bytes(ptr: u64, len: usize) -> Result<Vec<u8>, u64> {
     if len == 0 {
diff --git a/run.sh b/run.sh
index e653a5bd..458999ab 100755
--- a/run.sh
+++ b/run.sh
@@ -5,6 +5,7 @@
 #
 # Usage:
 #   ./run.sh                   # ARM64 with native cocoa display (default)
+#   ./run.sh --clean           # Full rebuild (userspace + ext2 disk + kernel), then run
 #   ./run.sh --x86             # x86_64 with VNC display
 #   ./run.sh --headless        # ARM64 with serial output only
 #   ./run.sh --x86 --headless  # x86_64 with serial output only
@@ -23,6 +24,7 @@ BREENIX_ROOT="$SCRIPT_DIR"
 # Defaults: ARM64 with graphics
 ARCH="arm64"
 HEADLESS=false
+CLEAN=false
 
 # Parse arguments
 while [[ $# -gt 0 ]]; do
@@ -35,6 +37,10 @@ while [[ $# -gt 0 ]]; do
             ARCH="arm64"
             shift
             ;;
+        --clean)
+            CLEAN=true
+            shift
+            ;;
         --headless|--serial)
             HEADLESS=true
             shift
@@ -47,6 +53,7 @@ while [[ $# -gt 0 ]]; do
             echo "Usage: ./run.sh [options]"
             echo ""
             echo "Options:"
+            echo "  --clean                   Full rebuild: userspace, ext2 disk, kernel"
            echo "  --x86, --x86_64, --amd64  Run x86_64 kernel (default: ARM64)"
             echo "  --arm64, --aarch64        Run ARM64 kernel (default)"
             echo "  --headless, --serial      Run without display (serial only)"
@@ -95,6 +102,34 @@ echo ""
 echo "Architecture: $ARCH"
 echo "Mode: $([ "$HEADLESS" = true ] && echo "headless (serial only)" || echo "graphics (VNC)")"
 
+# --clean: full rebuild of userspace, ext2 disk, and kernel
+if [ "$CLEAN" = true ]; then
+    echo ""
+    echo "Clean build: rebuilding everything..."
+    echo ""
+
+    if [ "$ARCH" = "arm64" ]; then
+        echo "[1/3] Building userspace binaries (aarch64)..."
+        (cd "$BREENIX_ROOT/userspace/tests" && ./build-aarch64.sh)
+
+        echo ""
+        echo "[2/3] Creating ext2 disk image..."
+        "$BREENIX_ROOT/scripts/create_ext2_disk.sh" --arch aarch64
+    else
+        echo "[1/3] Building userspace binaries (x86_64)..."
+        (cd "$BREENIX_ROOT/userspace/tests" && ./build.sh)
+
+        echo ""
+        echo "[2/3] Creating ext2 disk image..."
+        "$BREENIX_ROOT/scripts/create_ext2_disk.sh"
+    fi
+
+    echo ""
+    echo "[3/3] Building kernel..."
+    eval $BUILD_CMD
+    echo ""
+fi
+
 # Check if kernel exists, offer to build
 if [ ! -f "$KERNEL" ]; then
     echo ""
diff --git a/scripts/create_ext2_disk.sh b/scripts/create_ext2_disk.sh
index 1213dfc4..36dcc6ba 100755
--- a/scripts/create_ext2_disk.sh
+++ b/scripts/create_ext2_disk.sh
@@ -120,7 +120,7 @@ if [[ "$(uname)" == "Darwin" ]]; then
     for elf_file in /binaries/*.elf; do
         if [ -f "$elf_file" ]; then
             bin_name=$(basename "$elf_file" .elf)
-            if [ "$bin_name" = "true" ] || [ "$bin_name" = "telnetd" ]; then
+            if [ "$bin_name" = "true" ] || [ "$bin_name" = "telnetd" ] || [ "$bin_name" = "init" ]; then
                 cp "$elf_file" /mnt/ext2/sbin/${bin_name}
                 chmod 755 /mnt/ext2/sbin/${bin_name}
                 sbin_count=$((sbin_count + 1))
@@ -215,7 +215,7 @@ else
     for elf_file in "$USERSPACE_DIR"/*.elf; do
         if [ -f "$elf_file" ]; then
             bin_name=$(basename "$elf_file" .elf)
-            if [ "$bin_name" = "true" ] || [ "$bin_name" = "telnetd" ]; then
+            if [ "$bin_name" = "true" ] || [ "$bin_name" = "telnetd" ] || [ "$bin_name" = "init" ]; then
                 cp "$elf_file" "$MOUNT_DIR/sbin/${bin_name}"
                 chmod 755 "$MOUNT_DIR/sbin/${bin_name}"
                 sbin_count=$((sbin_count + 1))
diff --git a/userspace/bin/services/init.rs b/userspace/bin/services/init.rs
new file mode 100644
index 00000000..c5c1323d
--- /dev/null
+++ b/userspace/bin/services/init.rs
@@ -0,0 +1,114 @@
+//! Breenix init process (/sbin/init)
+//!
+//! PID 1 - spawns system services and the interactive shell, then reaps zombies.
+//!
+//! Spawns:
+//! - /sbin/telnetd (background service)
+//! - /bin/init_shell (foreground shell on serial console)
+//!
+//!
Main loop reaps terminated children with waitpid(WNOHANG) and respawns +//! crashed services. + +#![no_std] +#![no_main] + +use core::panic::PanicInfo; +use libbreenix::io; +use libbreenix::process; + +const TELNETD_PATH: &[u8] = b"/sbin/telnetd\0"; +const SHELL_PATH: &[u8] = b"/bin/init_shell\0"; + +/// Fork and exec a binary. Returns the child PID on success, -1 on failure. +fn spawn(path: &[u8], name: &str) -> i64 { + let pid = process::fork(); + if pid == 0 { + // Child: exec the binary + let argv: [*const u8; 2] = [path.as_ptr(), core::ptr::null()]; + let _ = process::execv(path, argv.as_ptr()); + // exec failed + io::print("[init] ERROR: exec failed for "); + io::print(name); + io::print("\n"); + process::exit(127); + } + if pid < 0 { + io::print("[init] ERROR: fork failed for "); + io::print(name); + io::print("\n"); + return -1; + } + pid +} + +#[no_mangle] +pub extern "C" fn _start() -> ! { + let pid = process::getpid(); + io::print("[init] Breenix init starting (PID "); + print_i32(pid as i32); + io::print(")\n"); + + // Start telnetd + io::print("[init] Starting /sbin/telnetd...\n"); + let mut telnetd_pid = spawn(TELNETD_PATH, "telnetd"); + + // Start interactive shell + io::print("[init] Starting /bin/init_shell...\n"); + let mut shell_pid = spawn(SHELL_PATH, "init_shell"); + + // Main loop: reap zombies and respawn crashed services + let mut status: i32 = 0; + loop { + let reaped = process::waitpid(-1, &mut status as *mut i32, process::WNOHANG); + if reaped > 0 { + if reaped == shell_pid { + // Shell exited — respawn it + io::print("[init] Shell exited, respawning...\n"); + shell_pid = spawn(SHELL_PATH, "init_shell"); + } else if reaped == telnetd_pid { + // Telnetd crashed — respawn it + io::print("[init] telnetd exited, respawning...\n"); + telnetd_pid = spawn(TELNETD_PATH, "telnetd"); + } + } + process::yield_now(); + } +} + +/// Print an i32 as decimal to serial output. +fn print_i32(mut n: i32) { + if n < 0 { + io::print("-"); + // Handle i32::MIN edge case + if n == i32::MIN { + io::print("2147483648"); + return; + } + n = -n; + } + if n == 0 { + io::print("0"); + return; + } + let mut buf = [0u8; 10]; + let mut i = 0usize; + while n > 0 { + buf[i] = b'0' + (n % 10) as u8; + n /= 10; + i += 1; + } + // Reverse and print + while i > 0 { + i -= 1; + let ch = [buf[i]]; + // SAFETY: buf[i] is always a valid ASCII digit + let s = unsafe { core::str::from_utf8_unchecked(&ch) }; + io::print(s); + } +} + +#[panic_handler] +fn panic(_info: &PanicInfo) -> ! 
{ + io::print("[init] PANIC\n"); + process::exit(101); +} diff --git a/userspace/tests/Cargo.toml b/userspace/tests/Cargo.toml index 64ca6c73..f469e16e 100644 --- a/userspace/tests/Cargo.toml +++ b/userspace/tests/Cargo.toml @@ -109,6 +109,10 @@ path = "../bin/coreutils/resolution.rs" name = "telnetd" path = "../bin/services/telnetd.rs" +[[bin]] +name = "init" +path = "../bin/services/init.rs" + # ============================================================================ # TESTS - Actual test programs # ============================================================================ diff --git a/userspace/tests/build-aarch64.sh b/userspace/tests/build-aarch64.sh index 5fad101d..50fa280e 100755 --- a/userspace/tests/build-aarch64.sh +++ b/userspace/tests/build-aarch64.sh @@ -60,6 +60,7 @@ BINARIES=( # SERVICES - Daemons and network services # ============================================================================ "telnetd" + "init" # ============================================================================ # TESTS - Process and fork tests diff --git a/userspace/tests/build.sh b/userspace/tests/build.sh index 83e6a013..dfb74668 100755 --- a/userspace/tests/build.sh +++ b/userspace/tests/build.sh @@ -142,6 +142,8 @@ BINARIES=( "ls_test" # PTY telnet server "telnetd" + # Init system + "init" # PTY integration test "pty_test" # Graphics syscall tests