diff --git a/kernel/arch/aarch64/syscall/linux_syscalls.h b/kernel/arch/aarch64/syscall/linux_syscalls.h index ec54ab4..ab03472 100644 --- a/kernel/arch/aarch64/syscall/linux_syscalls.h +++ b/kernel/arch/aarch64/syscall/linux_syscalls.h @@ -5,15 +5,15 @@ namespace syscall::linux_nr { +constexpr uint64_t FCNTL = 25; +constexpr uint64_t IOCTL = 29; +constexpr uint64_t UNLINKAT = 35; +constexpr uint64_t FTRUNCATE = 46; +constexpr uint64_t OPENAT = 56; +constexpr uint64_t CLOSE = 57; constexpr uint64_t READ = 63; constexpr uint64_t WRITE = 64; -constexpr uint64_t CLOSE = 57; -constexpr uint64_t IOCTL = 29; constexpr uint64_t WRITEV = 66; -constexpr uint64_t OPENAT = 56; -constexpr uint64_t MUNMAP = 215; -constexpr uint64_t MMAP = 222; -constexpr uint64_t MPROTECT = 226; constexpr uint64_t EXIT = 93; constexpr uint64_t EXIT_GROUP = 94; constexpr uint64_t SET_TID_ADDRESS = 96; @@ -23,7 +23,10 @@ constexpr uint64_t BIND = 200; constexpr uint64_t LISTEN = 201; constexpr uint64_t ACCEPT = 202; constexpr uint64_t CONNECT = 203; -constexpr uint64_t FCNTL = 25; +constexpr uint64_t MUNMAP = 215; +constexpr uint64_t MMAP = 222; +constexpr uint64_t MPROTECT = 226; +constexpr uint64_t MEMFD_CREATE = 279; } // namespace syscall::linux_nr diff --git a/kernel/arch/x86_64/syscall/linux_syscalls.h b/kernel/arch/x86_64/syscall/linux_syscalls.h index 1c1804e..4b11107 100644 --- a/kernel/arch/x86_64/syscall/linux_syscalls.h +++ b/kernel/arch/x86_64/syscall/linux_syscalls.h @@ -9,23 +9,26 @@ constexpr uint64_t READ = 0; constexpr uint64_t WRITE = 1; constexpr uint64_t OPEN = 2; constexpr uint64_t CLOSE = 3; -constexpr uint64_t IOCTL = 16; -constexpr uint64_t WRITEV = 20; constexpr uint64_t MMAP = 9; constexpr uint64_t MPROTECT = 10; constexpr uint64_t MUNMAP = 11; -constexpr uint64_t OPENAT = 257; -constexpr uint64_t EXIT = 60; -constexpr uint64_t ARCH_PRCTL = 158; -constexpr uint64_t SET_TID_ADDRESS = 218; -constexpr uint64_t EXIT_GROUP = 231; +constexpr uint64_t IOCTL = 16; +constexpr 
uint64_t WRITEV = 20; constexpr uint64_t SOCKET = 41; constexpr uint64_t CONNECT = 42; constexpr uint64_t ACCEPT = 43; constexpr uint64_t BIND = 49; constexpr uint64_t LISTEN = 50; constexpr uint64_t SOCKETPAIR = 53; +constexpr uint64_t EXIT = 60; constexpr uint64_t FCNTL = 72; +constexpr uint64_t FTRUNCATE = 77; +constexpr uint64_t ARCH_PRCTL = 158; +constexpr uint64_t SET_TID_ADDRESS = 218; +constexpr uint64_t EXIT_GROUP = 231; +constexpr uint64_t OPENAT = 257; +constexpr uint64_t UNLINKAT = 263; +constexpr uint64_t MEMFD_CREATE = 319; } // namespace syscall::linux_nr diff --git a/kernel/fs/fs.cpp b/kernel/fs/fs.cpp index 81bd6ed..10ca3f0 100644 --- a/kernel/fs/fs.cpp +++ b/kernel/fs/fs.cpp @@ -61,6 +61,7 @@ int32_t node::open(file*, uint32_t) { return OK; } int32_t node::on_close(file*) { return OK; } int32_t node::readlink(char*, size_t, size_t*) { return ERR_NOSYS; } int32_t node::create_socket(const char*, size_t, void*, node**) { return ERR_NOSYS; } +int32_t node::truncate(size_t) { return ERR_NOSYS; } int32_t node::getattr(vattr* attr) { if (!attr) return ERR_INVAL; diff --git a/kernel/fs/fstypes.h b/kernel/fs/fstypes.h index d6def25..1e18521 100644 --- a/kernel/fs/fstypes.h +++ b/kernel/fs/fstypes.h @@ -22,6 +22,7 @@ constexpr uint32_t O_RDONLY = 0; constexpr uint32_t O_WRONLY = 1; constexpr uint32_t O_RDWR = 2; constexpr uint32_t O_CREAT = 0x40; +constexpr uint32_t O_EXCL = 0x80; constexpr uint32_t O_TRUNC = 0x200; constexpr uint32_t O_APPEND = 0x400; constexpr uint32_t O_NONBLOCK = 0x800; diff --git a/kernel/fs/node.h b/kernel/fs/node.h index 034e81e..164a1a0 100644 --- a/kernel/fs/node.h +++ b/kernel/fs/node.h @@ -47,6 +47,7 @@ class node : public rc::ref_counted { // --- Metadata --- virtual int32_t getattr(vattr* attr); + virtual int32_t truncate(size_t size); // --- Symlink --- virtual int32_t readlink(char* buf, size_t size, size_t* out_len); diff --git a/kernel/fs/ramfs/ramfs.cpp b/kernel/fs/ramfs/ramfs.cpp index d8cef3b..bdf8bc2 100644 --- 
a/kernel/fs/ramfs/ramfs.cpp
+++ b/kernel/fs/ramfs/ramfs.cpp
@@ -481,4 +481,55 @@ int32_t file_node::getattr(fs::vattr* attr) {
     return fs::OK;
 }
 
+int32_t file_node::truncate(size_t size) {
+    size_t max_alignable = ~(pmm::PAGE_SIZE - 1);
+    if (size > max_alignable) {
+        return fs::ERR_INVAL;
+    }
+
+    sync::irq_lock_guard guard(m_lock);
+
+    uint32_t needed = static_cast<uint32_t>(
+        pmm::page_align_up(size) / pmm::PAGE_SIZE);
+
+    if (needed > m_page_count) {
+        int32_t rc = ensure_capacity(needed);
+        if (rc != fs::OK) {
+            return rc;
+        }
+
+        for (uint32_t i = m_page_count; i < needed; i++) {
+            pmm::phys_addr_t phys = pmm::alloc_page();
+            if (phys == 0) {
+                m_size = static_cast<size_t>(i) * pmm::PAGE_SIZE;
+                m_page_count = i;
+                return fs::ERR_NOMEM;
+            }
+            m_pages[i] = static_cast<uint8_t*>(paging::phys_to_virt(phys));
+            string::memset(m_pages[i], 0, pmm::PAGE_SIZE);
+        }
+        m_page_count = needed;
+    } else if (needed < m_page_count) {
+        for (uint32_t i = needed; i < m_page_count; i++) {
+            if (m_pages[i]) {
+                pmm::phys_addr_t phys =
+                    reinterpret_cast<pmm::phys_addr_t>(m_pages[i]) - g_boot_info.hhdm_offset;
+                pmm::free_page(phys);
+                m_pages[i] = nullptr;
+            }
+        }
+        m_page_count = needed;
+    }
+
+    if (size < m_size && needed > 0) {
+        size_t tail_off = size % pmm::PAGE_SIZE;
+        if (tail_off != 0 && m_pages[needed - 1]) {
+            string::memset(m_pages[needed - 1] + tail_off, 0, pmm::PAGE_SIZE - tail_off);
+        }
+    }
+
+    m_size = size;
+    return fs::OK;
+}
+
 } // namespace ramfs
diff --git a/kernel/fs/ramfs/ramfs.h b/kernel/fs/ramfs/ramfs.h
index fe13ed8..4e5aae6 100644
--- a/kernel/fs/ramfs/ramfs.h
+++ b/kernel/fs/ramfs/ramfs.h
@@ -45,6 +45,7 @@ class file_node : public fs::node {
     ssize_t write(fs::file* f, const void* buf, size_t count) override;
     int64_t seek(fs::file* f, int64_t offset, int whence) override;
     int32_t getattr(fs::vattr* attr) override;
+    int32_t truncate(size_t size) override;
 
 private:
     int32_t ensure_capacity(uint32_t needed_pages);
diff --git a/kernel/mm/shmem.cpp b/kernel/mm/shmem.cpp
new file mode 100644
index
0000000..4ad34ce
--- /dev/null
+++ b/kernel/mm/shmem.cpp
@@ -0,0 +1,259 @@
+#include "mm/shmem.h"
+
+#include "common/string.h"
+#include "dynpriv/dynpriv.h"
+#include "mm/heap.h"
+#include "mm/paging.h"
+#include "mm/pmm.h"
+
+namespace mm {
+
+namespace {
+
+constexpr size_t INITIAL_PAGE_CAPACITY = 4;
+
+bool ensure_capacity(shmem* s, size_t needed) {
+    if (needed <= s->m_capacity) {
+        return true;
+    }
+
+    size_t new_cap = s->m_capacity ? s->m_capacity : INITIAL_PAGE_CAPACITY;
+    while (new_cap < needed) {
+        new_cap *= 2;
+    }
+
+    auto* new_pages = static_cast<pmm::phys_addr_t*>(
+        heap::kzalloc(new_cap * sizeof(pmm::phys_addr_t)));
+    if (!new_pages) {
+        return false;
+    }
+
+    if (s->m_pages && s->m_capacity > 0) {
+        string::memcpy(new_pages, s->m_pages,
+                       s->m_capacity * sizeof(pmm::phys_addr_t));
+        heap::kfree(s->m_pages);
+    }
+
+    s->m_pages = new_pages;
+    s->m_capacity = new_cap;
+    return true;
+}
+
+} // namespace
+
+void shmem::ref_destroy(shmem* self) {
+    if (!self) {
+        return;
+    }
+
+    RUN_ELEVATED({
+        if (self->m_pages) {
+            for (size_t i = 0; i < self->m_capacity; i++) {
+                if (self->m_pages[i] != 0) {
+                    pmm::free_page(self->m_pages[i]);
+                }
+            }
+            heap::kfree(self->m_pages);
+        }
+
+        heap::kfree_delete(self);
+    });
+}
+
+shmem* shmem_create(size_t initial_size) {
+    shmem* s = nullptr;
+    RUN_ELEVATED({
+        s = heap::kalloc_new<shmem>();
+    });
+    if (!s) {
+        return nullptr;
+    }
+
+    RUN_ELEVATED({
+        s->m_pages = nullptr;
+        s->m_page_count = 0;
+        s->m_capacity = 0;
+        s->m_size = 0;
+        s->lock.init();
+    });
+
+    if (initial_size > 0) {
+        int32_t rc = SHMEM_OK;
+        RUN_ELEVATED({
+            sync::mutex_lock(s->lock);
+            rc = shmem_resize_locked(s, initial_size);
+            sync::mutex_unlock(s->lock);
+        });
+        if (rc != SHMEM_OK) {
+            shmem::ref_destroy(s);
+            return nullptr;
+        }
+    }
+
+    return s;
+}
+
+int32_t shmem_resize_locked(shmem* s, size_t new_size) {
+    if (!s) {
+        return SHMEM_ERR_INVAL;
+    }
+
+    size_t max_alignable = ~(pmm::PAGE_SIZE - 1);
+    if (new_size > max_alignable) {
+        return SHMEM_ERR_INVAL;
+    }
+
+    int32_t result = SHMEM_OK;
+    RUN_ELEVATED({
+        size_t new_page_count = pmm::page_align_up(new_size) / pmm::PAGE_SIZE;
+        size_t old_page_count = s->m_page_count;
+
+        if (new_page_count > old_page_count) {
+            if (!ensure_capacity(s, new_page_count)) {
+                result = SHMEM_ERR_NO_MEM;
+            } else {
+                for (size_t i = old_page_count; i < new_page_count; i++) {
+                    if (s->m_pages[i] != 0) {
+                        string::memset(paging::phys_to_virt(s->m_pages[i]), 0, pmm::PAGE_SIZE);
+                        continue;
+                    }
+                    pmm::phys_addr_t phys = pmm::alloc_page();
+                    if (phys == 0) {
+                        if (i > old_page_count) {
+                            s->m_page_count = i;
+                            s->m_size = i * pmm::PAGE_SIZE;
+                        }
+                        result = SHMEM_ERR_NO_MEM;
+                        break;
+                    }
+                    string::memset(paging::phys_to_virt(phys), 0, pmm::PAGE_SIZE);
+                    s->m_pages[i] = phys;
+                }
+            }
+        }
+
+        if (result == SHMEM_OK) {
+            if (new_size < s->m_size && new_page_count > 0) {
+                size_t tail_off = new_size % pmm::PAGE_SIZE;
+                if (tail_off != 0 && s->m_pages[new_page_count - 1] != 0) {
+                    auto* page = static_cast<uint8_t*>(
+                        paging::phys_to_virt(s->m_pages[new_page_count - 1]));
+                    string::memset(page + tail_off, 0, pmm::PAGE_SIZE - tail_off);
+                }
+            }
+            s->m_page_count = new_page_count;
+            s->m_size = new_size;
+        }
+    });
+    return result;
+}
+
+pmm::phys_addr_t shmem_get_page_locked(shmem* s, size_t page_index) {
+    pmm::phys_addr_t phys = 0;
+    RUN_ELEVATED({
+        if (s && page_index < s->m_page_count) {
+            phys = s->m_pages[page_index];
+        }
+    });
+    return phys;
+}
+
+ssize_t shmem_read(shmem* s, size_t offset, void* dst, size_t count) {
+    if (!s || !dst) {
+        return SHMEM_ERR_INVAL;
+    }
+
+    ssize_t result = 0;
+    RUN_ELEVATED({
+        sync::mutex_lock(s->lock);
+
+        if (offset >= s->m_size) {
+            result = 0;
+        } else {
+            if (offset + count > s->m_size) {
+                count = s->m_size - offset;
+            }
+
+            auto* out = static_cast<uint8_t*>(dst);
+            size_t remaining = count;
+            size_t pos = offset;
+
+            while (remaining > 0) {
+                size_t page_idx = pos / pmm::PAGE_SIZE;
+                size_t page_off = pos % pmm::PAGE_SIZE;
+                size_t chunk = pmm::PAGE_SIZE - page_off;
+                if (chunk > remaining) {
+                    chunk = remaining;
+                }
+
+                if (page_idx < s->m_page_count && s->m_pages[page_idx] != 0) {
+                    auto* src_page = static_cast<const uint8_t*>(
+                        paging::phys_to_virt(s->m_pages[page_idx]));
+                    string::memcpy(out, src_page + page_off, chunk);
+                } else {
+                    string::memset(out, 0, chunk);
+                }
+
+                out += chunk;
+                pos += chunk;
+                remaining -= chunk;
+            }
+
+            result = static_cast<ssize_t>(count);
+        }
+
+        sync::mutex_unlock(s->lock);
+    });
+    return result;
+}
+
+ssize_t shmem_write(shmem* s, size_t offset, const void* src, size_t count) {
+    if (!s || !src) {
+        return SHMEM_ERR_INVAL;
+    }
+
+    ssize_t result = 0;
+    RUN_ELEVATED({
+        sync::mutex_lock(s->lock);
+
+        if (offset >= s->m_size) {
+            result = 0;
+        } else {
+            if (offset + count > s->m_size) {
+                count = s->m_size - offset;
+            }
+
+            auto* in = static_cast<const uint8_t*>(src);
+            size_t remaining = count;
+            size_t pos = offset;
+
+            while (remaining > 0) {
+                size_t page_idx = pos / pmm::PAGE_SIZE;
+                size_t page_off = pos % pmm::PAGE_SIZE;
+                size_t chunk = pmm::PAGE_SIZE - page_off;
+                if (chunk > remaining) {
+                    chunk = remaining;
+                }
+
+                if (page_idx >= s->m_page_count || s->m_pages[page_idx] == 0) {
+                    break;
+                }
+
+                auto* dst_page = static_cast<uint8_t*>(
+                    paging::phys_to_virt(s->m_pages[page_idx]));
+                string::memcpy(dst_page + page_off, in, chunk);
+
+                in += chunk;
+                pos += chunk;
+                remaining -= chunk;
+            }
+
+            result = static_cast<ssize_t>(count - remaining);
+        }
+
+        sync::mutex_unlock(s->lock);
+    });
+    return result;
+}
+
+} // namespace mm
diff --git a/kernel/mm/shmem.h b/kernel/mm/shmem.h
new file mode 100644
index 0000000..a1ec696
--- /dev/null
+++ b/kernel/mm/shmem.h
@@ -0,0 +1,73 @@
+#ifndef STELLUX_MM_SHMEM_H
+#define STELLUX_MM_SHMEM_H
+
+#include "common/types.h"
+#include "mm/pmm_types.h"
+#include "rc/ref_counted.h"
+#include "sync/mutex.h"
+
+namespace mm {
+
+constexpr int32_t SHMEM_OK = 0;
+constexpr int32_t SHMEM_ERR_INVAL = -1;
+constexpr int32_t SHMEM_ERR_NO_MEM = -2;
+
+/**
+ * Ref-counted shared memory backing object.
+ * + * Holds an array of physical pages that can be mapped into multiple + * mm_contexts simultaneously. Pages are freed only in ref_destroy + * when the last reference is released. + * + * Lock order: when holding both mm_ctx->lock and shmem->lock, + * always acquire mm_ctx->lock first. + */ +struct shmem final : rc::ref_counted { + pmm::phys_addr_t* m_pages; + size_t m_page_count; + size_t m_capacity; + size_t m_size; + sync::mutex lock; + + static void ref_destroy(shmem* self); +}; + +/** + * @brief Create a new shmem with the given initial size. + * Pages are allocated and zeroed. Size 0 is valid (no pages allocated). + * @return New shmem on success, nullptr on failure. + */ +[[nodiscard]] shmem* shmem_create(size_t initial_size); + +/** + * @brief Resize the shmem backing. + * Grow: allocate and zero new pages. + * Shrink: update m_size and m_page_count only; do NOT free tail pages + * (they may still be mapped). Tail pages are freed in ref_destroy. + * Caller must hold s->lock. + */ +int32_t shmem_resize_locked(shmem* s, size_t new_size); + +/** + * @brief Get the physical address of a page in the shmem. + * Returns 0 if page_index >= m_page_count (hole). + * Caller must hold s->lock. + */ +[[nodiscard]] pmm::phys_addr_t shmem_get_page_locked( + shmem* s, size_t page_index); + +/** + * @brief Read from shmem at the given byte offset. + * @return Number of bytes read, or negative error. + */ +ssize_t shmem_read(shmem* s, size_t offset, void* dst, size_t count); + +/** + * @brief Write to shmem at the given byte offset. + * @return Number of bytes written, or negative error. 
+ */ +ssize_t shmem_write(shmem* s, size_t offset, const void* src, size_t count); + +} // namespace mm + +#endif // STELLUX_MM_SHMEM_H diff --git a/kernel/mm/vma.cpp b/kernel/mm/vma.cpp index 42b089c..85df170 100644 --- a/kernel/mm/vma.cpp +++ b/kernel/mm/vma.cpp @@ -4,13 +4,14 @@ #include "mm/heap.h" #include "mm/paging.h" #include "mm/pmm.h" +#include "mm/shmem.h" namespace mm { namespace { constexpr uint32_t MM_MAP_ALLOWED_FLAGS = - MM_MAP_PRIVATE | MM_MAP_ANONYMOUS | MM_MAP_FIXED | + MM_MAP_SHARED | MM_MAP_PRIVATE | MM_MAP_ANONYMOUS | MM_MAP_FIXED | MM_MAP_FIXED_NOREPLACE | MM_MAP_STACK; inline bool is_page_aligned(uintptr_t value) { @@ -54,9 +55,19 @@ inline paging::page_flags_t prot_to_page_flags(uint32_t prot) { } inline bool vma_can_merge(const vma& left, const vma& right) { - return left.end == right.start && - left.prot == right.prot && - left.flags == right.flags; + if (left.end != right.start || left.prot != right.prot || + left.flags != right.flags) { + return false; + } + if ((left.flags & VMA_FLAG_SHARED) || (right.flags & VMA_FLAG_SHARED)) { + if (left.shmem_backing.ptr() != right.shmem_backing.ptr()) { + return false; + } + if (left.backing_offset + (left.end - left.start) != right.backing_offset) { + return false; + } + } + return true; } vma* alloc_vma(uintptr_t start, uintptr_t end, uint32_t prot, uint32_t flags) { @@ -70,6 +81,7 @@ vma* alloc_vma(uintptr_t start, uintptr_t end, uint32_t prot, uint32_t flags) { node->prot = prot; node->flags = flags; node->addr_link = {}; + node->backing_offset = 0; return node; } @@ -93,6 +105,15 @@ __PRIVILEGED_CODE void unmap_and_free_pages(mm_context* mm_ctx, uintptr_t start, } } +__PRIVILEGED_CODE void unmap_pages_only(mm_context* mm_ctx, uintptr_t start, uintptr_t end) { + for (uintptr_t vaddr = start; vaddr < end; vaddr += pmm::PAGE_SIZE) { + if (!paging::is_mapped(vaddr, mm_ctx->pt_root)) { + continue; + } + paging::unmap_page(vaddr, mm_ctx->pt_root); + } +} + __PRIVILEGED_CODE void 
rollback_new_pages(mm_context* mm_ctx, uintptr_t start, uintptr_t mapped_end) { unmap_and_free_pages(mm_ctx, start, mapped_end); } @@ -124,6 +145,9 @@ vma* split_vma_locked(mm_context* mm_ctx, vma* node, uintptr_t split_addr) { return nullptr; } + right->shmem_backing = node->shmem_backing; + right->backing_offset = node->backing_offset + (split_addr - node->start); + uintptr_t old_end = node->end; node->end = split_addr; if (!vma_insert_locked(mm_ctx, right)) { @@ -155,7 +179,11 @@ __PRIVILEGED_CODE int32_t unmap_range_locked(mm_context* mm_ctx, uintptr_t start } } - unmap_and_free_pages(mm_ctx, overlap->start, overlap->end); + if (overlap->flags & VMA_FLAG_SHARED) { + unmap_pages_only(mm_ctx, overlap->start, overlap->end); + } else { + unmap_and_free_pages(mm_ctx, overlap->start, overlap->end); + } mm_ctx->vmas.remove(*overlap); free_vma(overlap); } @@ -200,7 +228,11 @@ __PRIVILEGED_CODE void mm_context::ref_destroy(mm_context* self) { sync::mutex_lock(self->lock); while (vma* node = self->vmas.min()) { - unmap_and_free_pages(self, node->start, node->end); + if (node->flags & VMA_FLAG_SHARED) { + unmap_pages_only(self, node->start, node->end); + } else { + unmap_and_free_pages(self, node->start, node->end); + } self->vmas.remove(*node); free_vma(node); } @@ -656,6 +688,138 @@ __PRIVILEGED_CODE int32_t mm_context_mprotect( return MM_CTX_OK; } +/** + * @note Privilege: **required** + */ +__PRIVILEGED_CODE int32_t mm_context_map_shared( + mm_context* mm_ctx, + shmem* backing, + uint64_t offset, + size_t length, + uint32_t prot, + uint32_t map_flags, + uintptr_t addr, + uintptr_t* out_addr +) { + if (!mm_ctx || !backing || !out_addr) { + return MM_CTX_ERR_INVALID_ARG; + } + if ((prot & ~MM_PROT_MASK) != 0) { + return MM_CTX_ERR_INVALID_ARG; + } + if (!(map_flags & MM_MAP_SHARED)) { + return MM_CTX_ERR_INVALID_ARG; + } + + size_t aligned_len = pmm::page_align_up(length); + if (aligned_len == 0) { + return MM_CTX_ERR_INVALID_ARG; + } + if (offset % pmm::PAGE_SIZE != 0) 
{
+        return MM_CTX_ERR_INVALID_ARG;
+    }
+
+    const bool fixed = (map_flags & (MM_MAP_FIXED | MM_MAP_FIXED_NOREPLACE)) != 0;
+    const bool no_replace = (map_flags & MM_MAP_FIXED_NOREPLACE) != 0;
+
+    uintptr_t start = 0;
+    uintptr_t end = 0;
+
+    sync::mutex_lock(mm_ctx->lock);
+
+    if (fixed) {
+        if (!is_page_aligned(addr)) {
+            sync::mutex_unlock(mm_ctx->lock);
+            return MM_CTX_ERR_INVALID_ARG;
+        }
+        start = addr;
+        if (!range_from_len(start, aligned_len, end)) {
+            sync::mutex_unlock(mm_ctx->lock);
+            return MM_CTX_ERR_INVALID_ARG;
+        }
+        if (start < mm_ctx->mmap_base || end > mm_ctx->mmap_end) {
+            sync::mutex_unlock(mm_ctx->lock);
+            return MM_CTX_ERR_NO_VIRT;
+        }
+
+        if (no_replace && vma_find_overlap_locked(mm_ctx, start, end)) {
+            sync::mutex_unlock(mm_ctx->lock);
+            return MM_CTX_ERR_EXISTS;
+        }
+        if (!no_replace) {
+            int32_t rc = unmap_range_locked(mm_ctx, start, end);
+            if (rc != MM_CTX_OK) {
+                sync::mutex_unlock(mm_ctx->lock);
+                return rc;
+            }
+        }
+    } else {
+        start = vma_find_gap_topdown_locked(mm_ctx, aligned_len);
+        if (start == 0) {
+            sync::mutex_unlock(mm_ctx->lock);
+            return MM_CTX_ERR_NO_VIRT;
+        }
+        end = start + aligned_len;
+    }
+
+    sync::mutex_lock(backing->lock);
+
+    size_t backed_size = backing->m_page_count * pmm::PAGE_SIZE;
+    if (aligned_len > backed_size || offset > backed_size - aligned_len) {
+        sync::mutex_unlock(backing->lock);
+        sync::mutex_unlock(mm_ctx->lock);
+        return MM_CTX_ERR_INVALID_ARG;
+    }
+
+    paging::page_flags_t page_flags = prot_to_page_flags(prot);
+    size_t pages = aligned_len / pmm::PAGE_SIZE;
+    size_t page_offset = static_cast<size_t>(offset / pmm::PAGE_SIZE);
+
+    for (size_t i = 0; i < pages; i++) {
+        pmm::phys_addr_t phys = shmem_get_page_locked(backing, page_offset + i);
+        if (phys == 0) {
+            unmap_pages_only(mm_ctx, start, start + i * pmm::PAGE_SIZE);
+            sync::mutex_unlock(backing->lock);
+            sync::mutex_unlock(mm_ctx->lock);
+            return MM_CTX_ERR_NO_MEM;
+        }
+
+        uintptr_t vaddr = start + i * pmm::PAGE_SIZE;
+        if (paging::map_page(vaddr, phys, page_flags, mm_ctx->pt_root) != paging::OK) {
+            unmap_pages_only(mm_ctx, start, vaddr);
+            sync::mutex_unlock(backing->lock);
+            sync::mutex_unlock(mm_ctx->lock);
+            return MM_CTX_ERR_MAP_FAILED;
+        }
+    }
+
+    sync::mutex_unlock(backing->lock);
+
+    vma* node = alloc_vma(start, end, prot, VMA_FLAG_SHARED);
+    if (!node) {
+        unmap_pages_only(mm_ctx, start, end);
+        sync::mutex_unlock(mm_ctx->lock);
+        return MM_CTX_ERR_NO_MEM;
+    }
+
+    backing->add_ref();
+    node->shmem_backing = rc::strong_ref<shmem>::adopt(backing);
+    node->backing_offset = offset;
+
+    if (!vma_insert_locked(mm_ctx, node)) {
+        unmap_pages_only(mm_ctx, start, end);
+        free_vma(node);
+        sync::mutex_unlock(mm_ctx->lock);
+        return MM_CTX_ERR_EXISTS;
+    }
+
+    coalesce_all_locked(mm_ctx);
+    sync::mutex_unlock(mm_ctx->lock);
+
+    *out_addr = start;
+    return MM_CTX_OK;
+}
+
 /**
  * @note Privilege: **required**
  */
diff --git a/kernel/mm/vma.h b/kernel/mm/vma.h
index e40dcf0..b57a42b 100644
--- a/kernel/mm/vma.h
+++ b/kernel/mm/vma.h
@@ -4,8 +4,10 @@
 #include "common/types.h"
 #include "common/rb_tree.h"
 #include "mm/pmm_types.h"
+#include "mm/shmem.h"
 #include "sync/mutex.h"
 #include "rc/ref_counted.h"
+#include "rc/strong_ref.h"
 
 namespace mm {
 
@@ -22,6 +24,7 @@ constexpr uint32_t MM_PROT_WRITE = (1u << 1);
 constexpr uint32_t MM_PROT_EXEC = (1u << 2);
 constexpr uint32_t MM_PROT_MASK = MM_PROT_READ | MM_PROT_WRITE | MM_PROT_EXEC;
 
+constexpr uint32_t MM_MAP_SHARED = 0x00000001u;
 constexpr uint32_t MM_MAP_PRIVATE = 0x00000002u;
 constexpr uint32_t MM_MAP_FIXED = 0x00000010u;
 constexpr uint32_t MM_MAP_ANONYMOUS = 0x00000020u;
@@ -32,6 +35,7 @@ constexpr uint32_t VMA_FLAG_PRIVATE = (1u << 0);
 constexpr uint32_t VMA_FLAG_ANONYMOUS = (1u << 1);
 constexpr uint32_t VMA_FLAG_ELF = (1u << 2);
 constexpr uint32_t VMA_FLAG_STACK = (1u << 3);
+constexpr uint32_t VMA_FLAG_SHARED = (1u << 4);
 
 constexpr uintptr_t MMAP_BASE_DEFAULT = 0x00000080000000ULL;
 constexpr uintptr_t USER_STACK_TOP = 0x00007FFFFFF00000ULL;
@@ -44,6 +48,8 @@ struct vma {
     uint32_t prot;
     uint32_t flags;
     rbt::node addr_link;
+    rc::strong_ref<shmem> shmem_backing;
+    uint64_t backing_offset;
 };
 
 struct vma_addr_cmp {
@@ -179,6 +185,30 @@ __PRIVILEGED_CODE int32_t mm_context_mprotect(
     uint32_t prot
 );
 
+/**
+ * @brief Map a shmem backing into a user mm_context with MAP_SHARED semantics.
+ *        Pages come from the backing; they are not allocated per-mapping.
+ * @param backing Shmem backing. Must have sufficient size for offset+length.
+ * @param offset Byte offset into backing (must be page-aligned).
+ * @param length Number of bytes to map (rounded up to page boundary).
+ * @param prot MM_PROT_READ / MM_PROT_WRITE / MM_PROT_EXEC.
+ * @param map_flags MM_MAP_SHARED, optionally MM_MAP_FIXED / MM_MAP_FIXED_NOREPLACE.
+ * @param addr Hint or fixed address.
+ * @param out_addr Receives the mapped virtual address.
+ * @return MM_CTX_OK on success, error code on failure.
+ * @note Privilege: **required**
+ */
+__PRIVILEGED_CODE int32_t mm_context_map_shared(
+    mm_context* mm_ctx,
+    shmem* backing,
+    uint64_t offset,
+    size_t length,
+    uint32_t prot,
+    uint32_t map_flags,
+    uintptr_t addr,
+    uintptr_t* out_addr
+);
+
 /**
  * @brief Return current VMA count.
* @note Privilege: **required** diff --git a/kernel/resource/providers/file_provider.cpp b/kernel/resource/providers/file_provider.cpp index d20d22d..b9b3c7d 100644 --- a/kernel/resource/providers/file_provider.cpp +++ b/kernel/resource/providers/file_provider.cpp @@ -116,4 +116,15 @@ __PRIVILEGED_CODE int32_t open_file_resource( return OK; } +/** + * @note Privilege: **required** + */ +__PRIVILEGED_CODE fs::file* get_file(resource_object* obj) { + if (!obj || obj->type != resource_type::FILE || !obj->impl) { + return nullptr; + } + auto* impl = static_cast(obj->impl); + return impl->file; +} + } // namespace resource::file_provider diff --git a/kernel/resource/providers/file_provider.h b/kernel/resource/providers/file_provider.h index aa1d9cd..2decc8f 100644 --- a/kernel/resource/providers/file_provider.h +++ b/kernel/resource/providers/file_provider.h @@ -3,6 +3,8 @@ #include "resource/resource.h" +namespace fs { class file; } + namespace resource::file_provider { /** @@ -16,6 +18,13 @@ __PRIVILEGED_CODE int32_t open_file_resource( resource_object** out_obj ); +/** + * @brief Get the fs::file from a FILE resource_object. + * Returns nullptr if obj is not a FILE resource or has no impl. 
+ * @note Privilege: **required**
+ */
+[[nodiscard]] __PRIVILEGED_CODE fs::file* get_file(resource_object* obj);
+
 } // namespace resource::file_provider
 
 #endif // STELLUX_RESOURCE_PROVIDERS_FILE_PROVIDER_H
diff --git a/kernel/resource/providers/shm_provider.cpp b/kernel/resource/providers/shm_provider.cpp
new file mode 100644
index 0000000..401eba9
--- /dev/null
+++ b/kernel/resource/providers/shm_provider.cpp
@@ -0,0 +1,192 @@
+#include "resource/providers/shm_provider.h"
+#include "resource/providers/shmem_resource_provider.h"
+#include "common/hash.h"
+#include "common/hashmap.h"
+#include "common/string.h"
+#include "fs/fstypes.h"
+#include "mm/heap.h"
+#include "mm/shmem.h"
+#include "rc/strong_ref.h"
+#include "sync/mutex.h"
+#include "sync/spinlock.h"
+
+namespace resource::shm_provider {
+
+namespace {
+
+constexpr size_t SHM_PREFIX_LEN = 9; // strlen("/dev/shm/")
+constexpr size_t SHM_REGISTRY_BUCKETS = 32;
+
+struct shm_entry {
+    char name[fs::NAME_MAX + 1];
+    rc::strong_ref<mm::shmem> backing;
+    hashmap::node hash_link;
+};
+
+struct shm_key_ops {
+    using key_type = const char*;
+    static key_type key_of(const shm_entry& e) { return e.name; }
+    static uint64_t hash(const key_type& k) { return hash::string(k); }
+    static bool equal(const key_type& a, const key_type& b) {
+        return string::strcmp(a, b) == 0;
+    }
+};
+
+using shm_map = hashmap::map<shm_entry, shm_key_ops>;
+
+__PRIVILEGED_DATA sync::spinlock g_shm_lock = sync::SPINLOCK_INIT;
+__PRIVILEGED_BSS hashmap::bucket g_shm_buckets[SHM_REGISTRY_BUCKETS];
+__PRIVILEGED_BSS shm_map g_shm_registry;
+__PRIVILEGED_BSS bool g_shm_inited;
+
+void ensure_init() {
+    if (!g_shm_inited) {
+        g_shm_registry.init(g_shm_buckets, SHM_REGISTRY_BUCKETS);
+        g_shm_inited = true;
+    }
+}
+
+bool extract_shm_name(
+    const char* path, const char** out_name, size_t* out_len
+) {
+    if (string::strncmp(path, "/dev/shm", 8) != 0) {
+        return false;
+    }
+    if (path[8] != '/' && path[8] != '\0') {
+        return false;
+    }
+    if (path[8] == '\0') {
+        return false;
+    }
+
+    const char* name = path + SHM_PREFIX_LEN;
+    size_t len = 0;
+    while (name[len] != '\0' && name[len] != '/') {
+        len++;
+    }
+    if (name[len] == '/') {
+        return false;
+    }
+    if (len == 0) {
+        return false;
+    }
+    if (len > fs::NAME_MAX) {
+        return false;
+    }
+
+    *out_name = name;
+    *out_len = len;
+    return true;
+}
+
+} // namespace
+
+bool is_shm_path(const char* path) {
+    if (!path) {
+        return false;
+    }
+    if (string::strncmp(path, "/dev/shm", 8) != 0) {
+        return false;
+    }
+    return path[8] == '/' || path[8] == '\0';
+}
+
+/**
+ * @note Privilege: **required**
+ */
+__PRIVILEGED_CODE int32_t open_shm_resource(
+    const char* path,
+    uint32_t flags,
+    resource_object** out_obj
+) {
+    if (!path || !out_obj) {
+        return ERR_INVAL;
+    }
+
+    const char* name = nullptr;
+    size_t name_len = 0;
+    if (!extract_shm_name(path, &name, &name_len)) {
+        return ERR_INVAL;
+    }
+
+    bool create = (flags & fs::O_CREAT) != 0;
+    bool excl = create && (flags & fs::O_EXCL) != 0;
+
+    rc::strong_ref<mm::shmem> backing_ref;
+    bool need_trunc = false;
+
+    {
+        sync::irq_lock_guard guard(g_shm_lock);
+        ensure_init();
+
+        shm_entry* existing = g_shm_registry.find(name);
+
+        if (existing) {
+            if (excl) {
+                return ERR_EXIST;
+            }
+            backing_ref = existing->backing;
+            need_trunc = (flags & fs::O_TRUNC) != 0;
+        } else if (!create) {
+            return ERR_NOENT;
+        } else {
+            mm::shmem* raw = mm::shmem_create(0);
+            if (!raw) {
+                return ERR_NOMEM;
+            }
+
+            auto* entry = heap::kalloc_new<shm_entry>();
+            if (!entry) {
+                mm::shmem::ref_destroy(raw);
+                return ERR_NOMEM;
+            }
+
+            string::memcpy(entry->name, name, name_len);
+            entry->name[name_len] = '\0';
+            entry->backing = rc::strong_ref<mm::shmem>::adopt(raw);
+            entry->hash_link = {};
+
+            g_shm_registry.insert(entry);
+
+            backing_ref = entry->backing;
+        }
+    }
+
+    if (need_trunc) {
+        sync::mutex_lock(backing_ref->lock);
+        mm::shmem_resize_locked(backing_ref.ptr(), 0);
+        sync::mutex_unlock(backing_ref->lock);
+    }
+
+    return shmem_resource_provider::create_shmem_resource_with_backing(
+        backing_ref.ptr(), flags, out_obj);
+}
+
+/**
+ * @note Privilege: **required**
+ */
+__PRIVILEGED_CODE int32_t unlink_shm(const char* path) {
+    if (!path) {
+        return ERR_INVAL;
+    }
+
+    const char* name = nullptr;
+    size_t name_len = 0;
+    if (!extract_shm_name(path, &name, &name_len)) {
+        return ERR_INVAL;
+    }
+
+    sync::irq_lock_guard guard(g_shm_lock);
+    ensure_init();
+
+    shm_entry* entry = g_shm_registry.find(name);
+    if (!entry) {
+        return ERR_NOENT;
+    }
+
+    g_shm_registry.remove(*entry);
+    heap::kfree_delete(entry);
+    return OK;
+}
+
+} // namespace resource::shm_provider
diff --git a/kernel/resource/providers/shm_provider.h b/kernel/resource/providers/shm_provider.h
new file mode 100644
index 0000000..55e8b40
--- /dev/null
+++ b/kernel/resource/providers/shm_provider.h
@@ -0,0 +1,35 @@
+#ifndef STELLUX_RESOURCE_PROVIDERS_SHM_PROVIDER_H
+#define STELLUX_RESOURCE_PROVIDERS_SHM_PROVIDER_H
+
+#include "resource/resource.h"
+
+namespace resource::shm_provider {
+
+/**
+ * @brief Check if a path is a /dev/shm path.
+ * @return true if path starts with "/dev/shm/" or equals "/dev/shm".
+ */
+bool is_shm_path(const char* path);
+
+/**
+ * @brief Open or create a named shared memory resource.
+ *        Path must be "/dev/shm/<name>"; name is extracted and used as key.
+ * @note Privilege: **required**
+ */
+__PRIVILEGED_CODE int32_t open_shm_resource(
+    const char* path,
+    uint32_t flags,
+    resource_object** out_obj
+);
+
+/**
+ * @brief Unlink a named shared memory object.
+ *        Removes the name from the registry; backing persists until
+ *        all fds are closed and all mappings are unmapped.
+ * @note Privilege: **required**
+ */
+__PRIVILEGED_CODE int32_t unlink_shm(const char* path);
+
+} // namespace resource::shm_provider
+
+#endif // STELLUX_RESOURCE_PROVIDERS_SHM_PROVIDER_H
diff --git a/kernel/resource/providers/shmem_resource_provider.cpp b/kernel/resource/providers/shmem_resource_provider.cpp
new file mode 100644
index 0000000..5d3caf5
--- /dev/null
+++ b/kernel/resource/providers/shmem_resource_provider.cpp
@@ -0,0 +1,181 @@
+#include "resource/providers/shmem_resource_provider.h"
+#include "dynpriv/dynpriv.h"
+#include "mm/heap.h"
+#include "mm/shmem.h"
+#include "rc/strong_ref.h"
+
+namespace resource::shmem_resource_provider {
+
+struct shmem_resource_impl {
+    rc::strong_ref<mm::shmem> backing;
+    size_t offset;
+
+    ~shmem_resource_impl() = default;
+};
+
+static ssize_t shmem_resource_read(
+    resource_object* obj, void* kdst, size_t count, uint32_t flags
+) {
+    (void)flags;
+    if (!obj || !kdst) {
+        return ERR_INVAL;
+    }
+
+    ssize_t result = ERR_INVAL;
+    RUN_ELEVATED({
+        if (!obj->impl) {
+            result = ERR_INVAL;
+        } else {
+            auto* impl = static_cast<shmem_resource_impl*>(obj->impl);
+            if (impl->backing) {
+                result = mm::shmem_read(impl->backing.ptr(), impl->offset, kdst, count);
+                if (result > 0) {
+                    impl->offset += static_cast<size_t>(result);
+                }
+            }
+        }
+    });
+    return result;
+}
+
+static ssize_t shmem_resource_write(
+    resource_object* obj, const void* ksrc, size_t count, uint32_t flags
+) {
+    (void)flags;
+    if (!obj || !ksrc) {
+        return ERR_INVAL;
+    }
+
+    ssize_t result = ERR_INVAL;
+    RUN_ELEVATED({
+        if (!obj->impl) {
+            result = ERR_INVAL;
+        } else {
+            auto* impl = static_cast<shmem_resource_impl*>(obj->impl);
+            if (impl->backing) {
+                result = mm::shmem_write(impl->backing.ptr(), impl->offset, ksrc, count);
+                if (result > 0) {
+                    impl->offset += static_cast<size_t>(result);
+                }
+            }
+        }
+    });
+    return result;
+}
+
+static void shmem_resource_close(resource_object* obj) {
+    if (!obj) {
+        return;
+    }
+    RUN_ELEVATED({
+        if (obj->impl) {
+            auto* impl = static_cast<shmem_resource_impl*>(obj->impl);
+            heap::kfree_delete(impl);
+            obj->impl = nullptr;
+        }
+    });
+}
+
+static const resource_ops g_shmem_resource_ops = {
+    shmem_resource_read,
+    shmem_resource_write,
+    shmem_resource_close,
+};
+
+int32_t create_shmem_resource(
+    uint32_t flags,
+    resource_object** out_obj
+) {
+    (void)flags;
+    if (!out_obj) {
+        return ERR_INVAL;
+    }
+
+    mm::shmem* backing = mm::shmem_create(0);
+    if (!backing) {
+        return ERR_NOMEM;
+    }
+
+    int32_t result = OK;
+    resource_object* obj = nullptr;
+    shmem_resource_impl* impl = nullptr;
+    bool backing_still_owned = true;
+    RUN_ELEVATED({
+        impl = heap::kalloc_new<shmem_resource_impl>();
+        if (!impl) {
+            result = ERR_NOMEM;
+        } else {
+            impl->backing = rc::strong_ref<mm::shmem>::adopt(backing);
+            impl->offset = 0;
+            backing_still_owned = false;
+
+            obj = heap::kalloc_new<resource_object>();
+            if (!obj) {
+                heap::kfree_delete(impl);
+                impl = nullptr;
+                result = ERR_NOMEM;
+            }
+        }
+
+        if (result == OK) {
+            obj->type = resource_type::SHMEM;
+            obj->ops = &g_shmem_resource_ops;
+            obj->impl = impl;
+            *out_obj = obj;
+        }
+    });
+
+    if (result != OK && backing_still_owned) {
+        mm::shmem::ref_destroy(backing);
+    }
+    return result;
+}
+
+int32_t create_shmem_resource_with_backing(
+    mm::shmem* backing,
+    uint32_t flags,
+    resource_object** out_obj
+) {
+    (void)flags;
+    if (!backing || !out_obj) {
+        return ERR_INVAL;
+    }
+
+    int32_t result = OK;
+    resource_object* obj = nullptr;
+    RUN_ELEVATED({
+        auto* impl = heap::kalloc_new<shmem_resource_impl>();
+        if (!impl) {
+            result = ERR_NOMEM;
+        } else {
+            backing->add_ref();
+            impl->backing = rc::strong_ref<mm::shmem>::adopt(backing);
+            impl->offset = 0;
+
+            obj = heap::kalloc_new<resource_object>();
+            if (!obj) {
+                heap::kfree_delete(impl);
+                result = ERR_NOMEM;
+            } else {
+                obj->type = resource_type::SHMEM;
+                obj->ops = &g_shmem_resource_ops;
+                obj->impl = impl;
+                *out_obj = obj;
+            }
+        }
+    });
+    return result;
+}
+
+mm::shmem* get_shmem_backing(resource_object* obj) {
+    mm::shmem* backing = nullptr;
+    RUN_ELEVATED({
+        if (obj && obj->type == resource_type::SHMEM && obj->impl) {
+            auto* impl = static_cast<shmem_resource_impl*>(obj->impl);
+            backing = impl->backing.ptr();
+        }
+    });
+    return backing;
+}
+
+} // namespace resource::shmem_resource_provider
diff --git a/kernel/resource/providers/shmem_resource_provider.h b/kernel/resource/providers/shmem_resource_provider.h
new file mode 100644
index 0000000..24b181c
--- /dev/null
+++ b/kernel/resource/providers/shmem_resource_provider.h
@@ -0,0 +1,36 @@
+#ifndef STELLUX_RESOURCE_PROVIDERS_SHMEM_RESOURCE_PROVIDER_H
+#define STELLUX_RESOURCE_PROVIDERS_SHMEM_RESOURCE_PROVIDER_H
+
+#include "resource/resource.h"
+#include "mm/shmem.h"
+
+namespace resource::shmem_resource_provider {
+
+/**
+ * @brief Create a SHMEM resource backed by a new shmem (size 0).
+ *        The returned resource_object has one owned reference.
+ */
+int32_t create_shmem_resource(
+    uint32_t flags,
+    resource_object** out_obj
+);
+
+/**
+ * @brief Create a SHMEM resource backed by an existing shmem.
+ *        Adds a reference to the backing. Used by shm_provider.
+ */
+int32_t create_shmem_resource_with_backing(
+    mm::shmem* backing,
+    uint32_t flags,
+    resource_object** out_obj
+);
+
+/**
+ * @brief Get the shmem backing from a SHMEM resource_object.
+ *        Returns nullptr if obj is not SHMEM or has no impl.
+ */ +[[nodiscard]] mm::shmem* get_shmem_backing(resource_object* obj); + +} // namespace resource::shmem_resource_provider + +#endif // STELLUX_RESOURCE_PROVIDERS_SHMEM_RESOURCE_PROVIDER_H diff --git a/kernel/resource/resource.cpp b/kernel/resource/resource.cpp index dd89c84..e61b519 100644 --- a/kernel/resource/resource.cpp +++ b/kernel/resource/resource.cpp @@ -1,5 +1,6 @@ #include "resource/resource.h" #include "resource/providers/file_provider.h" +#include "resource/providers/shm_provider.h" #include "sched/task.h" #include "fs/fstypes.h" #include "mm/heap.h" @@ -58,7 +59,7 @@ __PRIVILEGED_CODE static bool valid_open_flags(uint32_t flags) { } __PRIVILEGED_CODE static uint32_t normalize_open_flags(uint32_t flags) { - return flags & (fs::ACCESS_MODE_MASK | fs::O_CREAT | fs::O_TRUNC | fs::O_APPEND); + return flags & (fs::ACCESS_MODE_MASK | fs::O_CREAT | fs::O_EXCL | fs::O_TRUNC | fs::O_APPEND); } __PRIVILEGED_CODE static uint32_t rights_from_open_flags(uint32_t flags) { @@ -91,13 +92,22 @@ __PRIVILEGED_CODE int32_t open( uint32_t fs_flags = normalize_open_flags(flags); resource_object* obj = nullptr; - int32_t rc = file_provider::open_file_resource(kpath, fs_flags, &obj); + int32_t rc; + resource_type rtype; + + if (shm_provider::is_shm_path(kpath)) { + rc = shm_provider::open_shm_resource(kpath, fs_flags, &obj); + rtype = resource_type::SHMEM; + } else { + rc = file_provider::open_file_resource(kpath, fs_flags, &obj); + rtype = resource_type::FILE; + } if (rc != OK) { return rc; } uint32_t rights = rights_from_open_flags(fs_flags); - rc = alloc_handle(&owner->handles, obj, resource_type::FILE, rights, out_handle); + rc = alloc_handle(&owner->handles, obj, rtype, rights, out_handle); if (rc != HANDLE_OK) { resource_release(obj); return (rc == HANDLE_ERR_NOSPC) ? 
ERR_TABLEFULL : ERR_IO; diff --git a/kernel/resource/resource.h b/kernel/resource/resource.h index c618d58..5707b38 100644 --- a/kernel/resource/resource.h +++ b/kernel/resource/resource.h @@ -50,6 +50,7 @@ constexpr int32_t ERR_CONNREFUSED = -13; constexpr int32_t ERR_ADDRINUSE = -14; constexpr int32_t ERR_ISCONN = -15; constexpr int32_t ERR_AGAIN = -16; +constexpr int32_t ERR_EXIST = -17; /** * @brief Initialize handle table storage in task. diff --git a/kernel/resource/resource_types.h b/kernel/resource/resource_types.h index 75e684e..f8f60fb 100644 --- a/kernel/resource/resource_types.h +++ b/kernel/resource/resource_types.h @@ -9,6 +9,7 @@ enum class resource_type : uint16_t { UNKNOWN = 0, FILE = 1, SOCKET = 2, + SHMEM = 3, }; using handle_t = int32_t; diff --git a/kernel/syscall/handlers/sys_fd.cpp b/kernel/syscall/handlers/sys_fd.cpp index 8a44c29..c3d707b 100644 --- a/kernel/syscall/handlers/sys_fd.cpp +++ b/kernel/syscall/handlers/sys_fd.cpp @@ -1,10 +1,12 @@ #include "syscall/handlers/sys_fd.h" #include "resource/resource.h" +#include "resource/providers/shm_provider.h" #include "sched/sched.h" #include "sched/task.h" #include "mm/uaccess.h" #include "mm/heap.h" +#include "fs/fs.h" #include "fs/fstypes.h" namespace { @@ -43,12 +45,46 @@ inline int64_t map_resource_error(int64_t rc) { return syscall::EISCONN; case resource::ERR_AGAIN: return syscall::EAGAIN; + case resource::ERR_EXIST: + return syscall::EEXIST; case resource::ERR_IO: default: return syscall::EIO; } } +inline int64_t map_fs_error(int32_t rc) { + switch (rc) { + case fs::ERR_NOENT: + return syscall::ENOENT; + case fs::ERR_EXIST: + return syscall::EEXIST; + case fs::ERR_NOTDIR: + return syscall::ENOTDIR; + case fs::ERR_ISDIR: + return syscall::EISDIR; + case fs::ERR_NOMEM: + return syscall::ENOMEM; + case fs::ERR_INVAL: + return syscall::EINVAL; + case fs::ERR_NAMETOOLONG: + return syscall::ENAMETOOLONG; + case fs::ERR_NOTEMPTY: + return syscall::ENOTEMPTY; + case fs::ERR_NOSYS: + return 
syscall::ENOSYS; + case fs::ERR_BUSY: + return syscall::EBUSY; + case fs::ERR_LOOP: + return syscall::ELOOP; + case fs::ERR_BADF: + return syscall::EBADF; + case fs::ERR_IO: + default: + return syscall::EIO; + } +} + int64_t do_open_common(int64_t dirfd, uint64_t pathname, uint64_t flags, uint64_t mode) { (void)mode; @@ -265,3 +301,40 @@ DEFINE_SYSCALL3(fcntl, fd, cmd, arg) { return syscall::EINVAL; } + +DEFINE_SYSCALL3(unlinkat, dirfd, pathname, flags_val) { + (void)flags_val; + + if (static_cast(dirfd) != AT_FDCWD) { + return syscall::EINVAL; + } + + char kpath[fs::PATH_MAX]; + int32_t copy_rc = mm::uaccess::copy_cstr_from_user( + kpath, sizeof(kpath), + reinterpret_cast(pathname)); + if (copy_rc != mm::uaccess::OK) { + if (copy_rc == mm::uaccess::ERR_NAMETOOLONG) { + return syscall::ENAMETOOLONG; + } + return syscall::EFAULT; + } + + if (kpath[0] != '/') { + return syscall::EINVAL; + } + + if (resource::shm_provider::is_shm_path(kpath)) { + int32_t rc = resource::shm_provider::unlink_shm(kpath); + if (rc != resource::OK) { + return map_resource_error(rc); + } + return 0; + } + + int32_t rc = fs::unlink(kpath); + if (rc != fs::OK) { + return map_fs_error(rc); + } + return 0; +} diff --git a/kernel/syscall/handlers/sys_fd.h b/kernel/syscall/handlers/sys_fd.h index 7491868..39e67fb 100644 --- a/kernel/syscall/handlers/sys_fd.h +++ b/kernel/syscall/handlers/sys_fd.h @@ -9,5 +9,6 @@ DECLARE_SYSCALL(read); DECLARE_SYSCALL(write); DECLARE_SYSCALL(close); DECLARE_SYSCALL(fcntl); +DECLARE_SYSCALL(unlinkat); #endif // STELLUX_SYSCALL_HANDLERS_SYS_FD_H diff --git a/kernel/syscall/handlers/sys_memfd.cpp b/kernel/syscall/handlers/sys_memfd.cpp new file mode 100644 index 0000000..3b6618f --- /dev/null +++ b/kernel/syscall/handlers/sys_memfd.cpp @@ -0,0 +1,126 @@ +#include "syscall/handlers/sys_memfd.h" +#include "resource/providers/shmem_resource_provider.h" +#include "resource/providers/file_provider.h" +#include "resource/handle_table.h" +#include "mm/shmem.h" +#include 
"mm/uaccess.h" +#include "fs/file.h" +#include "fs/fs.h" +#include "fs/node.h" +#include "sched/sched.h" +#include "sched/task.h" + +namespace { + +constexpr size_t MEMFD_NAME_MAX = 249; + +inline int64_t map_fs_truncate_error(int32_t rc) { + switch (rc) { + case fs::ERR_INVAL: + return syscall::EINVAL; + case fs::ERR_NOMEM: + return syscall::ENOMEM; + case fs::ERR_NOSYS: + return syscall::EINVAL; + default: + return syscall::EIO; + } +} +constexpr uint32_t MFD_CLOEXEC = 0x0001u; +constexpr uint32_t MFD_ALLOWED = MFD_CLOEXEC; + +} // namespace + +DEFINE_SYSCALL2(memfd_create, u_name, u_flags) { + uint32_t flags = static_cast(u_flags); + if (flags & ~MFD_ALLOWED) { + return syscall::EINVAL; + } + + char kname[MEMFD_NAME_MAX + 1]; + if (u_name != 0) { + int32_t rc = mm::uaccess::copy_cstr_from_user( + kname, sizeof(kname), + reinterpret_cast(u_name)); + if (rc == mm::uaccess::ERR_NAMETOOLONG) { + return syscall::ENAMETOOLONG; + } + if (rc != mm::uaccess::OK) { + return syscall::EFAULT; + } + } + + sched::task* task = sched::current(); + if (!task) { + return syscall::ENOMEM; + } + + resource::resource_object* obj = nullptr; + int32_t rc = resource::shmem_resource_provider::create_shmem_resource(flags, &obj); + if (rc != resource::OK) { + return syscall::ENOMEM; + } + + resource::handle_t handle = -1; + uint32_t rights = resource::RIGHT_READ | resource::RIGHT_WRITE; + rc = resource::alloc_handle( + &task->handles, obj, resource::resource_type::SHMEM, rights, &handle); + if (rc != resource::HANDLE_OK) { + resource::resource_release(obj); + return syscall::EMFILE; + } + + resource::resource_release(obj); + return static_cast(handle); +} + +DEFINE_SYSCALL2(ftruncate, fd_val, length) { + int32_t fd = static_cast(fd_val); + int64_t signed_len = static_cast(length); + if (signed_len < 0) { + return syscall::EINVAL; + } + size_t new_size = static_cast(signed_len); + + sched::task* task = sched::current(); + if (!task) { + return syscall::ENOMEM; + } + + 
resource::resource_object* obj = nullptr; + int32_t rc = resource::get_handle_object( + &task->handles, fd, resource::RIGHT_WRITE, &obj); + if (rc != resource::HANDLE_OK) { + return (rc == resource::HANDLE_ERR_ACCESS) ? syscall::EACCES : syscall::EBADF; + } + + if (obj->type == resource::resource_type::SHMEM) { + mm::shmem* backing = resource::shmem_resource_provider::get_shmem_backing(obj); + if (!backing) { + resource::resource_release(obj); + return syscall::EINVAL; + } + + sync::mutex_lock(backing->lock); + int32_t resize_rc = mm::shmem_resize_locked(backing, new_size); + sync::mutex_unlock(backing->lock); + + resource::resource_release(obj); + return (resize_rc != mm::SHMEM_OK) ? syscall::ENOMEM : 0; + } + + if (obj->type == resource::resource_type::FILE) { + fs::file* f = resource::file_provider::get_file(obj); + if (!f || !f->get_node()) { + resource::resource_release(obj); + return syscall::EINVAL; + } + + int32_t trunc_rc = f->get_node()->truncate(new_size); + resource::resource_release(obj); + return (trunc_rc != 0) ? 
map_fs_truncate_error(trunc_rc) : 0; + } + + resource::resource_release(obj); + return syscall::EINVAL; +} diff --git a/kernel/syscall/handlers/sys_memfd.h b/kernel/syscall/handlers/sys_memfd.h new file mode 100644 index 0000000..2b6fb67 --- /dev/null +++ b/kernel/syscall/handlers/sys_memfd.h @@ -0,0 +1,9 @@ +#ifndef STELLUX_SYSCALL_HANDLERS_SYS_MEMFD_H +#define STELLUX_SYSCALL_HANDLERS_SYS_MEMFD_H + +#include "syscall/syscall_table.h" + +DECLARE_SYSCALL(memfd_create); +DECLARE_SYSCALL(ftruncate); + +#endif // STELLUX_SYSCALL_HANDLERS_SYS_MEMFD_H diff --git a/kernel/syscall/handlers/sys_mmap.cpp b/kernel/syscall/handlers/sys_mmap.cpp index 68c2059..cd293a7 100644 --- a/kernel/syscall/handlers/sys_mmap.cpp +++ b/kernel/syscall/handlers/sys_mmap.cpp @@ -1,6 +1,9 @@ #include "syscall/handlers/sys_mmap.h" #include "mm/vma.h" +#include "mm/shmem.h" +#include "resource/resource.h" +#include "resource/providers/shmem_resource_provider.h" #include "sched/sched.h" #include "sched/task.h" @@ -11,14 +14,15 @@ constexpr uint64_t LINUX_PROT_WRITE = 0x2; constexpr uint64_t LINUX_PROT_EXEC = 0x4; constexpr uint64_t LINUX_PROT_MASK = LINUX_PROT_READ | LINUX_PROT_WRITE | LINUX_PROT_EXEC; +constexpr uint64_t LINUX_MAP_SHARED = 0x00000001; constexpr uint64_t LINUX_MAP_PRIVATE = 0x00000002; constexpr uint64_t LINUX_MAP_FIXED = 0x00000010; constexpr uint64_t LINUX_MAP_ANONYMOUS = 0x00000020; constexpr uint64_t LINUX_MAP_STACK = 0x00020000; constexpr uint64_t LINUX_MAP_FIXED_NOREPLACE = 0x00100000; constexpr uint64_t LINUX_MAP_ALLOWED_MASK = - LINUX_MAP_PRIVATE | LINUX_MAP_FIXED | LINUX_MAP_ANONYMOUS | - LINUX_MAP_STACK | LINUX_MAP_FIXED_NOREPLACE; + LINUX_MAP_SHARED | LINUX_MAP_PRIVATE | LINUX_MAP_FIXED | + LINUX_MAP_ANONYMOUS | LINUX_MAP_STACK | LINUX_MAP_FIXED_NOREPLACE; inline uint32_t linux_prot_to_mm(uint64_t prot) { uint32_t mm_prot = 0; @@ -30,13 +34,10 @@ inline uint32_t linux_prot_to_mm(uint64_t prot) { inline uint32_t linux_map_to_mm(uint64_t flags) { uint32_t mm_flags = 0; + 
if (flags & LINUX_MAP_SHARED) mm_flags |= mm::MM_MAP_SHARED; if (flags & LINUX_MAP_PRIVATE) mm_flags |= mm::MM_MAP_PRIVATE; if (flags & LINUX_MAP_FIXED) mm_flags |= mm::MM_MAP_FIXED; if (flags & LINUX_MAP_ANONYMOUS) mm_flags |= mm::MM_MAP_ANONYMOUS; - // Linux MAP_STACK is effectively a no-op hint for anonymous mappings. - // Keep it accepted at the syscall boundary, but do not propagate it into - // mm_context_map_anonymous where MM_MAP_STACK is reserved for internal - // kernel stack mappings. if (flags & LINUX_MAP_FIXED_NOREPLACE) mm_flags |= mm::MM_MAP_FIXED_NOREPLACE; return mm_flags; } @@ -75,13 +76,83 @@ DEFINE_SYSCALL6(mmap, addr, length, prot, flags, fd, offset) { if ((flags & ~LINUX_MAP_ALLOWED_MASK) != 0) { return syscall::EINVAL; } - if (!(flags & LINUX_MAP_PRIVATE)) { + + bool has_shared = (flags & LINUX_MAP_SHARED) != 0; + bool has_private = (flags & LINUX_MAP_PRIVATE) != 0; + bool has_anon = (flags & LINUX_MAP_ANONYMOUS) != 0; + int64_t fd_val = static_cast(fd); + + if (has_shared == has_private) { + return syscall::EINVAL; + } + + sched::task* task = sched::current(); + if (!task || !task->exec.mm_ctx) { + return syscall::ENOMEM; + } + + if (fd_val != -1 && !has_anon) { + if (!has_shared) { + return syscall::EINVAL; + } + if (!is_page_aligned(offset)) { + return syscall::EINVAL; + } + if (offset + length < length) { + return syscall::EINVAL; + } + + uint32_t required_rights = 0; + if (prot & LINUX_PROT_READ) required_rights |= resource::RIGHT_READ; + if (prot & LINUX_PROT_WRITE) required_rights |= resource::RIGHT_WRITE; + + resource::resource_object* obj = nullptr; + int32_t rc = resource::get_handle_object( + &task->handles, static_cast(fd_val), + required_rights, &obj); + if (rc != resource::HANDLE_OK) { + return (rc == resource::HANDLE_ERR_ACCESS) ? 
+ syscall::EACCES : syscall::EBADF; + } + + if (obj->type != resource::resource_type::SHMEM) { + resource::resource_release(obj); + return syscall::EINVAL; + } + + mm::shmem* backing = resource::shmem_resource_provider::get_shmem_backing(obj); + if (!backing) { + resource::resource_release(obj); + return syscall::EINVAL; + } + + uintptr_t mapped_addr = 0; + int32_t map_rc = mm::mm_context_map_shared( + task->exec.mm_ctx, + backing, + static_cast(offset), + static_cast(length), + linux_prot_to_mm(prot), + linux_map_to_mm(flags), + static_cast(addr), + &mapped_addr + ); + + resource::resource_release(obj); + + if (map_rc != mm::MM_CTX_OK) { + return mm_status_to_errno(map_rc); + } + return static_cast(mapped_addr); + } + + if (!has_private) { return syscall::EINVAL; } - if (!(flags & LINUX_MAP_ANONYMOUS)) { + if (!has_anon) { return syscall::EINVAL; } - if (static_cast(fd) != -1) { + if (fd_val != -1) { return syscall::EINVAL; } if (offset != 0) { @@ -93,11 +164,6 @@ DEFINE_SYSCALL6(mmap, addr, length, prot, flags, fd, offset) { return syscall::EINVAL; } - sched::task* task = sched::current(); - if (!task || !task->exec.mm_ctx) { - return syscall::ENOMEM; - } - uintptr_t mapped_addr = 0; int32_t rc = mm::mm_context_map_anonymous( task->exec.mm_ctx, diff --git a/kernel/syscall/syscall_table.cpp b/kernel/syscall/syscall_table.cpp index c5feaed..c0720a5 100644 --- a/kernel/syscall/syscall_table.cpp +++ b/kernel/syscall/syscall_table.cpp @@ -7,6 +7,7 @@ #include "syscall/handlers/sys_fd.h" #include "syscall/handlers/sys_mmap.h" #include "syscall/handlers/sys_socket.h" +#include "syscall/handlers/sys_memfd.h" namespace syscall { @@ -40,6 +41,10 @@ __PRIVILEGED_CODE void init_syscall_table() { REGISTER_SYSCALL(linux_nr::CONNECT, connect); REGISTER_SYSCALL(linux_nr::FCNTL, fcntl); + REGISTER_SYSCALL(linux_nr::MEMFD_CREATE, memfd_create); + REGISTER_SYSCALL(linux_nr::FTRUNCATE, ftruncate); + REGISTER_SYSCALL(linux_nr::UNLINKAT, unlinkat); + REGISTER_SYSCALL(SYS_ELEVATE, 
elevate); register_arch_syscalls(); diff --git a/kernel/syscall/syscall_table.h b/kernel/syscall/syscall_table.h index cdcee81..fc0aaa9 100644 --- a/kernel/syscall/syscall_table.h +++ b/kernel/syscall/syscall_table.h @@ -13,15 +13,20 @@ constexpr int64_t ENOENT = -2; constexpr int64_t EIO = -5; constexpr int64_t EBADF = -9; constexpr int64_t ENOMEM = -12; +constexpr int64_t EACCES = -13; constexpr int64_t EFAULT = -14; +constexpr int64_t EEXIST = -17; constexpr int64_t ENOTDIR = -20; -constexpr int64_t EMFILE = -24; -constexpr int64_t EINVAL = -22; -constexpr int64_t EEXIST = -17; +constexpr int64_t EISDIR = -21; +constexpr int64_t EINVAL = -22; +constexpr int64_t EMFILE = -24; constexpr int64_t ENOTTY = -25; constexpr int64_t ENAMETOOLONG = -36; -constexpr int64_t ENOSYS = -38; -constexpr int64_t EAGAIN = -11; +constexpr int64_t ENOSYS = -38; +constexpr int64_t ENOTEMPTY = -39; +constexpr int64_t ELOOP = -40; +constexpr int64_t EAGAIN = -11; +constexpr int64_t EBUSY = -16; constexpr int64_t EPIPE = -32; constexpr int64_t EADDRINUSE = -98; constexpr int64_t EISCONN = -106; diff --git a/kernel/tests/memory/shmem.test.cpp b/kernel/tests/memory/shmem.test.cpp new file mode 100644 index 0000000..2811f9c --- /dev/null +++ b/kernel/tests/memory/shmem.test.cpp @@ -0,0 +1,201 @@ +#define STLX_TEST_TIER TIER_MM_CORE + +#include "stlx_unit_test.h" +#include "mm/shmem.h" +#include "mm/vma.h" +#include "mm/paging.h" +#include "mm/pmm.h" +#include "common/string.h" +#include "sync/mutex.h" + +TEST_SUITE(shmem_test); + +namespace { + +uint64_t g_initial_free_pages = 0; + +int32_t shmem_before_all() { + g_initial_free_pages = pmm::free_page_count(); + return 0; +} + +int32_t shmem_after_all() { + uint64_t final_free = pmm::free_page_count(); + if (final_free != g_initial_free_pages) { + return -1; + } + return 0; +} + +constexpr size_t PAGE = pmm::PAGE_SIZE; + +} // namespace + +BEFORE_ALL(shmem_test, shmem_before_all); +AFTER_ALL(shmem_test, shmem_after_all); + +TEST(shmem_test, 
create_and_destroy) { + uint64_t before = pmm::free_page_count(); + mm::shmem* s = mm::shmem_create(0); + ASSERT_NOT_NULL(s); + EXPECT_EQ(s->m_size, static_cast(0)); + EXPECT_EQ(s->m_page_count, static_cast(0)); + + mm::shmem::ref_destroy(s); + uint64_t after = pmm::free_page_count(); + EXPECT_EQ(after, before); +} + +TEST(shmem_test, create_with_initial_size) { + uint64_t before = pmm::free_page_count(); + mm::shmem* s = mm::shmem_create(2 * PAGE); + ASSERT_NOT_NULL(s); + EXPECT_EQ(s->m_size, 2 * PAGE); + EXPECT_EQ(s->m_page_count, static_cast(2)); + + sync::mutex_lock(s->lock); + pmm::phys_addr_t p0 = mm::shmem_get_page_locked(s, 0); + pmm::phys_addr_t p1 = mm::shmem_get_page_locked(s, 1); + pmm::phys_addr_t p2 = mm::shmem_get_page_locked(s, 2); + sync::mutex_unlock(s->lock); + + EXPECT_NE(p0, static_cast(0)); + EXPECT_NE(p1, static_cast(0)); + EXPECT_EQ(p2, static_cast(0)); + + mm::shmem::ref_destroy(s); + EXPECT_EQ(pmm::free_page_count(), before); +} + +TEST(shmem_test, resize_grow_and_shrink) { + mm::shmem* s = mm::shmem_create(PAGE); + ASSERT_NOT_NULL(s); + + sync::mutex_lock(s->lock); + ASSERT_EQ(mm::shmem_resize_locked(s, 3 * PAGE), mm::SHMEM_OK); + EXPECT_EQ(s->m_page_count, static_cast(3)); + EXPECT_EQ(s->m_size, 3 * PAGE); + + ASSERT_EQ(mm::shmem_resize_locked(s, PAGE), mm::SHMEM_OK); + EXPECT_EQ(s->m_page_count, static_cast(1)); + EXPECT_EQ(s->m_size, PAGE); + sync::mutex_unlock(s->lock); + + mm::shmem::ref_destroy(s); +} + +TEST(shmem_test, read_write_roundtrip) { + mm::shmem* s = mm::shmem_create(PAGE); + ASSERT_NOT_NULL(s); + + uint8_t wbuf[8] = {0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE, 0xBA, 0xBE}; + ssize_t written = mm::shmem_write(s, 0, wbuf, 8); + EXPECT_EQ(written, static_cast(8)); + + uint8_t rbuf[8] = {}; + ssize_t rd = mm::shmem_read(s, 0, rbuf, 8); + EXPECT_EQ(rd, static_cast(8)); + EXPECT_EQ(string::memcmp(wbuf, rbuf, 8), 0); + + mm::shmem::ref_destroy(s); +} + +TEST(shmem_test, shared_map_two_contexts_same_backing) { + uint64_t before = 
pmm::free_page_count(); + + mm::shmem* s = mm::shmem_create(PAGE); + ASSERT_NOT_NULL(s); + + uint8_t pattern[4] = {0x11, 0x22, 0x33, 0x44}; + mm::shmem_write(s, 0, pattern, 4); + + mm::mm_context* ctx_a = mm::mm_context_create(); + ASSERT_NOT_NULL(ctx_a); + mm::mm_context* ctx_b = mm::mm_context_create(); + ASSERT_NOT_NULL(ctx_b); + + uintptr_t addr_a = 0; + int32_t rc = mm::mm_context_map_shared( + ctx_a, s, 0, PAGE, + mm::MM_PROT_READ | mm::MM_PROT_WRITE, + mm::MM_MAP_SHARED, 0, &addr_a); + ASSERT_EQ(rc, mm::MM_CTX_OK); + EXPECT_NE(addr_a, static_cast(0)); + + uintptr_t addr_b = 0; + rc = mm::mm_context_map_shared( + ctx_b, s, 0, PAGE, + mm::MM_PROT_READ | mm::MM_PROT_WRITE, + mm::MM_MAP_SHARED, 0, &addr_b); + ASSERT_EQ(rc, mm::MM_CTX_OK); + EXPECT_NE(addr_b, static_cast(0)); + + pmm::phys_addr_t phys_a = paging::get_physical(addr_a, ctx_a->pt_root); + pmm::phys_addr_t phys_b = paging::get_physical(addr_b, ctx_b->pt_root); + EXPECT_EQ(phys_a, phys_b); + + EXPECT_EQ(mm::mm_context_vma_count(ctx_a), static_cast(1)); + EXPECT_EQ(mm::mm_context_vma_count(ctx_b), static_cast(1)); + + mm::mm_context_release(ctx_a); + + sync::mutex_lock(s->lock); + pmm::phys_addr_t still_valid = mm::shmem_get_page_locked(s, 0); + sync::mutex_unlock(s->lock); + EXPECT_NE(still_valid, static_cast(0)); + + phys_b = paging::get_physical(addr_b, ctx_b->pt_root); + EXPECT_EQ(phys_b, still_valid); + + mm::mm_context_release(ctx_b); + mm::shmem::ref_destroy(s); + + EXPECT_EQ(pmm::free_page_count(), before); +} + +TEST(shmem_test, unmap_shared_does_not_free_pages) { + uint64_t before = pmm::free_page_count(); + + mm::shmem* s = mm::shmem_create(PAGE); + ASSERT_NOT_NULL(s); + + mm::mm_context* ctx = mm::mm_context_create(); + ASSERT_NOT_NULL(ctx); + + uintptr_t addr = 0; + int32_t rc = mm::mm_context_map_shared( + ctx, s, 0, PAGE, + mm::MM_PROT_READ | mm::MM_PROT_WRITE, + mm::MM_MAP_SHARED, 0, &addr); + ASSERT_EQ(rc, mm::MM_CTX_OK); + + rc = mm::mm_context_unmap(ctx, addr, PAGE); + ASSERT_EQ(rc, 
mm::MM_CTX_OK); + + sync::mutex_lock(s->lock); + pmm::phys_addr_t still_valid = mm::shmem_get_page_locked(s, 0); + sync::mutex_unlock(s->lock); + EXPECT_NE(still_valid, static_cast(0)); + + mm::mm_context_release(ctx); + mm::shmem::ref_destroy(s); + + EXPECT_EQ(pmm::free_page_count(), before); +} + +TEST(shmem_test, map_shared_rejects_hole) { + mm::shmem* s = mm::shmem_create(0); + ASSERT_NOT_NULL(s); + + mm::mm_context* ctx = mm::mm_context_create(); + ASSERT_NOT_NULL(ctx); + + uintptr_t addr = 0; + int32_t rc = mm::mm_context_map_shared( + ctx, s, 0, PAGE, + mm::MM_PROT_READ, mm::MM_MAP_SHARED, 0, &addr); + EXPECT_NE(rc, mm::MM_CTX_OK); + + mm::mm_context_release(ctx); + mm::shmem::ref_destroy(s); +} diff --git a/kernel/tests/socket/socket.test.cpp b/kernel/tests/socket/socket.test.cpp index 032ca57..d3e04db 100644 --- a/kernel/tests/socket/socket.test.cpp +++ b/kernel/tests/socket/socket.test.cpp @@ -8,10 +8,8 @@ #include "resource/handle_table.h" #include "sched/sched.h" #include "sched/task.h" -#include "mm/heap.h" #include "common/string.h" #include "fs/fstypes.h" -#include "fs/fs.h" #include "fs/socket_node.h" TEST_SUITE(socket_test); @@ -52,9 +50,12 @@ TEST(socket_test, ring_buffer_multiple_writes_single_read) { auto* rb = socket::ring_buffer_create(256); ASSERT_NOT_NULL(rb); - socket::ring_buffer_write(rb, reinterpret_cast("aaa"), 3); - socket::ring_buffer_write(rb, reinterpret_cast("bbb"), 3); - socket::ring_buffer_write(rb, reinterpret_cast("ccc"), 3); + ASSERT_EQ(socket::ring_buffer_write(rb, reinterpret_cast("aaa"), 3), + static_cast(3)); + ASSERT_EQ(socket::ring_buffer_write(rb, reinterpret_cast("bbb"), 3), + static_cast(3)); + ASSERT_EQ(socket::ring_buffer_write(rb, reinterpret_cast("ccc"), 3), + static_cast(3)); uint8_t buf[32] = {}; ssize_t nr = socket::ring_buffer_read(rb, buf, sizeof(buf)); @@ -68,7 +69,8 @@ TEST(socket_test, ring_buffer_short_read) { auto* rb = socket::ring_buffer_create(256); ASSERT_NOT_NULL(rb); - socket::ring_buffer_write(rb, 
reinterpret_cast("xyz"), 3); + ASSERT_EQ(socket::ring_buffer_write(rb, reinterpret_cast("xyz"), 3), + static_cast(3)); uint8_t buf[1] = {}; ssize_t nr = socket::ring_buffer_read(rb, buf, 1); @@ -86,7 +88,8 @@ TEST(socket_test, ring_buffer_eof_after_close_write) { auto* rb = socket::ring_buffer_create(256); ASSERT_NOT_NULL(rb); - socket::ring_buffer_write(rb, reinterpret_cast("ab"), 2); + ASSERT_EQ(socket::ring_buffer_write(rb, reinterpret_cast("ab"), 2), + static_cast(2)); socket::ring_buffer_close_write(rb); uint8_t buf[32] = {}; @@ -144,7 +147,8 @@ TEST(socket_test, ring_buffer_nonblock_with_data_returns_data) { auto* rb = socket::ring_buffer_create(256); ASSERT_NOT_NULL(rb); - socket::ring_buffer_write(rb, reinterpret_cast("test"), 4); + ASSERT_EQ(socket::ring_buffer_write(rb, reinterpret_cast("test"), 4), + static_cast(4)); uint8_t buf[32] = {}; ssize_t nr = socket::ring_buffer_read(rb, buf, sizeof(buf), true);