From faefa45a542c14c75a1f1f3d26724263412a2672 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Wed, 10 Jun 2026 13:07:37 +0800 Subject: [PATCH] Correct mempolicy syscalls and add setfs{uid,gid} The dispatch table is generated from abi.h, so a wrong SYS_* number silently routes a handler to the wrong slot. SYS_get_mempolicy was defined as 239 and SYS_set_mempolicy as 238; the asm-generic numbers are 236 and 237, respectively. Guest calls to the real 236/237 fell through to the unimplemented-syscall warning, while calls to the real 238/239 (migrate_pages, move_pages) silently invoked the misplaced mempolicy handlers: get_mempolicy on slot 239 wrote to whatever pointers the move_pages caller had loaded into x0/x1. setfsuid (151) and setfsgid (152) were never wired up. procps brackets its /proc walks with setfsuid(uid) / setfsuid(0), so ps and top warned twice on every refresh. Linux returns the previous fs{uid,gid} from these calls; elfuse does not track fsuid separately from euid, so the stub returns proc_get_{euid,egid}() and ignores the argument. procps observes a stable cred snapshot on both brackets, which is what its access-bracketing pattern needs. The glibc query idiom setfsuid(-1) also reports the current euid, so permission-validation callers (setfsuid(target); setfsuid(-1) != target) still detect a failed transition correctly. The cred fields in proc-identity.c are _Atomic uint32_t, so an unbracketed read in the stub is word-tear safe and matches the existing sc_geteuid pattern; CRED_BRACKETED is reserved for mutating syscalls that need to raise ATTN_BIT_CRED for sibling vCPUs. 
--- src/syscall/abi.h | 6 ++++-- src/syscall/dispatch.tbl | 2 ++ src/syscall/syscall.c | 9 +++++++++ tests/test-credentials.c | 35 +++++++++++++++++++++++++++++++++++ tests/test-tier-a.c | 4 ++-- 5 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/syscall/abi.h b/src/syscall/abi.h index d81d2b1..bf23484 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -117,6 +117,8 @@ #define SYS_getresuid 148 #define SYS_setresgid 149 #define SYS_getresgid 150 +#define SYS_setfsuid 151 +#define SYS_setfsgid 152 #define SYS_setpgid 154 #define SYS_getpgid 155 #define SYS_getsid 156 @@ -225,8 +227,8 @@ #define SYS_mlockall 230 #define SYS_munlockall 231 /* memory policy stubs */ -#define SYS_set_mempolicy 238 -#define SYS_get_mempolicy 239 +#define SYS_get_mempolicy 236 +#define SYS_set_mempolicy 237 /* System V IPC */ #define SYS_msgget 186 #define SYS_msgctl 187 diff --git a/src/syscall/dispatch.tbl b/src/syscall/dispatch.tbl index 2111449..3bb0156 100644 --- a/src/syscall/dispatch.tbl +++ b/src/syscall/dispatch.tbl @@ -157,6 +157,8 @@ SYS_setresuid sc_setresuid 1 SYS_getresuid sc_getresuid 0 SYS_setresgid sc_setresgid 1 SYS_getresgid sc_getresgid 0 +SYS_setfsuid sc_setfsuid 0 +SYS_setfsgid sc_setfsgid 0 SYS_setpgid sc_setpgid 0 SYS_getpgid sc_getpgid 1 SYS_getsid sc_getsid 1 diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index e8f7d7c..92c17eb 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -542,6 +542,15 @@ SC_FORWARD(sc_setreuid, CRED_BRACKETED(g, proc_sys_setreuid((uint32_t) x0, (uin SC_FORWARD(sc_setregid, CRED_BRACKETED(g, proc_sys_setregid((uint32_t) x0, (uint32_t) x1))) SC_FORWARD(sc_setresuid, CRED_BRACKETED(g, proc_sys_setresuid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2))) SC_FORWARD(sc_setresgid, CRED_BRACKETED(g, proc_sys_setresgid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2))) + +/* setfs{uid,gid}: Linux returns the previous fs{uid,gid} and only mutates state + * on a permitted transition. 
elfuse does not track fsuid separately from euid, + * so report the current e{uid,gid} and ignore the change. procps brackets /proc + * walks with setfsuid(uid)/setfsuid(0); both calls observe a stable cred + * snapshot, which is what it needs. + */ +SC_FORWARD(sc_setfsuid, (int64_t) proc_get_euid()) +SC_FORWARD(sc_setfsgid, (int64_t) proc_get_egid()) SC_FORWARD(sc_setpgid, proc_sys_setpgid(g, (int64_t) x0, (int64_t) x1)) SC_STUB(sc_fadvise64, 0) SC_STUB(sc_sched_yield, (sched_yield(), 0)) diff --git a/tests/test-credentials.c b/tests/test-credentials.c index 0fbb4af..e90d29f 100644 --- a/tests/test-credentials.c +++ b/tests/test-credentials.c @@ -18,6 +18,8 @@ #define __NR_getresuid 148 #define __NR_getresgid 150 #define __NR_setreuid 145 +#define __NR_setfsuid 151 +#define __NR_setfsgid 152 #define __NR_capset 91 #define __NR_setpriority 140 #define __NR_getpriority 141 @@ -119,6 +121,39 @@ int main(void) TEST("setgid(0) returns -EPERM"); EXPECT_TRUE(raw_syscall1(__NR_setgid, 0) == -1, "expected -EPERM"); + /* setfsuid / setfsgid: Linux contract is to return the previous fsuid / + * fsgid. elfuse reports the current euid / egid (1000) on every call, + * with no state mutation, which is what procps relies on when it + * brackets /proc reads with setfsuid(uid) / setfsuid(0). 
+ */ + TEST("setfsuid(0) returns 1000"); + EXPECT_TRUE(raw_syscall1(__NR_setfsuid, 0) == 1000, + "setfsuid(0) did not return current euid"); + + TEST("setfsuid(1000) returns 1000"); + EXPECT_TRUE(raw_syscall1(__NR_setfsuid, 1000) == 1000, + "setfsuid(1000) did not return current euid"); + + TEST("setfsgid(0) returns 1000"); + EXPECT_TRUE(raw_syscall1(__NR_setfsgid, 0) == 1000, + "setfsgid(0) did not return current egid"); + + TEST("setfsgid(1000) returns 1000"); + EXPECT_TRUE(raw_syscall1(__NR_setfsgid, 1000) == 1000, + "setfsgid(1000) did not return current egid"); + + /* setfsuid(-1) / setfsgid(-1) is the canonical glibc "read fsuid without + * changing it" idiom: -1 is never a valid uid, so the kernel only + * reports the current fsuid. + */ + TEST("setfsuid(-1) reports current fsuid"); + EXPECT_TRUE(raw_syscall1(__NR_setfsuid, (long) (unsigned) -1) == 1000, + "setfsuid(-1) did not report current euid"); + + TEST("setfsgid(-1) reports current fsgid"); + EXPECT_TRUE(raw_syscall1(__NR_setfsgid, (long) (unsigned) -1) == 1000, + "setfsgid(-1) did not report current egid"); + /* capset: unprivileged process cannot set capabilities */ TEST("capset returns -EPERM"); { diff --git a/tests/test-tier-a.c b/tests/test-tier-a.c index c0f7128..f39f29a 100644 --- a/tests/test-tier-a.c +++ b/tests/test-tier-a.c @@ -168,10 +168,10 @@ static void test_msgctl_rmid(void) * Use raw syscall since glibc may not wrap them. */ #ifndef __NR_get_mempolicy -#define __NR_get_mempolicy 239 +#define __NR_get_mempolicy 236 #endif #ifndef __NR_set_mempolicy -#define __NR_set_mempolicy 238 +#define __NR_set_mempolicy 237 #endif static void test_get_mempolicy(void)