From ed82ad691bccb337fa9c0101699a3fbc89365948 Mon Sep 17 00:00:00 2001 From: gaorui Date: Mon, 4 Aug 2025 19:01:01 +0800 Subject: [PATCH 01/39] riscv: use ".L" local labels in assembly when applicable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7-rc1 commit b18f7296fbfdb2ad0871f00f3042fc74663d52ac category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- For the sake of coherency, use local labels in assembly when applicable. This also avoid kprobes being confused when applying a kprobe since the size of function is computed by checking where the next visible symbol is located. This might end up in computing some function size to be way shorter than expected and thus failing to apply kprobes to the specified offset. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231024132655.730417-2-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/kernel/entry.S | 6 ++--- arch/riscv/kernel/head.S | 18 ++++++------- arch/riscv/kernel/mcount.S | 10 +++---- arch/riscv/lib/memmove.S | 54 +++++++++++++++++++------------------- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 1f90fee24a8ba..07e81835a7245 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -25,9 +25,9 @@ SYM_CODE_START(handle_exception) * register will contain 0, and we should continue on the current TP. */ csrrw tp, CSR_SCRATCH, tp - bnez tp, _save_context + bnez tp, .Lsave_context -_restore_kernel_tpsp: +.Lrestore_kernel_tpsp: csrr tp, CSR_SCRATCH REG_S sp, TASK_TI_KERNEL_SP(tp) @@ -39,7 +39,7 @@ _restore_kernel_tpsp: REG_L sp, TASK_TI_KERNEL_SP(tp) #endif -_save_context: +.Lsave_context: REG_S sp, TASK_TI_USER_SP(tp) REG_L sp, TASK_TI_KERNEL_SP(tp) addi sp, sp, -(PT_SIZE_ON_STACK) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 9691fa8f2faa7..f6f2a384abc7b 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -169,12 +169,12 @@ secondary_start_sbi: XIP_FIXUP_OFFSET a0 call relocate_enable_mmu #endif - call setup_trap_vector + call .Lsetup_trap_vector tail smp_callin #endif /* CONFIG_SMP */ .align 2 -setup_trap_vector: +.Lsetup_trap_vector: /* Set trap vector to exception handler */ la a0, handle_exception csrw CSR_TVEC, a0 @@ -211,7 +211,7 @@ ENTRY(_start_kernel) * not implement PMPs, so we set up a quick trap handler to just skip * touching the PMPs on any trap. */ - la a0, pmp_done + la a0, .Lpmp_done csrw CSR_TVEC, a0 li a0, -1 @@ -219,7 +219,7 @@ ENTRY(_start_kernel) li a0, (PMP_A_NAPOT | PMP_R | PMP_W | PMP_X) csrw CSR_PMPCFG0, a0 .align 2 -pmp_done: +.Lpmp_done: /* * The hartid in a0 is expected later on, and we have no firmware @@ -283,12 +283,12 @@ pmp_done: /* Clear BSS for flat non-ELF images */ la a3, __bss_start la a4, __bss_stop - ble a4, a3, clear_bss_done -clear_bss: + ble a4, a3, .Lclear_bss_done +.Lclear_bss: REG_S zero, (a3) add a3, a3, RISCV_SZPTR - blt a3, a4, clear_bss -clear_bss_done: + blt a3, a4, .Lclear_bss +.Lclear_bss_done: #endif la a2, boot_cpu_hartid XIP_FIXUP_OFFSET a2 @@ -315,7 +315,7 @@ clear_bss_done: call relocate_enable_mmu #endif /* CONFIG_MMU */ - call setup_trap_vector + call .Lsetup_trap_vector /* Restore C environment */ la tp, init_task la sp, init_thread_union + THREAD_SIZE diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 8818a8fa9ff3a..ab4dd0594fe7b 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -85,16 +85,16 @@ ENTRY(MCOUNT_NAME) #ifdef CONFIG_FUNCTION_GRAPH_TRACER la t0, ftrace_graph_return REG_L t1, 0(t0) - bne t1, t4, do_ftrace_graph_caller + bne t1, t4, .Ldo_ftrace_graph_caller la t3, ftrace_graph_entry REG_L t2, 0(t3) la t6, ftrace_graph_entry_stub - bne t2, t6, do_ftrace_graph_caller + bne t2, t6, .Ldo_ftrace_graph_caller #endif la t3, ftrace_trace_function REG_L t5, 0(t3) - bne t5, t4, do_trace + bne t5, t4, .Ldo_trace ret #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -102,7 +102,7 @@ ENTRY(MCOUNT_NAME) * A pseudo representation for the function graph tracer: * prepare_to_return(&ra_to_caller_of_caller, ra_to_caller) */ -do_ftrace_graph_caller: +.Ldo_ftrace_graph_caller: addi a0, s0, -SZREG mv a1, ra #ifdef HAVE_FUNCTION_GRAPH_FP_TEST @@ -118,7 +118,7 @@ do_ftrace_graph_caller: * A pseudo representation for the function tracer: * (*ftrace_trace_function)(ra_to_caller, ra_to_caller_of_caller) */ -do_trace: +.Ldo_trace: REG_L a1, -SZREG(s0) mv a0, ra diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S index 838ff2022fe32..1930b388c3a0c 100644 --- a/arch/riscv/lib/memmove.S +++ b/arch/riscv/lib/memmove.S @@ -26,8 +26,8 @@ SYM_FUNC_START_WEAK(memmove) */ /* Return if nothing to do */ - beq a0, a1, return_from_memmove - beqz a2, return_from_memmove + beq a0, a1, .Lreturn_from_memmove + beqz a2, .Lreturn_from_memmove /* * Register Uses @@ -60,7 +60,7 @@ SYM_FUNC_START_WEAK(memmove) * small enough not to bother. */ andi t0, a2, -(2 * SZREG) - beqz t0, byte_copy + beqz t0, .Lbyte_copy /* * Now solve for t5 and t6. @@ -87,14 +87,14 @@ SYM_FUNC_START_WEAK(memmove) */ xor t0, a0, a1 andi t1, t0, (SZREG - 1) - beqz t1, coaligned_copy + beqz t1, .Lcoaligned_copy /* Fall through to misaligned fixup copy */ -misaligned_fixup_copy: - bltu a1, a0, misaligned_fixup_copy_reverse +.Lmisaligned_fixup_copy: + bltu a1, a0, .Lmisaligned_fixup_copy_reverse -misaligned_fixup_copy_forward: - jal t0, byte_copy_until_aligned_forward +.Lmisaligned_fixup_copy_forward: + jal t0, .Lbyte_copy_until_aligned_forward andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */ slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */ @@ -153,10 +153,10 @@ misaligned_fixup_copy_forward: mv t3, t6 /* Fix the dest pointer in case the loop was broken */ add a1, t3, a5 /* Restore the src pointer */ - j byte_copy_forward /* Copy any remaining bytes */ + j .Lbyte_copy_forward /* Copy any remaining bytes */ -misaligned_fixup_copy_reverse: - jal t0, byte_copy_until_aligned_reverse +.Lmisaligned_fixup_copy_reverse: + jal t0, .Lbyte_copy_until_aligned_reverse andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */ slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */ @@ -215,18 +215,18 @@ misaligned_fixup_copy_reverse: mv t4, t5 /* Fix the dest pointer in case the loop was broken */ add a4, t4, a5 /* Restore the src pointer */ - j byte_copy_reverse /* Copy any remaining bytes */ + j .Lbyte_copy_reverse /* Copy any remaining bytes */ /* * Simple copy loops for SZREG co-aligned memory locations. * These also make calls to do byte copies for any unaligned * data at their terminations. */ -coaligned_copy: - bltu a1, a0, coaligned_copy_reverse +.Lcoaligned_copy: + bltu a1, a0, .Lcoaligned_copy_reverse -coaligned_copy_forward: - jal t0, byte_copy_until_aligned_forward +.Lcoaligned_copy_forward: + jal t0, .Lbyte_copy_until_aligned_forward 1: REG_L t1, ( 0 * SZREG)(a1) @@ -235,10 +235,10 @@ coaligned_copy_forward: REG_S t1, (-1 * SZREG)(t3) bne t3, t6, 1b - j byte_copy_forward /* Copy any remaining bytes */ + j .Lbyte_copy_forward /* Copy any remaining bytes */ -coaligned_copy_reverse: - jal t0, byte_copy_until_aligned_reverse +.Lcoaligned_copy_reverse: + jal t0, .Lbyte_copy_until_aligned_reverse 1: REG_L t1, (-1 * SZREG)(a4) @@ -247,7 +247,7 @@ coaligned_copy_reverse: REG_S t1, ( 0 * SZREG)(t4) bne t4, t5, 1b - j byte_copy_reverse /* Copy any remaining bytes */ + j .Lbyte_copy_reverse /* Copy any remaining bytes */ /* * These are basically sub-functions within the function. They @@ -258,7 +258,7 @@ coaligned_copy_reverse: * up from where they were left and we avoid code duplication * without any overhead except the call in and return jumps. */ -byte_copy_until_aligned_forward: +.Lbyte_copy_until_aligned_forward: beq t3, t5, 2f 1: lb t1, 0(a1) @@ -269,7 +269,7 @@ byte_copy_until_aligned_forward: 2: jalr zero, 0x0(t0) /* Return to multibyte copy loop */ -byte_copy_until_aligned_reverse: +.Lbyte_copy_until_aligned_reverse: beq t4, t6, 2f 1: lb t1, -1(a4) @@ -285,10 +285,10 @@ byte_copy_until_aligned_reverse: * These will byte copy until they reach the end of data to copy. * At that point, they will call to return from memmove. */ -byte_copy: - bltu a1, a0, byte_copy_reverse +.Lbyte_copy: + bltu a1, a0, .Lbyte_copy_reverse -byte_copy_forward: +.Lbyte_copy_forward: beq t3, t4, 2f 1: lb t1, 0(a1) @@ -299,7 +299,7 @@ byte_copy_forward: 2: ret -byte_copy_reverse: +.Lbyte_copy_reverse: beq t4, t3, 2f 1: lb t1, -1(a4) @@ -309,7 +309,7 @@ byte_copy_reverse: bne t4, t3, 1b 2: -return_from_memmove: +.Lreturn_from_memmove: ret SYM_FUNC_END(memmove) From d031b42c1e4358352234063b339f2d5a14b4d83f Mon Sep 17 00:00:00 2001 From: gaorui Date: Mon, 4 Aug 2025 19:11:02 +0800 Subject: [PATCH 02/39] riscv: Use SYM_*() assembly macros instead of deprecated ones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7-rc1 commit 76329c693924d8f37afbf361f0d8daab594e1644 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- ENTRY()/END()/WEAK() macros are deprecated and we should make use of the new SYM_*() macros [1] for better annotation of symbols. Replace the deprecated ones with the new ones and fix wrong usage of END()/ENDPROC() to correctly describe the symbols. [1] https://docs.kernel.org/core-api/asm-annotations.html Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231024132655.730417-3-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/kernel/copy-unaligned.S | 8 ++++---- arch/riscv/kernel/fpu.S | 8 ++++---- arch/riscv/kernel/head.S | 12 +++++------ arch/riscv/kernel/hibernate-asm.S | 12 +++++------ arch/riscv/kernel/mcount-dyn.S | 20 ++++++++----------- arch/riscv/kernel/mcount.S | 8 ++++---- arch/riscv/kernel/probes/rethook_trampoline.S | 4 ++-- arch/riscv/kernel/suspend_entry.S | 4 ++-- arch/riscv/kernel/vdso/flush_icache.S | 4 ++-- arch/riscv/kernel/vdso/getcpu.S | 4 ++-- arch/riscv/kernel/vdso/rt_sigreturn.S | 4 ++-- arch/riscv/kernel/vdso/sys_hwprobe.S | 4 ++-- arch/riscv/lib/memcpy.S | 6 +++--- arch/riscv/lib/memmove.S | 3 +-- arch/riscv/lib/memset.S | 6 +++--- arch/riscv/lib/uaccess.S | 11 +++++----- arch/riscv/purgatory/entry.S | 15 +++----------- 17 files changed, 59 insertions(+), 74 deletions(-) diff --git a/arch/riscv/kernel/copy-unaligned.S b/arch/riscv/kernel/copy-unaligned.S index cfdecfbaad627..2b3d9398c113f 100644 --- a/arch/riscv/kernel/copy-unaligned.S +++ b/arch/riscv/kernel/copy-unaligned.S @@ -9,7 +9,7 @@ /* void __riscv_copy_words_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using word loads and stores. */ /* Note: The size is truncated to a multiple of 8 * SZREG */ -ENTRY(__riscv_copy_words_unaligned) +SYM_FUNC_START(__riscv_copy_words_unaligned) andi a4, a2, ~((8*SZREG)-1) beqz a4, 2f add a3, a1, a4 @@ -36,12 +36,12 @@ ENTRY(__riscv_copy_words_unaligned) 2: ret -END(__riscv_copy_words_unaligned) +SYM_FUNC_END(__riscv_copy_words_unaligned) /* void __riscv_copy_bytes_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using only byte accesses. */ /* Note: The size is truncated to a multiple of 8 */ -ENTRY(__riscv_copy_bytes_unaligned) +SYM_FUNC_START(__riscv_copy_bytes_unaligned) andi a4, a2, ~(8-1) beqz a4, 2f add a3, a1, a4 @@ -68,4 +68,4 @@ ENTRY(__riscv_copy_bytes_unaligned) 2: ret -END(__riscv_copy_bytes_unaligned) +SYM_FUNC_END(__riscv_copy_bytes_unaligned) diff --git a/arch/riscv/kernel/fpu.S b/arch/riscv/kernel/fpu.S index dd2205473de78..6201afcd6b45f 100644 --- a/arch/riscv/kernel/fpu.S +++ b/arch/riscv/kernel/fpu.S @@ -19,7 +19,7 @@ #include #include -ENTRY(__fstate_save) +SYM_FUNC_START(__fstate_save) li a2, TASK_THREAD_F0 add a0, a0, a2 li t1, SR_FS @@ -60,9 +60,9 @@ ENTRY(__fstate_save) sw t0, TASK_THREAD_FCSR_F0(a0) csrc CSR_STATUS, t1 ret -ENDPROC(__fstate_save) +SYM_FUNC_END(__fstate_save) -ENTRY(__fstate_restore) +SYM_FUNC_START(__fstate_restore) li a2, TASK_THREAD_F0 add a0, a0, a2 li t1, SR_FS @@ -103,4 +103,4 @@ ENTRY(__fstate_restore) fscsr t0 csrc CSR_STATUS, t1 ret -ENDPROC(__fstate_restore) +SYM_FUNC_END(__fstate_restore) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index f6f2a384abc7b..8eec82355f92f 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -18,7 +18,7 @@ #include "efi-header.S" __HEAD -ENTRY(_start) +SYM_CODE_START(_start) /* * Image header expected by Linux boot-loaders. The image header data * structure is described in asm/image.h. @@ -192,9 +192,9 @@ secondary_start_sbi: wfi j .Lsecondary_park -END(_start) +SYM_CODE_END(_start) -ENTRY(_start_kernel) +SYM_CODE_START(_start_kernel) /* Mask all interrupts */ csrw CSR_IE, zero csrw CSR_IP, zero @@ -357,10 +357,10 @@ ENTRY(_start_kernel) tail .Lsecondary_start_common #endif /* CONFIG_RISCV_BOOT_SPINWAIT */ -END(_start_kernel) +SYM_CODE_END(_start_kernel) #ifdef CONFIG_RISCV_M_MODE -ENTRY(reset_regs) +SYM_CODE_START_LOCAL(reset_regs) li sp, 0 li gp, 0 li tp, 0 @@ -458,5 +458,5 @@ ENTRY(reset_regs) .Lreset_regs_done_vector: #endif /* CONFIG_RISCV_ISA_V */ ret -END(reset_regs) +SYM_CODE_END(reset_regs) #endif /* CONFIG_RISCV_M_MODE */ diff --git a/arch/riscv/kernel/hibernate-asm.S b/arch/riscv/kernel/hibernate-asm.S index d698dd7df637b..d040dcf4add45 100644 --- a/arch/riscv/kernel/hibernate-asm.S +++ b/arch/riscv/kernel/hibernate-asm.S @@ -21,7 +21,7 @@ * * Always returns 0 */ -ENTRY(__hibernate_cpu_resume) +SYM_FUNC_START(__hibernate_cpu_resume) /* switch to hibernated image's page table. */ csrw CSR_SATP, s0 sfence.vma @@ -34,7 +34,7 @@ ENTRY(__hibernate_cpu_resume) mv a0, zero ret -END(__hibernate_cpu_resume) +SYM_FUNC_END(__hibernate_cpu_resume) /* * Prepare to restore the image. @@ -42,7 +42,7 @@ END(__hibernate_cpu_resume) * a1: satp of temporary page tables. * a2: cpu_resume. */ -ENTRY(hibernate_restore_image) +SYM_FUNC_START(hibernate_restore_image) mv s0, a0 mv s1, a1 mv s2, a2 @@ -50,7 +50,7 @@ ENTRY(hibernate_restore_image) REG_L a1, relocated_restore_code jr a1 -END(hibernate_restore_image) +SYM_FUNC_END(hibernate_restore_image) /* * The below code will be executed from a 'safe' page. @@ -58,7 +58,7 @@ END(hibernate_restore_image) * back to the original memory location. Finally, it jumps to __hibernate_cpu_resume() * to restore the CPU context. */ -ENTRY(hibernate_core_restore_code) +SYM_FUNC_START(hibernate_core_restore_code) /* switch to temp page table. */ csrw satp, s1 sfence.vma @@ -73,4 +73,4 @@ ENTRY(hibernate_core_restore_code) bnez s4, .Lcopy jr s2 -END(hibernate_core_restore_code) +SYM_FUNC_END(hibernate_core_restore_code) diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 669b8697aa38a..58dd96a2a1534 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -82,7 +82,7 @@ .endm #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ -ENTRY(ftrace_caller) +SYM_FUNC_START(ftrace_caller) SAVE_ABI addi a0, t0, -FENTRY_RA_OFFSET @@ -91,8 +91,7 @@ ENTRY(ftrace_caller) mv a1, ra mv a3, sp -ftrace_call: - .global ftrace_call +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -102,16 +101,15 @@ ftrace_call: #ifdef HAVE_FUNCTION_GRAPH_FP_TEST mv a2, s0 #endif -ftrace_graph_call: - .global ftrace_graph_call +SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) call ftrace_stub #endif RESTORE_ABI jr t0 -ENDPROC(ftrace_caller) +SYM_FUNC_END(ftrace_caller) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS -ENTRY(ftrace_regs_caller) +SYM_FUNC_START(ftrace_regs_caller) SAVE_ALL addi a0, t0, -FENTRY_RA_OFFSET @@ -120,8 +118,7 @@ ENTRY(ftrace_regs_caller) mv a1, ra mv a3, sp -ftrace_regs_call: - .global ftrace_regs_call +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) call ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -131,12 +128,11 @@ ftrace_regs_call: #ifdef HAVE_FUNCTION_GRAPH_FP_TEST mv a2, s0 #endif -ftrace_graph_regs_call: - .global ftrace_graph_regs_call +SYM_INNER_LABEL(ftrace_graph_regs_call, SYM_L_GLOBAL) call ftrace_stub #endif RESTORE_ALL jr t0 -ENDPROC(ftrace_regs_caller) +SYM_FUNC_END(ftrace_regs_caller) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index ab4dd0594fe7b..b4dd9ed6849e3 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -61,7 +61,7 @@ SYM_TYPED_FUNC_START(ftrace_stub_graph) ret SYM_FUNC_END(ftrace_stub_graph) -ENTRY(return_to_handler) +SYM_FUNC_START(return_to_handler) /* * On implementing the frame point test, the ideal way is to compare the * s0 (frame pointer, if enabled) on entry and the sp (stack pointer) on return. @@ -76,11 +76,11 @@ ENTRY(return_to_handler) mv a2, a0 RESTORE_RET_ABI_STATE jalr a2 -ENDPROC(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif #ifndef CONFIG_DYNAMIC_FTRACE -ENTRY(MCOUNT_NAME) +SYM_FUNC_START(MCOUNT_NAME) la t4, ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER la t0, ftrace_graph_return @@ -126,6 +126,6 @@ ENTRY(MCOUNT_NAME) jalr t5 RESTORE_ABI_STATE ret -ENDPROC(MCOUNT_NAME) +SYM_FUNC_END(MCOUNT_NAME) #endif EXPORT_SYMBOL(MCOUNT_NAME) diff --git a/arch/riscv/kernel/probes/rethook_trampoline.S b/arch/riscv/kernel/probes/rethook_trampoline.S index 21bac92a170a9..f2cd83d9b0f00 100644 --- a/arch/riscv/kernel/probes/rethook_trampoline.S +++ b/arch/riscv/kernel/probes/rethook_trampoline.S @@ -75,7 +75,7 @@ REG_L x31, PT_T6(sp) .endm -ENTRY(arch_rethook_trampoline) +SYM_CODE_START(arch_rethook_trampoline) addi sp, sp, -(PT_SIZE_ON_STACK) save_all_base_regs @@ -90,4 +90,4 @@ ENTRY(arch_rethook_trampoline) addi sp, sp, PT_SIZE_ON_STACK ret -ENDPROC(arch_rethook_trampoline) +SYM_CODE_END(arch_rethook_trampoline) diff --git a/arch/riscv/kernel/suspend_entry.S b/arch/riscv/kernel/suspend_entry.S index f7960c7c5f9e2..a59c4c9036963 100644 --- a/arch/riscv/kernel/suspend_entry.S +++ b/arch/riscv/kernel/suspend_entry.S @@ -16,7 +16,7 @@ .altmacro .option norelax -ENTRY(__cpu_suspend_enter) +SYM_FUNC_START(__cpu_suspend_enter) /* Save registers (except A0 and T0-T6) */ REG_S ra, (SUSPEND_CONTEXT_REGS + PT_RA)(a0) REG_S sp, (SUSPEND_CONTEXT_REGS + PT_SP)(a0) @@ -57,7 +57,7 @@ ENTRY(__cpu_suspend_enter) /* Return to C code */ ret -END(__cpu_suspend_enter) +SYM_FUNC_END(__cpu_suspend_enter) SYM_TYPED_FUNC_START(__cpu_resume_enter) /* Load the global pointer */ diff --git a/arch/riscv/kernel/vdso/flush_icache.S b/arch/riscv/kernel/vdso/flush_icache.S index 82f97d67c23e9..8f884227e8bca 100644 --- a/arch/riscv/kernel/vdso/flush_icache.S +++ b/arch/riscv/kernel/vdso/flush_icache.S @@ -8,7 +8,7 @@ .text /* int __vdso_flush_icache(void *start, void *end, unsigned long flags); */ -ENTRY(__vdso_flush_icache) +SYM_FUNC_START(__vdso_flush_icache) .cfi_startproc #ifdef CONFIG_SMP li a7, __NR_riscv_flush_icache @@ -19,4 +19,4 @@ ENTRY(__vdso_flush_icache) #endif ret .cfi_endproc -ENDPROC(__vdso_flush_icache) +SYM_FUNC_END(__vdso_flush_icache) diff --git a/arch/riscv/kernel/vdso/getcpu.S b/arch/riscv/kernel/vdso/getcpu.S index bb0c05e2ffbae..9c1bd531907f2 100644 --- a/arch/riscv/kernel/vdso/getcpu.S +++ b/arch/riscv/kernel/vdso/getcpu.S @@ -8,11 +8,11 @@ .text /* int __vdso_getcpu(unsigned *cpu, unsigned *node, void *unused); */ -ENTRY(__vdso_getcpu) +SYM_FUNC_START(__vdso_getcpu) .cfi_startproc /* For now, just do the syscall. */ li a7, __NR_getcpu ecall ret .cfi_endproc -ENDPROC(__vdso_getcpu) +SYM_FUNC_END(__vdso_getcpu) diff --git a/arch/riscv/kernel/vdso/rt_sigreturn.S b/arch/riscv/kernel/vdso/rt_sigreturn.S index 10438c7c626ac..3dc022aa8931a 100644 --- a/arch/riscv/kernel/vdso/rt_sigreturn.S +++ b/arch/riscv/kernel/vdso/rt_sigreturn.S @@ -7,10 +7,10 @@ #include .text -ENTRY(__vdso_rt_sigreturn) +SYM_FUNC_START(__vdso_rt_sigreturn) .cfi_startproc .cfi_signal_frame li a7, __NR_rt_sigreturn ecall .cfi_endproc -ENDPROC(__vdso_rt_sigreturn) +SYM_FUNC_END(__vdso_rt_sigreturn) diff --git a/arch/riscv/kernel/vdso/sys_hwprobe.S b/arch/riscv/kernel/vdso/sys_hwprobe.S index 4e704146c77a0..77e57f8305216 100644 --- a/arch/riscv/kernel/vdso/sys_hwprobe.S +++ b/arch/riscv/kernel/vdso/sys_hwprobe.S @@ -5,11 +5,11 @@ #include .text -ENTRY(riscv_hwprobe) +SYM_FUNC_START(riscv_hwprobe) .cfi_startproc li a7, __NR_riscv_hwprobe ecall ret .cfi_endproc -ENDPROC(riscv_hwprobe) +SYM_FUNC_END(riscv_hwprobe) diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S index 1a40d01a95439..44e009ec5fef6 100644 --- a/arch/riscv/lib/memcpy.S +++ b/arch/riscv/lib/memcpy.S @@ -7,8 +7,7 @@ #include /* void *memcpy(void *, const void *, size_t) */ -ENTRY(__memcpy) -WEAK(memcpy) +SYM_FUNC_START(__memcpy) move t6, a0 /* Preserve return value */ /* Defer to byte-oriented copy for small sizes */ @@ -105,6 +104,7 @@ WEAK(memcpy) bltu a1, a3, 5b 6: ret -END(__memcpy) +SYM_FUNC_END(__memcpy) +SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy) SYM_FUNC_ALIAS(__pi_memcpy, __memcpy) SYM_FUNC_ALIAS(__pi___memcpy, __memcpy) diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S index 1930b388c3a0c..cb3e2e7ef0baa 100644 --- a/arch/riscv/lib/memmove.S +++ b/arch/riscv/lib/memmove.S @@ -7,7 +7,6 @@ #include SYM_FUNC_START(__memmove) -SYM_FUNC_START_WEAK(memmove) /* * Returns * a0 - dest @@ -312,7 +311,7 @@ SYM_FUNC_START_WEAK(memmove) .Lreturn_from_memmove: ret -SYM_FUNC_END(memmove) SYM_FUNC_END(__memmove) +SYM_FUNC_ALIAS_WEAK(memmove, __memmove) SYM_FUNC_ALIAS(__pi_memmove, __memmove) SYM_FUNC_ALIAS(__pi___memmove, __memmove) diff --git a/arch/riscv/lib/memset.S b/arch/riscv/lib/memset.S index 34c5360c6705c..35f358e70bdb6 100644 --- a/arch/riscv/lib/memset.S +++ b/arch/riscv/lib/memset.S @@ -8,8 +8,7 @@ #include /* void *memset(void *, int, size_t) */ -ENTRY(__memset) -WEAK(memset) +SYM_FUNC_START(__memset) move t0, a0 /* Preserve return value */ /* Defer to byte-oriented fill for small sizes */ @@ -110,4 +109,5 @@ WEAK(memset) bltu t0, a3, 5b 6: ret -END(__memset) +SYM_FUNC_END(__memset) +SYM_FUNC_ALIAS_WEAK(memset, __memset) diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S index 09b47ebacf2e8..3ab438f30d132 100644 --- a/arch/riscv/lib/uaccess.S +++ b/arch/riscv/lib/uaccess.S @@ -10,8 +10,7 @@ _asm_extable 100b, \lbl .endm -ENTRY(__asm_copy_to_user) -ENTRY(__asm_copy_from_user) +SYM_FUNC_START(__asm_copy_to_user) /* Enable access to user memory */ li t6, SR_SUM @@ -181,13 +180,13 @@ ENTRY(__asm_copy_from_user) csrc CSR_STATUS, t6 sub a0, t5, a0 ret -ENDPROC(__asm_copy_to_user) -ENDPROC(__asm_copy_from_user) +SYM_FUNC_END(__asm_copy_to_user) EXPORT_SYMBOL(__asm_copy_to_user) +SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user) EXPORT_SYMBOL(__asm_copy_from_user) -ENTRY(__clear_user) +SYM_FUNC_START(__clear_user) /* Enable access to user memory */ li t6, SR_SUM @@ -233,5 +232,5 @@ ENTRY(__clear_user) csrc CSR_STATUS, t6 sub a0, a3, a0 ret -ENDPROC(__clear_user) +SYM_FUNC_END(__clear_user) EXPORT_SYMBOL(__clear_user) diff --git a/arch/riscv/purgatory/entry.S b/arch/riscv/purgatory/entry.S index a4ede42bc1510..96df126df649c 100644 --- a/arch/riscv/purgatory/entry.S +++ b/arch/riscv/purgatory/entry.S @@ -8,17 +8,12 @@ * */ -.macro size, sym:req - .size \sym, . - \sym -.endm #include #include .text -.globl purgatory_start -purgatory_start: - +SYM_CODE_START(purgatory_start) lla sp, .Lstack mv s0, a0 /* The hartid of the current hart */ mv s1, a1 /* Phys address of the FDT image */ @@ -30,8 +25,7 @@ purgatory_start: mv a1, s1 ld a2, riscv_kernel_entry jr a2 - -size purgatory_start +SYM_CODE_END(purgatory_start) .align 4 .rept 256 @@ -42,9 +36,6 @@ size purgatory_start .data .align LGREG -.globl riscv_kernel_entry -riscv_kernel_entry: - .quad 0 -size riscv_kernel_entry +SYM_DATA(riscv_kernel_entry, .quad 0) .end From 15a4470917e2508df7511bf6d8988711be28d697 Mon Sep 17 00:00:00 2001 From: gaorui Date: Mon, 4 Aug 2025 19:16:05 +0800 Subject: [PATCH 03/39] riscv: kernel: Use correct SYM_DATA_*() macro for data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7-rc1 commit 4cc0d8a3f109fbdd8100ed88fc9417203a5d5b4e category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- Some data were incorrectly annotated with SYM_FUNC_*() instead of SYM_DATA_*() ones. Use the correct ones. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231024132655.730417-4-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/kernel/entry.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 07e81835a7245..ffc521592687d 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -279,7 +279,7 @@ SYM_FUNC_END(__switch_to) .section ".rodata" .align LGREG /* Exception vector table */ -SYM_CODE_START(excp_vect_table) +SYM_DATA_START_LOCAL(excp_vect_table) RISCV_PTR do_trap_insn_misaligned ALT_INSN_FAULT(RISCV_PTR do_trap_insn_fault) RISCV_PTR do_trap_insn_illegal @@ -297,12 +297,11 @@ SYM_CODE_START(excp_vect_table) RISCV_PTR do_page_fault /* load page fault */ RISCV_PTR do_trap_unknown RISCV_PTR do_page_fault /* store page fault */ -excp_vect_table_end: -SYM_CODE_END(excp_vect_table) +SYM_DATA_END_LABEL(excp_vect_table, SYM_L_LOCAL, excp_vect_table_end) #ifndef CONFIG_MMU -SYM_CODE_START(__user_rt_sigreturn) +SYM_DATA_START(__user_rt_sigreturn) li a7, __NR_rt_sigreturn ecall -SYM_CODE_END(__user_rt_sigreturn) +SYM_DATA_END(__user_rt_sigreturn) #endif From 309720e985ba948066b434b2bac03264b7df074c Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 09:39:27 +0800 Subject: [PATCH 04/39] riscv: Add support for kernel mode vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit ecd2ada8a5e0b464dab54f71d4ba7bbf5708711f category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- Add kernel_vector_begin() and kernel_vector_end() function declarations and corresponding definitions in kernel_mode_vector.c These are needed to wrap uses of vector in kernel mode. Co-developed-by: Vincent Chen Signed-off-by: Vincent Chen Signed-off-by: Greentime Hu Signed-off-by: Andy Chiu Reviewed-by: Eric Biggers Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-2-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/include/asm/processor.h | 12 ++- arch/riscv/include/asm/simd.h | 44 ++++++++++ arch/riscv/include/asm/vector.h | 9 ++ arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/kernel_mode_vector.c | 116 +++++++++++++++++++++++++ arch/riscv/kernel/process.c | 1 + 6 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/simd.h create mode 100644 arch/riscv/kernel/kernel_mode_vector.c diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 0de0e2e29a826..f5957b0c44be5 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -78,6 +78,15 @@ struct task_struct; struct pt_regs; +/* + * We use a flag to track in-kernel Vector context. Currently the flag has the + * following meaning: + * + * - bit 0: indicates whether the in-kernel Vector context is active. The + * activation of this state disables the preemption. + */ +#define RISCV_KERNEL_MODE_V 0x1 + /* CPU-specific state of a task */ struct thread_struct { /* Callee-saved registers */ @@ -86,7 +95,8 @@ struct thread_struct { unsigned long s[12]; /* s[0]: frame pointer */ struct __riscv_d_ext_state fstate; unsigned long bad_cause; - unsigned long vstate_ctrl; + u32 riscv_v_flags; + u32 vstate_ctrl; struct __riscv_v_ext_state vstate; }; diff --git a/arch/riscv/include/asm/simd.h b/arch/riscv/include/asm/simd.h new file mode 100644 index 0000000000000..ef8af413a9fc7 --- /dev/null +++ b/arch/riscv/include/asm/simd.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017 Linaro Ltd. + * Copyright (C) 2023 SiFive + */ + +#ifndef __ASM_SIMD_H +#define __ASM_SIMD_H + +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_RISCV_ISA_V +/* + * may_use_simd - whether it is allowable at this time to issue vector + * instructions or access the vector register file + * + * Callers must not assume that the result remains true beyond the next + * preempt_enable() or return from softirq context. + */ +static __must_check inline bool may_use_simd(void) +{ + /* + * RISCV_KERNEL_MODE_V is only set while preemption is disabled, + * and is clear whenever preemption is enabled. + */ + return !in_hardirq() && !in_nmi() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); +} + +#else /* ! CONFIG_RISCV_ISA_V */ + +static __must_check inline bool may_use_simd(void) +{ + return false; +} + +#endif /* ! CONFIG_RISCV_ISA_V */ + +#endif diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index ad4d3d8209b11..d26986484e302 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -23,6 +23,15 @@ extern unsigned long riscv_v_vsize; int riscv_v_setup_vsize(void); bool riscv_v_first_use_handler(struct pt_regs *regs); +void kernel_vector_begin(void); +void kernel_vector_end(void); +void get_cpu_vector_context(void); +void put_cpu_vector_context(void); + +static inline u32 riscv_v_flags(void) +{ + return current->thread.riscv_v_flags; +} static __always_inline bool has_vector(void) { diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index e93c358e5c798..343b8e570607c 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_MMU) += vdso.o vdso/ obj-$(CONFIG_RISCV_M_MODE) += traps_misaligned.o obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_RISCV_ISA_V) += vector.o +obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += cpu_ops.o diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c new file mode 100644 index 0000000000000..114cf4f0a0eb6 --- /dev/null +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012 ARM Ltd. + * Author: Catalin Marinas + * Copyright (C) 2017 Linaro Ltd. + * Copyright (C) 2021 SiFive + */ +#include +#include +#include +#include +#include + +#include +#include +#include + +static inline void riscv_v_flags_set(u32 flags) +{ + current->thread.riscv_v_flags = flags; +} + +static inline void riscv_v_start(u32 flags) +{ + int orig; + + orig = riscv_v_flags(); + BUG_ON((orig & flags) != 0); + riscv_v_flags_set(orig | flags); +} + +static inline void riscv_v_stop(u32 flags) +{ + int orig; + + orig = riscv_v_flags(); + BUG_ON((orig & flags) == 0); + riscv_v_flags_set(orig & ~flags); +} + +/* + * Claim ownership of the CPU vector context for use by the calling context. + * + * The caller may freely manipulate the vector context metadata until + * put_cpu_vector_context() is called. + */ +void get_cpu_vector_context(void) +{ + preempt_disable(); + + riscv_v_start(RISCV_KERNEL_MODE_V); +} + +/* + * Release the CPU vector context. + * + * Must be called from a context in which get_cpu_vector_context() was + * previously called, with no call to put_cpu_vector_context() in the + * meantime. + */ +void put_cpu_vector_context(void) +{ + riscv_v_stop(RISCV_KERNEL_MODE_V); + + preempt_enable(); +} + +/* + * kernel_vector_begin(): obtain the CPU vector registers for use by the calling + * context + * + * Must not be called unless may_use_simd() returns true. + * Task context in the vector registers is saved back to memory as necessary. + * + * A matching call to kernel_vector_end() must be made before returning from the + * calling context. + * + * The caller may freely use the vector registers until kernel_vector_end() is + * called. + */ +void kernel_vector_begin(void) +{ + if (WARN_ON(!has_vector())) + return; + + BUG_ON(!may_use_simd()); + + get_cpu_vector_context(); + + riscv_v_vstate_save(current, task_pt_regs(current)); + + riscv_v_enable(); +} +EXPORT_SYMBOL_GPL(kernel_vector_begin); + +/* + * kernel_vector_end(): give the CPU vector registers back to the current task + * + * Must be called from a context in which kernel_vector_begin() was previously + * called, with no call to kernel_vector_end() in the meantime. + * + * The caller must not use the vector registers after this function is called, + * unless kernel_vector_begin() is called again in the meantime. + */ +void kernel_vector_end(void) +{ + if (WARN_ON(!has_vector())) + return; + + riscv_v_vstate_restore(current, task_pt_regs(current)); + + riscv_v_disable(); + + put_cpu_vector_context(); +} +EXPORT_SYMBOL_GPL(kernel_vector_end); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index dd973216e31cf..ed25e9e3b564a 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -200,6 +200,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childregs->a0 = 0; /* Return value of fork() */ p->thread.s[0] = 0; } + p->thread.riscv_v_flags = 0; p->thread.ra = (unsigned long)ret_from_fork; p->thread.sp = (unsigned long)childregs; /* kernel sp */ return 0; From 03d73f29f3b428315515e0ce48ac84d704b0717c Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 09:46:42 +0800 Subject: [PATCH 05/39] riscv: vector: make Vector always available for softirq context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit 956895b9d8f74df015636288a81872c07c4fded3 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- The goal of this patch is to provide full support of Vector in kernel softirq context. So that some of the crypto alogrithms won't need scalar fallbacks. By disabling bottom halves in active kernel-mode Vector, softirq will not be able to nest on top of any kernel-mode Vector. So, softirq context is able to use Vector whenever it runs. After this patch, Vector context cannot start with irqs disabled. Otherwise local_bh_enable() may run in a wrong context. Disabling bh is not enough for RT-kernel to prevent preeemption. So we must disable preemption, which also implies disabling bh on RT. Related-to: commit 696207d4258b ("arm64/sve: Make kernel FPU protection RT friendly") Related-to: commit 66c3ec5a7120 ("arm64: neon: Forbid when irqs are disabled") Signed-off-by: Andy Chiu Reviewed-by: Eric Biggers Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-3-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/include/asm/processor.h | 3 ++- arch/riscv/include/asm/simd.h | 6 +++++- arch/riscv/kernel/kernel_mode_vector.c | 14 ++++++++++++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index f5957b0c44be5..0350eef4c9358 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -83,7 +83,8 @@ struct pt_regs; * following meaning: * * - bit 0: indicates whether the in-kernel Vector context is active. The - * activation of this state disables the preemption. + * activation of this state disables the preemption. On a non-RT kernel, it + * also disable bh. */ #define RISCV_KERNEL_MODE_V 0x1 diff --git a/arch/riscv/include/asm/simd.h b/arch/riscv/include/asm/simd.h index ef8af413a9fc7..4d699e16c9a96 100644 --- a/arch/riscv/include/asm/simd.h +++ b/arch/riscv/include/asm/simd.h @@ -28,8 +28,12 @@ static __must_check inline bool may_use_simd(void) /* * RISCV_KERNEL_MODE_V is only set while preemption is disabled, * and is clear whenever preemption is enabled. + * + * Kernel-mode Vector temporarily disables bh. So we must not return + * true on irq_disabled(). Otherwise we would fail the lockdep check + * calling local_bh_enable() */ - return !in_hardirq() && !in_nmi() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); + return !in_hardirq() && !in_nmi() && !irqs_disabled() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); } #else /* ! CONFIG_RISCV_ISA_V */ diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c index 114cf4f0a0eb6..2fc145edae3dd 100644 --- a/arch/riscv/kernel/kernel_mode_vector.c +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -46,7 +46,14 @@ static inline void riscv_v_stop(u32 flags) */ void get_cpu_vector_context(void) { - preempt_disable(); + /* + * disable softirqs so it is impossible for softirqs to nest + * get_cpu_vector_context() when kernel is actively using Vector. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); riscv_v_start(RISCV_KERNEL_MODE_V); } @@ -62,7 +69,10 @@ void put_cpu_vector_context(void) { riscv_v_stop(RISCV_KERNEL_MODE_V); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } /* From 5f43b562a68b633078d81ebba48f0d51e91fff56 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 10:11:51 +0800 Subject: [PATCH 06/39] riscv: Add vector extension XOR implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit c5674d00cacdb1c47c72e19a552fbae401bc3532 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- This patch adds support for vector optimized XOR and it is tested in qemu. Co-developed-by: Han-Kuan Chen Signed-off-by: Han-Kuan Chen Signed-off-by: Greentime Hu Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-4-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/include/asm/asm-prototypes.h | 18 ++++++ arch/riscv/include/asm/xor.h | 68 +++++++++++++++++++++ arch/riscv/lib/Makefile | 1 + arch/riscv/lib/xor.S | 81 +++++++++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 arch/riscv/include/asm/xor.h create mode 100644 arch/riscv/lib/xor.S diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index 36b955c762ba0..6db1a9bbff4ce 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -9,6 +9,24 @@ long long __lshrti3(long long a, int b); long long __ashrti3(long long a, int b); long long __ashlti3(long long a, int b); +#ifdef CONFIG_RISCV_ISA_V + +void xor_regs_2_(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2); +void xor_regs_3_(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3); +void xor_regs_4_(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3, + const unsigned long *__restrict p4); +void xor_regs_5_(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3, + const unsigned long *__restrict p4, + const unsigned long *__restrict p5); + +#endif /* CONFIG_RISCV_ISA_V */ #define DECLARE_DO_ERROR_INFO(name) asmlinkage void name(struct pt_regs *regs) diff --git a/arch/riscv/include/asm/xor.h b/arch/riscv/include/asm/xor.h new file mode 100644 index 0000000000000..96011861e46b4 --- /dev/null +++ b/arch/riscv/include/asm/xor.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021 SiFive + */ + +#include +#include +#ifdef CONFIG_RISCV_ISA_V +#include +#include +#include + +static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2) +{ + kernel_vector_begin(); + xor_regs_2_(bytes, p1, p2); + kernel_vector_end(); +} + +static void xor_vector_3(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3) +{ + kernel_vector_begin(); + xor_regs_3_(bytes, p1, p2, p3); + kernel_vector_end(); +} + +static void xor_vector_4(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3, + const unsigned long *__restrict p4) +{ + kernel_vector_begin(); + xor_regs_4_(bytes, p1, p2, p3, p4); + kernel_vector_end(); +} + +static void xor_vector_5(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3, + const unsigned long *__restrict p4, + const unsigned long *__restrict p5) +{ + kernel_vector_begin(); + xor_regs_5_(bytes, p1, p2, p3, p4, p5); + kernel_vector_end(); +} + +static struct xor_block_template xor_block_rvv = { + .name = "rvv", + .do_2 = xor_vector_2, + .do_3 = xor_vector_3, + .do_4 = xor_vector_4, + .do_5 = xor_vector_5 +}; + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ + do { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_32regs); \ + if (has_vector()) { \ + xor_speed(&xor_block_rvv);\ + } \ + } while (0) +#endif diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index df5568beb612f..5d22c8e4678fc 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -16,3 +16,4 @@ lib-$(CONFIG_64BIT) += tishift.o lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o +lib-$(CONFIG_RISCV_ISA_V) += xor.o diff --git a/arch/riscv/lib/xor.S b/arch/riscv/lib/xor.S new file mode 100644 index 0000000000000..b28f2430e52fa --- /dev/null +++ b/arch/riscv/lib/xor.S @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021 SiFive + */ +#include +#include +#include + +SYM_FUNC_START(xor_regs_2_) + vsetvli a3, a0, e8, m8, ta, ma + vle8.v v0, (a1) + vle8.v v8, (a2) + sub a0, a0, a3 + vxor.vv v16, v0, v8 + add a2, a2, a3 + vse8.v v16, (a1) + add a1, a1, a3 + bnez a0, xor_regs_2_ + ret +SYM_FUNC_END(xor_regs_2_) +EXPORT_SYMBOL(xor_regs_2_) + +SYM_FUNC_START(xor_regs_3_) + vsetvli a4, a0, e8, m8, ta, ma + vle8.v v0, (a1) + vle8.v v8, (a2) + sub a0, a0, a4 + vxor.vv v0, v0, v8 + vle8.v v16, (a3) + add a2, a2, a4 + vxor.vv v16, v0, v16 + add a3, a3, a4 + vse8.v v16, (a1) + add a1, a1, a4 + bnez a0, xor_regs_3_ + ret +SYM_FUNC_END(xor_regs_3_) +EXPORT_SYMBOL(xor_regs_3_) + +SYM_FUNC_START(xor_regs_4_) + vsetvli a5, a0, e8, m8, ta, ma + vle8.v v0, (a1) + vle8.v v8, (a2) + sub a0, a0, a5 + vxor.vv v0, v0, v8 + vle8.v v16, (a3) + add a2, a2, a5 + vxor.vv v0, v0, v16 + vle8.v v24, (a4) + add a3, a3, a5 + vxor.vv v16, v0, v24 + add a4, a4, a5 + vse8.v v16, (a1) + add a1, a1, a5 + bnez a0, xor_regs_4_ + ret +SYM_FUNC_END(xor_regs_4_) +EXPORT_SYMBOL(xor_regs_4_) + +SYM_FUNC_START(xor_regs_5_) + vsetvli a6, a0, e8, m8, ta, ma + vle8.v v0, (a1) + vle8.v v8, (a2) + sub a0, a0, a6 + vxor.vv v0, v0, v8 + vle8.v v16, (a3) + add a2, a2, a6 + vxor.vv v0, v0, v16 + vle8.v v24, (a4) + add a3, a3, a6 + vxor.vv v0, v0, v24 + vle8.v v8, (a5) + add a4, a4, a6 + vxor.vv v16, v0, v8 + add a5, a5, a6 + vse8.v v16, (a1) + add a1, a1, a6 + bnez a0, xor_regs_5_ + ret +SYM_FUNC_END(xor_regs_5_) +EXPORT_SYMBOL(xor_regs_5_) From 19788c6ea95177a140846a803078589f6f4d8000 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 10:21:16 +0800 Subject: [PATCH 07/39] riscv: sched: defer restoring Vector context for user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit 7df56cbc27e4239807b5d8860f79a7350d63a741 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- User will use its Vector registers only after the kernel really returns to the userspace. So we can delay restoring Vector registers as long as we are still running in kernel mode. So, add a thread flag to indicates the need of restoring Vector and do the restore at the last arch-specific exit-to-user hook. This save the context restoring cost when we switch over multiple processes that run V in kernel mode. For example, if the kernel performs a context swicth from A->B->C, and returns to C's userspace, then there is no need to restore B's V-register. Besides, this also prevents us from repeatedly restoring V context when executing kernel-mode Vector multiple times. The cost of this is that we must disable preemption and mark vector as busy during vstate_{save,restore}. Because then the V context will not get restored back immediately when a trap-causing context switch happens in the middle of vstate_{save,restore}. Signed-off-by: Andy Chiu Acked-by: Conor Dooley Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-5-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/include/asm/entry-common.h | 17 +++++++++++++++++ arch/riscv/include/asm/thread_info.h | 2 ++ arch/riscv/include/asm/vector.h | 11 ++++++++++- arch/riscv/kernel/kernel_mode_vector.c | 2 +- arch/riscv/kernel/process.c | 2 ++ arch/riscv/kernel/ptrace.c | 5 ++++- arch/riscv/kernel/signal.c | 5 ++++- arch/riscv/kernel/vector.c | 2 +- 8 files changed, 41 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h index 6e4dee49d84b9..9d3fb04146809 100644 --- a/arch/riscv/include/asm/entry-common.h +++ b/arch/riscv/include/asm/entry-common.h @@ -4,6 +4,23 @@ #define _ASM_RISCV_ENTRY_COMMON_H #include +#include +#include + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_RISCV_V_DEFER_RESTORE) { + clear_thread_flag(TIF_RISCV_V_DEFER_RESTORE); + /* + * We are already called with irq disabled, so go without + * keeping track of riscv_v_flags. + */ + riscv_v_vstate_restore(current, regs); + } +} + +#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare void handle_page_fault(struct pt_regs *regs); void handle_break(struct pt_regs *regs); diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 8c72d1bcdf141..fb10cdf02b97d 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -94,12 +94,14 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ #define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */ #define TIF_32BIT 11 /* compat-mode 32bit process */ +#define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE) #define _TIF_WORK_MASK \ (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED | \ diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index d26986484e302..4889db112b60c 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -293,6 +293,15 @@ static inline void riscv_v_vstate_restore(struct task_struct *task, } } +static inline void riscv_v_vstate_set_restore(struct task_struct *task, + struct pt_regs *regs) +{ + if ((regs->status & SR_VS) != SR_VS_OFF) { + set_tsk_thread_flag(task, TIF_RISCV_V_DEFER_RESTORE); + riscv_v_vstate_on(regs); + } +} + static inline void __switch_to_vector(struct task_struct *prev, struct task_struct *next) { @@ -300,7 +309,7 @@ static inline void __switch_to_vector(struct task_struct *prev, regs = task_pt_regs(prev); riscv_v_vstate_save(prev, regs); - riscv_v_vstate_restore(next, task_pt_regs(next)); + riscv_v_vstate_set_restore(next, task_pt_regs(next)); } void riscv_v_vstate_ctrl_init(struct task_struct *tsk); diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c index 2fc145edae3dd..8422c881f4529 100644 --- a/arch/riscv/kernel/kernel_mode_vector.c +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -117,7 +117,7 @@ void kernel_vector_end(void) if (WARN_ON(!has_vector())) return; - riscv_v_vstate_restore(current, task_pt_regs(current)); + riscv_v_vstate_set_restore(current, task_pt_regs(current)); riscv_v_disable(); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index ed25e9e3b564a..fcfbcf417cbfd 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -151,6 +151,7 @@ void flush_thread(void) riscv_v_vstate_off(task_pt_regs(current)); kfree(current->thread.vstate.datap); memset(¤t->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); + clear_tsk_thread_flag(current, TIF_RISCV_V_DEFER_RESTORE); #endif } @@ -167,6 +168,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) *dst = *src; /* clear entire V context, including datap for a new task */ memset(&dst->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); + clear_tsk_thread_flag(dst, TIF_RISCV_V_DEFER_RESTORE); return 0; } diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 2afe460de16a6..7b93bcbdf9fab 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -99,8 +99,11 @@ static int riscv_vr_get(struct task_struct *target, * Ensure the vector registers have been saved to the memory before * copying them to membuf. */ - if (target == current) + if (target == current) { + get_cpu_vector_context(); riscv_v_vstate_save(current, task_pt_regs(current)); + put_cpu_vector_context(); + } ptrace_vstate.vstart = vstate->vstart; ptrace_vstate.vl = vstate->vl; diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 048b9b23d7543..65a973f386e3d 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -86,7 +86,10 @@ static long save_v_state(struct pt_regs *regs, void __user **sc_vec) /* datap is designed to be 16 byte aligned for better performance */ WARN_ON(unlikely(!IS_ALIGNED((unsigned long)datap, 16))); + get_cpu_vector_context(); riscv_v_vstate_save(current, regs); + put_cpu_vector_context(); + /* Copy everything of vstate but datap. */ err = __copy_to_user(&state->v_state, ¤t->thread.vstate, offsetof(struct __riscv_v_ext_state, datap)); @@ -134,7 +137,7 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) if (unlikely(err)) return err; - riscv_v_vstate_restore(current, regs); + riscv_v_vstate_set_restore(current, regs); return err; } diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index b4e357363d29a..5ec0a84a9749c 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -179,7 +179,7 @@ bool riscv_v_first_use_handler(struct pt_regs *regs) return true; } riscv_v_vstate_on(regs); - riscv_v_vstate_restore(current, regs); + riscv_v_vstate_set_restore(current, regs); return true; } From fce75af39093623cf809153871298182928255a4 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 10:25:06 +0800 Subject: [PATCH 08/39] riscv: lib: vectorize copy_to_user/copy_from_user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit c2a658d419246108c9bf065ec347355de5ba8a05 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- This patch utilizes Vector to perform copy_to_user/copy_from_user. If Vector is available and the size of copy is large enough for Vector to perform better than scalar, then direct the kernel to do Vector copies for userspace. Though the best programming practice for users is to reduce the copy, this provides a faster variant when copies are inevitable. The optimal size for using Vector, copy_to_user_thres, is only a heuristic for now. We can add DT parsing if people feel the need of customizing it. The exception fixup code of the __asm_vector_usercopy must fallback to the scalar one because accessing user pages might fault, and must be sleepable. Current kernel-mode Vector does not allow tasks to be preemptible, so we must disactivate Vector and perform a scalar fallback in such case. The original implementation of Vector operations comes from https://github.com/sifive/sifive-libc, which we agree to contribute to Linux kernel. Co-developed-by: Jerry Shih Signed-off-by: Jerry Shih Co-developed-by: Nick Knight Signed-off-by: Nick Knight Suggested-by: Guo Ren Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-6-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/Kconfig | 8 ++++ arch/riscv/include/asm/asm-prototypes.h | 4 ++ arch/riscv/lib/Makefile | 6 ++- arch/riscv/lib/riscv_v_helpers.c | 45 +++++++++++++++++++++ arch/riscv/lib/uaccess.S | 10 +++++ arch/riscv/lib/uaccess_vector.S | 53 +++++++++++++++++++++++++ 6 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/lib/riscv_v_helpers.c create mode 100644 arch/riscv/lib/uaccess_vector.S diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index fd97cf49077f5..b90d92f1b18ea 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -606,6 +606,14 @@ config RISCV_ISA_ZACAS If you don't know what to do here, say Y. +config RISCV_ISA_V_UCOPY_THRESHOLD + int "Threshold size for vectorized user copies" + depends on RISCV_ISA_V + default 768 + help + Prefer using vectorized copy_to_user()/copy_from_user() when the + workload size exceeds this value. + config TOOLCHAIN_HAS_ZBB bool default y diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index 6db1a9bbff4ce..be438932f321f 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -11,6 +11,10 @@ long long __ashlti3(long long a, int b); #ifdef CONFIG_RISCV_ISA_V +#ifdef CONFIG_MMU +asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n); +#endif /* CONFIG_MMU */ + void xor_regs_2_(unsigned long bytes, unsigned long *__restrict p1, const unsigned long *__restrict p2); void xor_regs_3_(unsigned long bytes, unsigned long *__restrict p1, diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 5d22c8e4678fc..50be5edf0d78e 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -11,9 +11,13 @@ lib-y += memmove.o lib-y += strcmp.o lib-y += strlen.o lib-y += strncmp.o -lib-$(CONFIG_MMU) += uaccess.o +ifeq ($(CONFIG_MMU), y) +lib-y += uaccess.o +lib-$(CONFIG_RISCV_ISA_V) += uaccess_vector.o +endif lib-$(CONFIG_64BIT) += tishift.o lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_RISCV_ISA_V) += xor.o +lib-$(CONFIG_RISCV_ISA_V) += riscv_v_helpers.o diff --git a/arch/riscv/lib/riscv_v_helpers.c b/arch/riscv/lib/riscv_v_helpers.c new file mode 100644 index 0000000000000..be38a93cedaec --- /dev/null +++ b/arch/riscv/lib/riscv_v_helpers.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 SiFive + * Author: Andy Chiu + */ +#include +#include + +#include +#include + +#ifdef CONFIG_MMU +#include +#endif + +#ifdef CONFIG_MMU +size_t riscv_v_usercopy_threshold = CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD; +int __asm_vector_usercopy(void *dst, void *src, size_t n); +int fallback_scalar_usercopy(void *dst, void *src, size_t n); +asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n) +{ + size_t remain, copied; + + /* skip has_vector() check because it has been done by the asm */ + if (!may_use_simd()) + goto fallback; + + kernel_vector_begin(); + remain = __asm_vector_usercopy(dst, src, n); + kernel_vector_end(); + + if (remain) { + copied = n - remain; + dst += copied; + src += copied; + n = remain; + goto fallback; + } + + return remain; + +fallback: + return fallback_scalar_usercopy(dst, src, n); +} +#endif diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S index 3ab438f30d132..a1e4a3c429254 100644 --- a/arch/riscv/lib/uaccess.S +++ b/arch/riscv/lib/uaccess.S @@ -3,6 +3,8 @@ #include #include #include +#include +#include .macro fixup op reg addr lbl 100: @@ -11,6 +13,13 @@ .endm SYM_FUNC_START(__asm_copy_to_user) +#ifdef CONFIG_RISCV_ISA_V + ALTERNATIVE("j fallback_scalar_usercopy", "nop", 0, RISCV_ISA_EXT_v, CONFIG_RISCV_ISA_V) + REG_L t0, riscv_v_usercopy_threshold + bltu a2, t0, fallback_scalar_usercopy + tail enter_vector_usercopy +#endif +SYM_FUNC_START(fallback_scalar_usercopy) /* Enable access to user memory */ li t6, SR_SUM @@ -181,6 +190,7 @@ SYM_FUNC_START(__asm_copy_to_user) sub a0, t5, a0 ret SYM_FUNC_END(__asm_copy_to_user) +SYM_FUNC_END(fallback_scalar_usercopy) EXPORT_SYMBOL(__asm_copy_to_user) SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user) EXPORT_SYMBOL(__asm_copy_from_user) diff --git a/arch/riscv/lib/uaccess_vector.S b/arch/riscv/lib/uaccess_vector.S new file mode 100644 index 0000000000000..51ab5588e9ff3 --- /dev/null +++ b/arch/riscv/lib/uaccess_vector.S @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include +#include +#include +#include +#include + +#define pDst a0 +#define pSrc a1 +#define iNum a2 + +#define iVL a3 + +#define ELEM_LMUL_SETTING m8 +#define vData v0 + + .macro fixup op reg addr lbl +100: + \op \reg, \addr + _asm_extable 100b, \lbl + .endm + +SYM_FUNC_START(__asm_vector_usercopy) + /* Enable access to user memory */ + li t6, SR_SUM + csrs CSR_STATUS, t6 + +loop: + vsetvli iVL, iNum, e8, ELEM_LMUL_SETTING, ta, ma + fixup vle8.v vData, (pSrc), 10f + sub iNum, iNum, iVL + add pSrc, pSrc, iVL + fixup vse8.v vData, (pDst), 11f + add pDst, pDst, iVL + bnez iNum, loop + + /* Exception fixup for vector load is shared with normal exit */ +10: + /* Disable access to user memory */ + csrc CSR_STATUS, t6 + mv a0, iNum + ret + + /* Exception fixup code for vector store. */ +11: + /* Undo the subtraction after vle8.v */ + add iNum, iNum, iVL + /* Make sure the scalar fallback skip already processed bytes */ + csrr t2, CSR_VSTART + sub iNum, iNum, t2 + j 10b +SYM_FUNC_END(__asm_vector_usercopy) From e3b4083d81bd9b0cbfcd2c4816bf91f4e8b09f1b Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 10:29:40 +0800 Subject: [PATCH 09/39] riscv: fpu: drop SR_SD bit checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit a93fdaf183125fea81f66b9bd756ef5a0c30859e category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- SR_SD summarizes the dirty status of FS/VS/XS. However, the current code structure does not fully utilize it because each extension specific code is divided into an individual segment. So remove the SR_SD check for now. Signed-off-by: Andy Chiu Reviewed-by: Song Shuai Reviewed-by: Guo Ren Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-7-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/include/asm/switch_to.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h index 7508f3ec80639..774a17b39c7d3 100644 --- a/arch/riscv/include/asm/switch_to.h +++ b/arch/riscv/include/asm/switch_to.h @@ -53,8 +53,7 @@ static inline void __switch_to_fpu(struct task_struct *prev, struct pt_regs *regs; regs = task_pt_regs(prev); - if (unlikely(regs->status & SR_SD)) - fstate_save(prev, regs); + fstate_save(prev, regs); fstate_restore(next, task_pt_regs(next)); } From e52ab30bfc22026ecbe70b40d196ac2c56b16aba Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 14:26:36 +0800 Subject: [PATCH 10/39] riscv: vector: do not pass task_struct into riscv_v_vstate_{save,restore}() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit d6c78f1ca3e8ec3fd1afa1bc567cdf083e7af9fe category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- riscv_v_vstate_{save,restore}() can operate only on the knowlege of struct __riscv_v_ext_state, and struct pt_regs. Let the caller decides which should be passed into the function. Meanwhile, the kernel-mode Vector is going to introduce another vstate, so this also makes functions potentially able to be reused. Signed-off-by: Andy Chiu Acked-by: Conor Dooley Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-8-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/include/asm/entry-common.h | 2 +- arch/riscv/include/asm/vector.h | 12 +++++------- arch/riscv/kernel/kernel_mode_vector.c | 2 +- arch/riscv/kernel/ptrace.c | 2 +- arch/riscv/kernel/signal.c | 2 +- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h index 9d3fb04146809..b75b39a4fc100 100644 --- a/arch/riscv/include/asm/entry-common.h +++ b/arch/riscv/include/asm/entry-common.h @@ -16,7 +16,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, * We are already called with irq disabled, so go without * keeping track of riscv_v_flags. */ - riscv_v_vstate_restore(current, regs); + riscv_v_vstate_restore(¤t->thread.vstate, regs); } } diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index 4889db112b60c..0128f79a01df1 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -262,7 +262,7 @@ static inline void riscv_v_vstate_discard(struct pt_regs *regs) __riscv_v_vstate_dirty(regs); } -static inline void riscv_v_vstate_save(struct task_struct *task, +static inline void riscv_v_vstate_save(struct __riscv_v_ext_state *vstate, struct pt_regs *regs) { unsigned long sr_vs, sr_vs_dirty; @@ -271,14 +271,13 @@ static inline void riscv_v_vstate_save(struct task_struct *task, ALT_SR_VS(sr_vs_dirty, SR_VS_DIRTY); if ((regs->status & sr_vs) == sr_vs_dirty) { - struct __riscv_v_ext_state *vstate = &task->thread.vstate; if (vstate->datap) __riscv_v_vstate_save(vstate, vstate->datap); __riscv_v_vstate_clean(regs); } } -static inline void riscv_v_vstate_restore(struct task_struct *task, +static inline void riscv_v_vstate_restore(struct __riscv_v_ext_state *vstate, struct pt_regs *regs) { unsigned long sr_vs; @@ -286,7 +285,6 @@ static inline void riscv_v_vstate_restore(struct task_struct *task, ALT_SR_VS(sr_vs, SR_VS); if ((regs->status & sr_vs) != SR_VS_OFF) { - struct __riscv_v_ext_state *vstate = &task->thread.vstate; if (vstate->datap) __riscv_v_vstate_restore(vstate, vstate->datap); __riscv_v_vstate_clean(regs); @@ -308,7 +306,7 @@ static inline void __switch_to_vector(struct task_struct *prev, struct pt_regs *regs; regs = task_pt_regs(prev); - riscv_v_vstate_save(prev, regs); + riscv_v_vstate_save(&prev->thread.vstate, regs); riscv_v_vstate_set_restore(next, task_pt_regs(next)); } @@ -327,8 +325,8 @@ static inline bool riscv_v_vstate_query(struct pt_regs *regs) { return false; } static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define riscv_v_vsize (0) #define riscv_v_vstate_discard(regs) do {} while (0) -#define riscv_v_vstate_save(task, regs) do {} while (0) -#define riscv_v_vstate_restore(task, regs) do {} while (0) +#define riscv_v_vstate_save(vstate, regs) do {} while (0) +#define riscv_v_vstate_restore(vstate, regs) do {} while (0) #define __switch_to_vector(__prev, __next) do {} while (0) #define riscv_v_vstate_off(regs) do {} while (0) #define riscv_v_vstate_on(regs) do {} while (0) diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c index 8422c881f4529..241a8f834e1ce 100644 --- a/arch/riscv/kernel/kernel_mode_vector.c +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -97,7 +97,7 @@ void kernel_vector_begin(void) get_cpu_vector_context(); - riscv_v_vstate_save(current, task_pt_regs(current)); + riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); riscv_v_enable(); } diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 7b93bcbdf9fab..e8515aa9d80bf 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -101,7 +101,7 @@ static int riscv_vr_get(struct task_struct *target, */ if (target == current) { get_cpu_vector_context(); - riscv_v_vstate_save(current, task_pt_regs(current)); + riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); put_cpu_vector_context(); } diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 65a973f386e3d..733b460395469 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -87,7 +87,7 @@ static long save_v_state(struct pt_regs *regs, void __user **sc_vec) WARN_ON(unlikely(!IS_ALIGNED((unsigned long)datap, 16))); get_cpu_vector_context(); - riscv_v_vstate_save(current, regs); + riscv_v_vstate_save(¤t->thread.vstate, regs); put_cpu_vector_context(); /* Copy everything of vstate but datap. */ From de34628a5a19497f8cfa10cf6ce67d8ee69bf371 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 14:36:32 +0800 Subject: [PATCH 11/39] riscv: vector: use a mask to write vstate_ctrl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit 5b6048f2ff710196c85ce14373febe8be5115bbe category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- riscv_v_ctrl_set() should only touch bits within PR_RISCV_V_VSTATE_CTRL_MASK. So, use the mask when we really set task's vstate_ctrl. Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-9-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/kernel/vector.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index 5ec0a84a9749c..ad2cd48f2a856 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -131,7 +131,8 @@ static inline void riscv_v_ctrl_set(struct task_struct *tsk, int cur, int nxt, ctrl |= VSTATE_CTRL_MAKE_NEXT(nxt); if (inherit) ctrl |= PR_RISCV_V_VSTATE_CTRL_INHERIT; - tsk->thread.vstate_ctrl = ctrl; + tsk->thread.vstate_ctrl &= ~PR_RISCV_V_VSTATE_CTRL_MASK; + tsk->thread.vstate_ctrl |= ctrl; } bool riscv_v_vstate_ctrl_user_allowed(void) From 47fdb287d2bebfdcfe6355157583189dd225fbb0 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 16:37:07 +0800 Subject: [PATCH 12/39] riscv: vector: use kmem_cache to manage vector context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit bd446f5df5afab212917f6732ba6442a5e8de85e category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- The allocation size of thread.vstate.datap is always riscv_v_vsize. So it is possbile to use kmem_cache_* to manage the allocation. This gives users more information regarding allocation of vector context via /proc/slabinfo. And it potentially reduces the latency of the first-use trap because of the allocation caches. Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-10-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/include/asm/vector.h | 4 ++++ arch/riscv/kernel/process.c | 7 ++++++- arch/riscv/kernel/vector.c | 19 ++++++++++++++++++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index 0128f79a01df1..fe659260bdc78 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -27,6 +27,8 @@ void kernel_vector_begin(void); void kernel_vector_end(void); void get_cpu_vector_context(void); void put_cpu_vector_context(void); +void riscv_v_thread_free(struct task_struct *tsk); +void __init riscv_v_setup_ctx_cache(void); static inline u32 riscv_v_flags(void) { @@ -330,6 +332,8 @@ static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define __switch_to_vector(__prev, __next) do {} while (0) #define riscv_v_vstate_off(regs) do {} while (0) #define riscv_v_vstate_on(regs) do {} while (0) +#define riscv_v_thread_free(tsk) do {} while (0) +#define riscv_v_setup_ctx_cache() do {} while (0) #endif /* CONFIG_RISCV_ISA_V */ diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index fcfbcf417cbfd..5ea77a227f304 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -159,7 +159,7 @@ void arch_release_task_struct(struct task_struct *tsk) { /* Free the vector context of datap. */ if (has_vector()) - kfree(tsk->thread.vstate.datap); + riscv_v_thread_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) @@ -208,5 +208,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) return 0; } +void __init arch_task_cache_init(void) +{ + riscv_v_setup_ctx_cache(); +} + EXPORT_SYMBOL_GPL(__fstate_save); EXPORT_SYMBOL_GPL(__fstate_restore); diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index ad2cd48f2a856..b2128bbefb003 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -22,6 +22,7 @@ #include static bool riscv_v_implicit_uacc = IS_ENABLED(CONFIG_RISCV_ISA_V_DEFAULT_ENABLE); +static struct kmem_cache *riscv_v_user_cachep; unsigned long riscv_v_vsize __read_mostly; EXPORT_SYMBOL_GPL(riscv_v_vsize); @@ -55,6 +56,16 @@ int riscv_v_setup_vsize(void) return 0; } +void __init riscv_v_setup_ctx_cache(void) +{ + if (!has_vector()) + return; + + riscv_v_user_cachep = kmem_cache_create_usercopy("riscv_vector_ctx", + riscv_v_vsize, 16, SLAB_PANIC, + 0, riscv_v_vsize, NULL); +} + static bool insn_is_vector(u32 insn_buf) { u32 opcode = insn_buf & __INSN_OPCODE_MASK; @@ -93,7 +104,7 @@ static int riscv_v_thread_zalloc(void) void *datap; if (!riscv_v_vsize) return -EINVAL; - datap = kzalloc(riscv_v_vsize, GFP_KERNEL); + datap = kmem_cache_zalloc(riscv_v_user_cachep, GFP_KERNEL); if (!datap) return -ENOMEM; @@ -103,6 +114,12 @@ static int riscv_v_thread_zalloc(void) return 0; } +void riscv_v_thread_free(struct task_struct *tsk) +{ + if (tsk->thread.vstate.datap) + kmem_cache_free(riscv_v_user_cachep, tsk->thread.vstate.datap); +} + #define VSTATE_CTRL_GET_CUR(x) ((x) & PR_RISCV_V_VSTATE_CTRL_CUR_MASK) #define VSTATE_CTRL_GET_NEXT(x) (((x) & PR_RISCV_V_VSTATE_CTRL_NEXT_MASK) >> 2) #define VSTATE_CTRL_MAKE_NEXT(x) (((x) << 2) & PR_RISCV_V_VSTATE_CTRL_NEXT_MASK) From 8737ea8fbd97481e2eb6ab5e0fedf6de53e16c47 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 16:48:06 +0800 Subject: [PATCH 13/39] riscv: vector: allow kernel-mode Vector with preemption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit 2080ff9493072a94e42b1856d59f5f1bffb761b7 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- Add kernel_vstate to keep track of kernel-mode Vector registers when trap introduced context switch happens. Also, provide riscv_v_flags to let context save/restore routine track context status. Context tracking happens whenever the core starts its in-kernel Vector executions. An active (dirty) kernel task's V contexts will be saved to memory whenever a trap-introduced context switch happens. Or, when a softirq, which happens to nest on top of it, uses Vector. Context retoring happens when the execution transfer back to the original Kernel context where it first enable preempt_v. Also, provide a config CONFIG_RISCV_ISA_V_PREEMPTIVE to give users an option to disable preemptible kernel-mode Vector at build time. Users with constraint memory may want to disable this config as preemptible kernel-mode Vector needs extra space for tracking of per thread's kernel-mode V context. Or, users might as well want to disable it if all kernel-mode Vector code is time sensitive and cannot tolerate context switch overhead. Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-11-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/Kconfig | 14 +++ arch/riscv/include/asm/asm-prototypes.h | 5 + arch/riscv/include/asm/processor.h | 30 +++++- arch/riscv/include/asm/simd.h | 26 ++++- arch/riscv/include/asm/vector.h | 58 ++++++++++- arch/riscv/kernel/entry.S | 9 ++ arch/riscv/kernel/kernel_mode_vector.c | 133 ++++++++++++++++++++++-- arch/riscv/kernel/process.c | 3 + arch/riscv/kernel/vector.c | 31 ++++-- 9 files changed, 287 insertions(+), 22 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b90d92f1b18ea..7edcea77410f5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -614,6 +614,20 @@ config RISCV_ISA_V_UCOPY_THRESHOLD Prefer using vectorized copy_to_user()/copy_from_user() when the workload size exceeds this value. +config RISCV_ISA_V_PREEMPTIVE + bool "Run kernel-mode Vector with kernel preemption" + depends on PREEMPTION + depends on RISCV_ISA_V + default y + help + Usually, in-kernel SIMD routines are run with preemption disabled. + Functions which envoke long running SIMD thus must yield core's + vector unit to prevent blocking other tasks for too long. + + This config allows kernel to run SIMD without explicitly disable + preemption. Enabling this config will result in higher memory + consumption due to the allocation of per-task's kernel Vector context. + config TOOLCHAIN_HAS_ZBB bool default y diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index be438932f321f..cd627ec289f16 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -30,6 +30,11 @@ void xor_regs_5_(unsigned long bytes, unsigned long *__restrict p1, const unsigned long *__restrict p4, const unsigned long *__restrict p5); +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +asmlinkage void riscv_v_context_nesting_start(struct pt_regs *regs); +asmlinkage void riscv_v_context_nesting_end(struct pt_regs *regs); +#endif /* CONFIG_RISCV_ISA_V_PREEMPTIVE */ + #endif /* CONFIG_RISCV_ISA_V */ #define DECLARE_DO_ERROR_INFO(name) asmlinkage void name(struct pt_regs *regs) diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 0350eef4c9358..eb458e4cb93e6 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -85,8 +85,35 @@ struct pt_regs; * - bit 0: indicates whether the in-kernel Vector context is active. The * activation of this state disables the preemption. On a non-RT kernel, it * also disable bh. + * - bits 8: is used for tracking preemptible kernel-mode Vector, when + * RISCV_ISA_V_PREEMPTIVE is enabled. Calling kernel_vector_begin() does not + * disable the preemption if the thread's kernel_vstate.datap is allocated. + * Instead, the kernel set this bit field. Then the trap entry/exit code + * knows if we are entering/exiting the context that owns preempt_v. + * - 0: the task is not using preempt_v + * - 1: the task is actively using preempt_v. But whether does the task own + * the preempt_v context is decided by bits in RISCV_V_CTX_DEPTH_MASK. + * - bit 16-23 are RISCV_V_CTX_DEPTH_MASK, used by context tracking routine + * when preempt_v starts: + * - 0: the task is actively using, and own preempt_v context. + * - non-zero: the task was using preempt_v, but then took a trap within. + * Thus, the task does not own preempt_v. Any use of Vector will have to + * save preempt_v, if dirty, and fallback to non-preemptible kernel-mode + * Vector. + * - bit 30: The in-kernel preempt_v context is saved, and requries to be + * restored when returning to the context that owns the preempt_v. + * - bit 31: The in-kernel preempt_v context is dirty, as signaled by the + * trap entry code. Any context switches out-of current task need to save + * it to the task's in-kernel V context. Also, any traps nesting on-top-of + * preempt_v requesting to use V needs a save. */ -#define RISCV_KERNEL_MODE_V 0x1 +#define RISCV_V_CTX_DEPTH_MASK 0x00ff0000 + +#define RISCV_V_CTX_UNIT_DEPTH 0x00010000 +#define RISCV_KERNEL_MODE_V 0x00000001 +#define RISCV_PREEMPT_V 0x00000100 +#define RISCV_PREEMPT_V_DIRTY 0x80000000 +#define RISCV_PREEMPT_V_NEED_RESTORE 0x40000000 /* CPU-specific state of a task */ struct thread_struct { @@ -99,6 +126,7 @@ struct thread_struct { u32 riscv_v_flags; u32 vstate_ctrl; struct __riscv_v_ext_state vstate; + struct __riscv_v_ext_state kernel_vstate; }; /* Whitelist the fstate from the task_struct for hardened usercopy */ diff --git a/arch/riscv/include/asm/simd.h b/arch/riscv/include/asm/simd.h index 4d699e16c9a96..54efbf523d49c 100644 --- a/arch/riscv/include/asm/simd.h +++ b/arch/riscv/include/asm/simd.h @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -28,12 +29,27 @@ static __must_check inline bool may_use_simd(void) /* * RISCV_KERNEL_MODE_V is only set while preemption is disabled, * and is clear whenever preemption is enabled. - * - * Kernel-mode Vector temporarily disables bh. So we must not return - * true on irq_disabled(). Otherwise we would fail the lockdep check - * calling local_bh_enable() */ - return !in_hardirq() && !in_nmi() && !irqs_disabled() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); + if (in_hardirq() || in_nmi()) + return false; + + /* + * Nesting is acheived in preempt_v by spreading the control for + * preemptible and non-preemptible kernel-mode Vector into two fields. + * Always try to match with prempt_v if kernel V-context exists. Then, + * fallback to check non preempt_v if nesting happens, or if the config + * is not set. + */ + if (IS_ENABLED(CONFIG_RISCV_ISA_V_PREEMPTIVE) && current->thread.kernel_vstate.datap) { + if (!riscv_preempt_v_started(current)) + return true; + } + /* + * Non-preemptible kernel-mode Vector temporarily disables bh. So we + * must not return true on irq_disabled(). Otherwise we would fail the + * lockdep check calling local_bh_enable() + */ + return !irqs_disabled() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); } #else /* ! CONFIG_RISCV_ISA_V */ diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index fe659260bdc78..9d3a130b78693 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -29,10 +29,11 @@ void get_cpu_vector_context(void); void put_cpu_vector_context(void); void riscv_v_thread_free(struct task_struct *tsk); void __init riscv_v_setup_ctx_cache(void); +void riscv_v_thread_alloc(struct task_struct *tsk); static inline u32 riscv_v_flags(void) { - return current->thread.riscv_v_flags; + return READ_ONCE(current->thread.riscv_v_flags); } static __always_inline bool has_vector(void) @@ -302,14 +303,62 @@ static inline void riscv_v_vstate_set_restore(struct task_struct *task, } } +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +static inline bool riscv_preempt_v_dirty(struct task_struct *task) +{ + return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V_DIRTY); +} + +static inline bool riscv_preempt_v_restore(struct task_struct *task) +{ + return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V_NEED_RESTORE); +} + +static inline void riscv_preempt_v_clear_dirty(struct task_struct *task) +{ + barrier(); + task->thread.riscv_v_flags &= ~RISCV_PREEMPT_V_DIRTY; +} + +static inline void riscv_preempt_v_set_restore(struct task_struct *task) +{ + barrier(); + task->thread.riscv_v_flags |= RISCV_PREEMPT_V_NEED_RESTORE; +} + +static inline bool riscv_preempt_v_started(struct task_struct *task) +{ + return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V); +} + +#else /* !CONFIG_RISCV_ISA_V_PREEMPTIVE */ +static inline bool riscv_preempt_v_dirty(struct task_struct *task) { return false; } +static inline bool riscv_preempt_v_restore(struct task_struct *task) { return false; } +static inline bool riscv_preempt_v_started(struct task_struct *task) { return false; } +#define riscv_preempt_v_clear_dirty(tsk) do {} while (0) +#define riscv_preempt_v_set_restore(tsk) do {} while (0) +#endif /* CONFIG_RISCV_ISA_V_PREEMPTIVE */ + static inline void __switch_to_vector(struct task_struct *prev, struct task_struct *next) { struct pt_regs *regs; - regs = task_pt_regs(prev); - riscv_v_vstate_save(&prev->thread.vstate, regs); - riscv_v_vstate_set_restore(next, task_pt_regs(next)); + if (riscv_preempt_v_started(prev)) { + if (riscv_preempt_v_dirty(prev)) { + __riscv_v_vstate_save(&prev->thread.kernel_vstate, + prev->thread.kernel_vstate.datap); + riscv_preempt_v_clear_dirty(prev); + } + } else { + regs = task_pt_regs(prev); + riscv_v_vstate_save(&prev->thread.vstate, regs); + } + + if (riscv_preempt_v_started(next)) + riscv_preempt_v_set_restore(next); + else + riscv_v_vstate_set_restore(next, task_pt_regs(next)); } void riscv_v_vstate_ctrl_init(struct task_struct *tsk); @@ -334,6 +383,7 @@ static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define riscv_v_vstate_on(regs) do {} while (0) #define riscv_v_thread_free(tsk) do {} while (0) #define riscv_v_setup_ctx_cache() do {} while (0) +#define riscv_v_thread_alloc(tsk) do {} while (0) #endif /* CONFIG_RISCV_ISA_V */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index ffc521592687d..6d239ba461ad1 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -81,6 +81,11 @@ SYM_CODE_START(handle_exception) .option norelax la gp, __global_pointer$ .option pop + +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + move a0, sp + call riscv_v_context_nesting_start +#endif move a0, sp /* pt_regs */ la ra, ret_from_exception @@ -134,6 +139,10 @@ SYM_CODE_START_NOALIGN(ret_from_exception) */ csrw CSR_SCRATCH, tp 1: +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + move a0, sp + call riscv_v_context_nesting_end +#endif REG_L a0, PT_STATUS(sp) /* * The current load reservation is effectively part of the processor's diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c index 241a8f834e1ce..6afe80c7f03ab 100644 --- a/arch/riscv/kernel/kernel_mode_vector.c +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -14,10 +14,13 @@ #include #include #include +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +#include +#endif static inline void riscv_v_flags_set(u32 flags) { - current->thread.riscv_v_flags = flags; + WRITE_ONCE(current->thread.riscv_v_flags, flags); } static inline void riscv_v_start(u32 flags) @@ -27,12 +30,14 @@ static inline void riscv_v_start(u32 flags) orig = riscv_v_flags(); BUG_ON((orig & flags) != 0); riscv_v_flags_set(orig | flags); + barrier(); } static inline void riscv_v_stop(u32 flags) { int orig; + barrier(); orig = riscv_v_flags(); BUG_ON((orig & flags) == 0); riscv_v_flags_set(orig & ~flags); @@ -75,6 +80,117 @@ void put_cpu_vector_context(void) preempt_enable(); } +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +static __always_inline u32 *riscv_v_flags_ptr(void) +{ + return ¤t->thread.riscv_v_flags; +} + +static inline void riscv_preempt_v_set_dirty(void) +{ + *riscv_v_flags_ptr() |= RISCV_PREEMPT_V_DIRTY; +} + +static inline void riscv_preempt_v_reset_flags(void) +{ + *riscv_v_flags_ptr() &= ~(RISCV_PREEMPT_V_DIRTY | RISCV_PREEMPT_V_NEED_RESTORE); +} + +static inline void riscv_v_ctx_depth_inc(void) +{ + *riscv_v_flags_ptr() += RISCV_V_CTX_UNIT_DEPTH; +} + +static inline void riscv_v_ctx_depth_dec(void) +{ + *riscv_v_flags_ptr() -= RISCV_V_CTX_UNIT_DEPTH; +} + +static inline u32 riscv_v_ctx_get_depth(void) +{ + return *riscv_v_flags_ptr() & RISCV_V_CTX_DEPTH_MASK; +} + +static int riscv_v_stop_kernel_context(void) +{ + if (riscv_v_ctx_get_depth() != 0 || !riscv_preempt_v_started(current)) + return 1; + + riscv_preempt_v_clear_dirty(current); + riscv_v_stop(RISCV_PREEMPT_V); + return 0; +} + +static int riscv_v_start_kernel_context(bool *is_nested) +{ + struct __riscv_v_ext_state *kvstate, *uvstate; + + kvstate = ¤t->thread.kernel_vstate; + if (!kvstate->datap) + return -ENOENT; + + if (riscv_preempt_v_started(current)) { + WARN_ON(riscv_v_ctx_get_depth() == 0); + *is_nested = true; + get_cpu_vector_context(); + if (riscv_preempt_v_dirty(current)) { + __riscv_v_vstate_save(kvstate, kvstate->datap); + riscv_preempt_v_clear_dirty(current); + } + riscv_preempt_v_set_restore(current); + return 0; + } + + /* Transfer the ownership of V from user to kernel, then save */ + riscv_v_start(RISCV_PREEMPT_V | RISCV_PREEMPT_V_DIRTY); + if ((task_pt_regs(current)->status & SR_VS) == SR_VS_DIRTY) { + uvstate = ¤t->thread.vstate; + __riscv_v_vstate_save(uvstate, uvstate->datap); + } + riscv_preempt_v_clear_dirty(current); + return 0; +} + +/* low-level V context handling code, called with irq disabled */ +asmlinkage void riscv_v_context_nesting_start(struct pt_regs *regs) +{ + int depth; + + if (!riscv_preempt_v_started(current)) + return; + + depth = riscv_v_ctx_get_depth(); + if (depth == 0 && (regs->status & SR_VS) == SR_VS_DIRTY) + riscv_preempt_v_set_dirty(); + + riscv_v_ctx_depth_inc(); +} + +asmlinkage void riscv_v_context_nesting_end(struct pt_regs *regs) +{ + struct __riscv_v_ext_state *vstate = ¤t->thread.kernel_vstate; + u32 depth; + + WARN_ON(!irqs_disabled()); + + if (!riscv_preempt_v_started(current)) + return; + + riscv_v_ctx_depth_dec(); + depth = riscv_v_ctx_get_depth(); + if (depth == 0) { + if (riscv_preempt_v_restore(current)) { + __riscv_v_vstate_restore(vstate, vstate->datap); + __riscv_v_vstate_clean(regs); + riscv_preempt_v_reset_flags(); + } + } +} +#else +#define riscv_v_start_kernel_context(nested) (-ENOENT) +#define riscv_v_stop_kernel_context() (-ENOENT) +#endif /* CONFIG_RISCV_ISA_V_PREEMPTIVE */ + /* * kernel_vector_begin(): obtain the CPU vector registers for use by the calling * context @@ -90,14 +206,20 @@ void put_cpu_vector_context(void) */ void kernel_vector_begin(void) { + bool nested = false; + if (WARN_ON(!has_vector())) return; BUG_ON(!may_use_simd()); - get_cpu_vector_context(); + if (riscv_v_start_kernel_context(&nested)) { + get_cpu_vector_context(); + riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); + } - riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); + if (!nested) + riscv_v_vstate_set_restore(current, task_pt_regs(current)); riscv_v_enable(); } @@ -117,10 +239,9 @@ void kernel_vector_end(void) if (WARN_ON(!has_vector())) return; - riscv_v_vstate_set_restore(current, task_pt_regs(current)); - riscv_v_disable(); - put_cpu_vector_context(); + if (riscv_v_stop_kernel_context()) + put_cpu_vector_context(); } EXPORT_SYMBOL_GPL(kernel_vector_end); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 5ea77a227f304..d90413ededd2a 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -168,6 +168,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) *dst = *src; /* clear entire V context, including datap for a new task */ memset(&dst->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); + memset(&dst->thread.kernel_vstate, 0, sizeof(struct __riscv_v_ext_state)); clear_tsk_thread_flag(dst, TIF_RISCV_V_DEFER_RESTORE); return 0; @@ -203,6 +204,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.s[0] = 0; } p->thread.riscv_v_flags = 0; + if (has_vector()) + riscv_v_thread_alloc(p); p->thread.ra = (unsigned long)ret_from_fork; p->thread.sp = (unsigned long)childregs; /* kernel sp */ return 0; diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index b2128bbefb003..e52a94f40d692 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -23,6 +23,9 @@ static bool riscv_v_implicit_uacc = IS_ENABLED(CONFIG_RISCV_ISA_V_DEFAULT_ENABLE); static struct kmem_cache *riscv_v_user_cachep; +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +static struct kmem_cache *riscv_v_kernel_cachep; +#endif unsigned long riscv_v_vsize __read_mostly; EXPORT_SYMBOL_GPL(riscv_v_vsize); @@ -64,6 +67,11 @@ void __init riscv_v_setup_ctx_cache(void) riscv_v_user_cachep = kmem_cache_create_usercopy("riscv_vector_ctx", riscv_v_vsize, 16, SLAB_PANIC, 0, riscv_v_vsize, NULL); +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + riscv_v_kernel_cachep = kmem_cache_create("riscv_vector_kctx", + riscv_v_vsize, 16, + SLAB_PANIC, NULL); +#endif } static bool insn_is_vector(u32 insn_buf) @@ -99,25 +107,36 @@ static bool insn_is_vector(u32 insn_buf) return false; } -static int riscv_v_thread_zalloc(void) +static int riscv_v_thread_zalloc(struct kmem_cache *cache, + struct __riscv_v_ext_state *ctx) { void *datap; if (!riscv_v_vsize) return -EINVAL; - datap = kmem_cache_zalloc(riscv_v_user_cachep, GFP_KERNEL); + datap = kmem_cache_zalloc(cache, GFP_KERNEL); if (!datap) return -ENOMEM; - current->thread.vstate.datap = datap; - memset(¤t->thread.vstate, 0, offsetof(struct __riscv_v_ext_state, - datap)); + ctx->datap = datap; + memset(ctx, 0, offsetof(struct __riscv_v_ext_state, datap)); return 0; } +void riscv_v_thread_alloc(struct task_struct *tsk) +{ +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + riscv_v_thread_zalloc(riscv_v_kernel_cachep, &tsk->thread.kernel_vstate); +#endif +} + void riscv_v_thread_free(struct task_struct *tsk) { if (tsk->thread.vstate.datap) kmem_cache_free(riscv_v_user_cachep, tsk->thread.vstate.datap); +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + if (tsk->thread.kernel_vstate.datap) + kmem_cache_free(riscv_v_kernel_cachep, tsk->thread.kernel_vstate.datap); +#endif } #define VSTATE_CTRL_GET_CUR(x) ((x) & PR_RISCV_V_VSTATE_CTRL_CUR_MASK) @@ -192,7 +211,7 @@ bool riscv_v_first_use_handler(struct pt_regs *regs) * context where VS has been off. So, try to allocate the user's V * context and resume execution. */ - if (riscv_v_thread_zalloc()) { + if (riscv_v_thread_zalloc(riscv_v_user_cachep, ¤t->thread.vstate)) { force_sig(SIGBUS); return true; } From a6c7c54baf67513ba9d208d8253fba753d92a805 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 5 Aug 2025 15:59:11 +0800 Subject: [PATCH 14/39] riscv: Fix vector state restore in rt_sigreturn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.7 commit c27fa53b858b4ee6552a719aa599c250cf98a586 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/86 -------------------------------- The RISC-V Vector specification states in "Appendix D: Calling Convention for Vector State" [1] that "Executing a system call causes all caller-saved vector registers (v0-v31, vl, vtype) and vstart to become unspecified.". In the RISC-V kernel this is called "discarding the vstate". Returning from a signal handler via the rt_sigreturn() syscall, vector discard is also performed. However, this is not an issue since the vector state should be restored from the sigcontext, and therefore not care about the vector discard. The "live state" is the actual vector register in the running context, and the "vstate" is the vector state of the task. A dirty live state, means that the vstate and live state are not in synch. When vectorized user_from_copy() was introduced, an bug sneaked in at the restoration code, related to the discard of the live state. An example when this go wrong: 1. A userland application is executing vector code 2. The application receives a signal, and the signal handler is entered. 3. The application returns from the signal handler, using the rt_sigreturn() syscall. 4. The live vector state is discarded upon entering the rt_sigreturn(), and the live state is marked as "dirty", indicating that the live state need to be synchronized with the current vstate. 5. rt_sigreturn() restores the vstate, except the Vector registers, from the sigcontext 6. rt_sigreturn() restores the Vector registers, from the sigcontext, and now the vectorized user_from_copy() is used. The dirty live state from the discard is saved to the vstate, making the vstate corrupt. 7. rt_sigreturn() returns to the application, which crashes due to corrupted vstate. Note that the vectorized user_from_copy() is invoked depending on the value of CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD. Default is 768, which means that vlen has to be larger than 128b for this bug to trigger. The fix is simply to mark the live state as non-dirty/clean prior performing the vstate restore. Link: https://github.com/riscv/riscv-isa-manual/releases/download/riscv-isa-release-8abdb41-2024-03-26/unpriv-isa-asciidoc.pdf # [1] Reported-by: Charlie Jenkins Reported-by: Vineet Gupta Fixes: c2a658d41924 ("riscv: lib: vectorize copy_to_user/copy_from_user") Signed-off-by: Björn Töpel Reviewed-by: Andy Chiu Tested-by: Vineet Gupta Link: https://lore.kernel.org/r/20240403072638.567446-1-bjorn@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt Signed-off-by: Gao Rui --- arch/riscv/kernel/signal.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 733b460395469..0e3bdd520617e 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -119,6 +119,13 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) struct __sc_riscv_v_state __user *state = sc_vec; void __user *datap; + /* + * Mark the vstate as clean prior performing the actual copy, + * to avoid getting the vstate incorrectly clobbered by the + * discarded vector state. + */ + riscv_v_vstate_set_restore(current, regs); + /* Copy everything of __sc_riscv_v_state except datap. */ err = __copy_from_user(¤t->thread.vstate, &state->v_state, offsetof(struct __riscv_v_ext_state, datap)); @@ -133,13 +140,7 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) * Copy the whole vector content from user space datap. Use * copy_from_user to prevent information leak. */ - err = copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); - if (unlikely(err)) - return err; - - riscv_v_vstate_set_restore(current, regs); - - return err; + return copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); } #else #define save_v_state(task, regs) (0) From a64e6ce37b4482219d5c8d27868497782a31f135 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 29 Jul 2025 19:43:11 +0800 Subject: [PATCH 15/39] Revert "iommu: Handle race with default domain setup" mainline inclusion from mainline-v6.6-rc3 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- There are issues with this patch's integration, as it does not align with the Linux community patch. We will re-apply it after integrating default_domain feature later. Fixes: a6dbcf69d4c4 (iommu: Handle race with default domain setup) Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3fa5699b9ff19..881ad66f22863 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -566,18 +566,6 @@ int iommu_probe_device(struct device *dev) mutex_lock(&iommu_probe_device_lock); ret = __iommu_probe_device(dev, NULL); mutex_unlock(&iommu_probe_device_lock); - - /* - * The dma_configure replay paths need bus_iommu_probe() to - * finish before they can call arch_setup_dma_ops() - */ - if (IS_ENABLED(CONFIG_IOMMU_DMA) && !ret && dev->iommu_group) { - mutex_lock(&dev->iommu_group->mutex); - if (!dev->iommu_group->default_domain && - !dev_iommu_ops(dev)->set_platform_dma_ops) - ret = -EPROBE_DEFER; - mutex_unlock(&dev->iommu_group->mutex); - } if (ret) return ret; @@ -3163,12 +3151,6 @@ int iommu_device_use_default_domain(struct device *dev) return 0; mutex_lock(&group->mutex); - /* We may race against bus_iommu_probe() finalising groups here */ - if (IS_ENABLED(CONFIG_IOMMU_DMA) && !group->default_domain && - !dev_iommu_ops(dev)->set_platform_dma_ops) { - ret = -EPROBE_DEFER; - goto unlock_out; - } if (group->owner_cnt) { if (group->owner || !iommu_is_default_domain(group) || !xa_empty(&group->pasid_array)) { From 6fbad8405b4af3d7685eab216227ec6789d8b1cb Mon Sep 17 00:00:00 2001 From: gaorui Date: Mon, 28 Jul 2025 16:50:31 +0800 Subject: [PATCH 16/39] iommu: Add iommu_ops->identity_domain mainline inclusion from mainline-v6.6-rc3 commit df31b298477e65a01deff0af352be3a61524d930 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- This allows a driver to set a global static to an IDENTITY domain and the core code will automatically use it whenever an IDENTITY domain is requested. By making it always available it means the IDENTITY can be used in error handling paths to force the iommu driver into a known state. Devices implementing global static identity domains should avoid failing their attach_dev ops. To make global static domains simpler allow drivers to omit their free function and update the iommufd selftest. Convert rockchip to use the new mechanism. Tested-by: Steven Price Tested-by: Marek Szyprowski Tested-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 6 +++++- drivers/iommu/iommufd/selftest.c | 5 ----- drivers/iommu/rockchip-iommu.c | 9 +-------- include/linux/iommu.h | 3 +++ 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 881ad66f22863..5a3fcddad173d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1979,6 +1979,9 @@ static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus, if (bus == NULL || bus->iommu_ops == NULL) return NULL; + if (alloc_type == IOMMU_DOMAIN_IDENTITY && bus->iommu_ops->identity_domain) + return bus->iommu_ops->identity_domain; + domain = bus->iommu_ops->domain_alloc(alloc_type); if (!domain) return NULL; @@ -2012,7 +2015,8 @@ void iommu_domain_free(struct iommu_domain *domain) if (domain->type == IOMMU_DOMAIN_SVA) mmdrop(domain->mm); iommu_put_dma_cookie(domain); - domain->ops->free(domain); + if (domain->ops->free) + domain->ops->free(domain); } EXPORT_SYMBOL_GPL(iommu_domain_free); diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 00b794d74e03b..d29e438377405 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -126,10 +126,6 @@ struct selftest_obj { }; }; -static void mock_domain_blocking_free(struct iommu_domain *domain) -{ -} - static int mock_domain_nop_attach(struct iommu_domain *domain, struct device *dev) { @@ -137,7 +133,6 @@ static int mock_domain_nop_attach(struct iommu_domain *domain, } static const struct iommu_domain_ops mock_blocking_ops = { - .free = mock_domain_blocking_free, .attach_dev = mock_domain_nop_attach, }; diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 36fec26d2a04a..e16761dadf698 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -989,13 +989,8 @@ static int rk_iommu_identity_attach(struct iommu_domain *identity_domain, return 0; } -static void rk_iommu_identity_free(struct iommu_domain *domain) -{ -} - static struct iommu_domain_ops rk_identity_ops = { .attach_dev = rk_iommu_identity_attach, - .free = rk_iommu_identity_free, }; static struct iommu_domain rk_identity_domain = { @@ -1059,9 +1054,6 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type) { struct rk_iommu_domain *rk_domain; - if (type == IOMMU_DOMAIN_IDENTITY) - return &rk_identity_domain; - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) return NULL; @@ -1185,6 +1177,7 @@ static int rk_iommu_of_xlate(struct device *dev, } static const struct iommu_ops rk_iommu_ops = { + .identity_domain = &rk_identity_domain, .domain_alloc = rk_iommu_domain_alloc, .probe_device = rk_iommu_probe_device, .release_device = rk_iommu_release_device, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b6ef263e85c06..0274d12a48e1b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -260,6 +260,8 @@ struct iommu_iotlb_gather { * will be blocked by the hardware. * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops + * @identity_domain: An always available, always attachable identity + * translation. */ struct iommu_ops { bool (*capable)(struct device *dev, enum iommu_cap); @@ -294,6 +296,7 @@ struct iommu_ops { const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; struct module *owner; + struct iommu_domain *identity_domain; }; /** From 473d4b537bc76b582cb026bd5fda322e7d99ddf2 Mon Sep 17 00:00:00 2001 From: gaorui Date: Mon, 28 Jul 2025 17:01:56 +0800 Subject: [PATCH 17/39] iommu: Add IOMMU_DOMAIN_PLATFORM mainline inclusion from mainline-v6.6-rc3 commit 1c68cbc64fe6ac01dc242ba562344303031a76fb category: feature bugzilla: https://github.com/RVCK-Project/rvck-olk/issues/75 -------------------------------- This is used when the iommu driver is taking control of the dma_ops, currently only on S390 and power spapr. It is designed to preserve the original ops->detach_dev() semantic that these S390 was built around. Provide an opaque domain type and a 'default_domain' ops value that allows the driver to trivially force any single domain as the default domain. Update iommufd selftest to use this instead of set_platform_dma_ops Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 13 +++++++++++++ drivers/iommu/iommufd/selftest.c | 14 +++++--------- include/linux/iommu.h | 8 ++++++++ 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5a3fcddad173d..b0f7e48b8b8e7 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -184,6 +184,8 @@ static const char *iommu_domain_type_str(unsigned int t) case IOMMU_DOMAIN_DMA: case IOMMU_DOMAIN_DMA_FQ: return "Translated"; + case IOMMU_DOMAIN_PLATFORM: + return "Platform"; default: return "Unknown"; } @@ -1751,6 +1753,17 @@ iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) lockdep_assert_held(&group->mutex); + /* + * Allow legacy drivers to specify the domain that will be the default + * domain. This should always be either an IDENTITY/BLOCKED/PLATFORM + * domain. Do not use in new drivers. + */ + if (bus->iommu_ops->default_domain) { + if (req_type) + return ERR_PTR(-EINVAL); + return bus->iommu_ops->default_domain; + } + if (req_type) return __iommu_group_alloc_default_domain(bus, group, req_type); diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d29e438377405..aa56b67c44d48 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -296,14 +296,6 @@ static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) return cap == IOMMU_CAP_CACHE_COHERENCY; } -static void mock_domain_set_plaform_dma_ops(struct device *dev) -{ - /* - * mock doesn't setup default domains because we can't hook into the - * normal probe path - */ -} - static struct iommu_device mock_iommu_device = { }; @@ -313,12 +305,16 @@ static struct iommu_device *mock_probe_device(struct device *dev) } static const struct iommu_ops mock_ops = { + /* + * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type() + * because it is zero. + */ + .default_domain = &mock_blocking_domain, .owner = THIS_MODULE, .pgsize_bitmap = MOCK_IO_PAGE_SIZE, .hw_info = mock_domain_hw_info, .domain_alloc = mock_domain_alloc, .capable = mock_domain_capable, - .set_platform_dma_ops = mock_domain_set_plaform_dma_ops, .device_group = generic_device_group, .probe_device = mock_probe_device, .default_domain_ops = diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0274d12a48e1b..b4b27624462d6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -64,6 +64,7 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */ #define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */ +#define __IOMMU_DOMAIN_PLATFORM (1U << 5) #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ /* @@ -81,6 +82,8 @@ struct iommu_domain_geometry { * invalidation. * IOMMU_DOMAIN_SVA - DMA addresses are shared process addresses * represented by mm_struct's. + * IOMMU_DOMAIN_PLATFORM - Legacy domain for drivers that do their own + * dma_api stuff. Do not use in new drivers. */ #define IOMMU_DOMAIN_BLOCKED (0U) #define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT) @@ -91,6 +94,7 @@ struct iommu_domain_geometry { __IOMMU_DOMAIN_DMA_API | \ __IOMMU_DOMAIN_DMA_FQ) #define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA) +#define IOMMU_DOMAIN_PLATFORM (__IOMMU_DOMAIN_PLATFORM) struct iommu_domain { unsigned type; @@ -262,6 +266,9 @@ struct iommu_iotlb_gather { * @owner: Driver module providing these ops * @identity_domain: An always available, always attachable identity * translation. + * @default_domain: If not NULL this will always be set as the default domain. + * This should be an IDENTITY/BLOCKED/PLATFORM domain. + * Do not use in new drivers. */ struct iommu_ops { bool (*capable)(struct device *dev, enum iommu_cap); @@ -297,6 +304,7 @@ struct iommu_ops { unsigned long pgsize_bitmap; struct module *owner; struct iommu_domain *identity_domain; + struct iommu_domain *default_domain; }; /** From edf7845d9948a63bb04752f49f45331b3f799727 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 13:53:10 +0800 Subject: [PATCH 18/39] iommu: Add IOMMU_DOMAIN_PLATFORM for S390 mainline inclusion from mainline-v6.6-rc3 commit e04c7487a6655722172e93e8f36e51d6ab279f86 category: feature bugzilla: https://github.com/RVCK-Project/rvck-olk/issues/75 -------------------------------- The PLATFORM domain will be set as the default domain and attached as normal during probe. The driver will ignore the initial attach from a NULL domain to the PLATFORM domain. After this, the PLATFORM domain's attach_dev will be called whenever we detach from an UNMANAGED domain (eg for VFIO). This is the same time the original design would have called op->detach_dev(). This is temporary until the S390 dma-iommu.c conversion is merged. Tested-by: Heiko Stuebner Tested-by: Niklas Schnelle Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/s390-iommu.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index fbf59a8db29b1..f0c867c57a5b9 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -142,14 +142,31 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, return 0; } -static void s390_iommu_set_platform_dma(struct device *dev) +/* + * Switch control over the IOMMU to S390's internal dma_api ops + */ +static int s390_iommu_platform_attach(struct iommu_domain *platform_domain, + struct device *dev) { struct zpci_dev *zdev = to_zpci_dev(dev); + if (!zdev->s390_domain) + return 0; + __s390_iommu_detach_device(zdev); zpci_dma_init_device(zdev); + return 0; } +static struct iommu_domain_ops s390_iommu_platform_ops = { + .attach_dev = s390_iommu_platform_attach, +}; + +static struct iommu_domain s390_iommu_platform_domain = { + .type = IOMMU_DOMAIN_PLATFORM, + .ops = &s390_iommu_platform_ops, +}; + static void s390_iommu_get_resv_regions(struct device *dev, struct list_head *list) { @@ -428,12 +445,12 @@ void zpci_destroy_iommu(struct zpci_dev *zdev) } static const struct iommu_ops s390_iommu_ops = { + .default_domain = &s390_iommu_platform_domain, .capable = s390_iommu_capable, .domain_alloc = s390_domain_alloc, .probe_device = s390_iommu_probe_device, .release_device = s390_iommu_release_device, .device_group = generic_device_group, - .set_platform_dma_ops = s390_iommu_set_platform_dma, .pgsize_bitmap = SZ_4K, .get_resv_regions = s390_iommu_get_resv_regions, .default_domain_ops = &(const struct iommu_domain_ops) { From 15f49d6392bbe0f83450cad1390bef2743346315 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 13:57:50 +0800 Subject: [PATCH 19/39] iommu/fsl_pamu: Implement a PLATFORM domain mainline inclusion from mainline-v6.6-rc3 commit 8565915e75581ba540527d1d9a2b89620002a0d0 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- This driver is nonsensical. To not block migrating the core API away from NULL default_domains give it a hacky of a PLATFORM domain that keeps it working exactly as it always did. Leave some comments around to warn away any future people looking at this. Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/fsl_pamu_domain.c | 41 ++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 4ac0e247ec2b5..e9d2bff4659b7 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -196,6 +196,13 @@ static struct iommu_domain *fsl_pamu_domain_alloc(unsigned type) { struct fsl_dma_domain *dma_domain; + /* + * FIXME: This isn't creating an unmanaged domain since the + * default_domain_ops do not have any map/unmap function it doesn't meet + * the requirements for __IOMMU_DOMAIN_PAGING. The only purpose seems to + * allow drivers/soc/fsl/qbman/qman_portal.c to do + * fsl_pamu_configure_l1_stash() + */ if (type != IOMMU_DOMAIN_UNMANAGED) return NULL; @@ -283,15 +290,33 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, return ret; } -static void fsl_pamu_set_platform_dma(struct device *dev) +/* + * FIXME: fsl/pamu is completely broken in terms of how it works with the iommu + * API. Immediately after probe the HW is left in an IDENTITY translation and + * the driver provides a non-working UNMANAGED domain that it can switch over + * to. However it cannot switch back to an IDENTITY translation, instead it + * switches to what looks like BLOCKING. + */ +static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, + struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); + struct fsl_dma_domain *dma_domain; const u32 *prop; int len; struct pci_dev *pdev = NULL; struct pci_controller *pci_ctl; + /* + * Hack to keep things working as they always have, only leaving an + * UNMANAGED domain makes it BLOCKING. + */ + if (domain == platform_domain || !domain || + domain->type != IOMMU_DOMAIN_UNMANAGED) + return 0; + + dma_domain = to_fsl_dma_domain(domain); + /* * Use LIODN of the PCI controller while detaching a * PCI device. @@ -312,8 +337,18 @@ static void fsl_pamu_set_platform_dma(struct device *dev) detach_device(dev, dma_domain); else pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node); + return 0; } +static struct iommu_domain_ops fsl_pamu_platform_ops = { + .attach_dev = fsl_pamu_platform_attach, +}; + +static struct iommu_domain fsl_pamu_platform_domain = { + .type = IOMMU_DOMAIN_PLATFORM, + .ops = &fsl_pamu_platform_ops, +}; + /* Set the domain stash attribute */ int fsl_pamu_configure_l1_stash(struct iommu_domain *domain, u32 cpu) { @@ -395,11 +430,11 @@ static struct iommu_device *fsl_pamu_probe_device(struct device *dev) } static const struct iommu_ops fsl_pamu_ops = { + .default_domain = &fsl_pamu_platform_domain, .capable = fsl_pamu_capable, .domain_alloc = fsl_pamu_domain_alloc, .probe_device = fsl_pamu_probe_device, .device_group = fsl_pamu_device_group, - .set_platform_dma_ops = fsl_pamu_set_platform_dma, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = fsl_pamu_attach_device, .iova_to_phys = fsl_pamu_iova_to_phys, From b5d3d26fbe5e4bd9d6b0527032d70f4f54f23eef Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:07:42 +0800 Subject: [PATCH 20/39] iommu/mtk_iommu_v1: Implement an IDENTITY domain mainline inclusion from mainline-v6.6-rc3 commit 90057dc095dca023ad3e8cd5a129f392b543fb9e category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- What mtk does during mtk_iommu_v1_set_platform_dma() is actually putting the iommu into identity mode. Make this available as a proper IDENTITY domain. The mtk_iommu_v1_def_domain_type() from commit 8bbe13f52cb7 ("iommu/mediatek-v1: Add def_domain_type") explains this was needed to allow probe_finalize() to be called, but now the IDENTITY domain will do the same job so change the returned def_domain_type. mkt_v1 is the only driver that returns IOMMU_DOMAIN_UNMANAGED from def_domain_type(). This allows the next patch to enforce an IDENTITY domain policy for this driver. Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/mtk_iommu_v1.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index f1754efcfe74e..d0c7e1727ed8a 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -319,11 +319,27 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device return 0; } -static void mtk_iommu_v1_set_platform_dma(struct device *dev) +static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); mtk_iommu_v1_config(data, dev, false); + return 0; +} + +static struct iommu_domain_ops mtk_iommu_v1_identity_ops = { + .attach_dev = mtk_iommu_v1_identity_attach, +}; + +static struct iommu_domain mtk_iommu_v1_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &mtk_iommu_v1_identity_ops, +}; + +static void mtk_iommu_v1_set_platform_dma(struct device *dev) +{ + mtk_iommu_v1_identity_attach(&mtk_iommu_v1_identity_domain, dev); } static int mtk_iommu_v1_map(struct iommu_domain *domain, unsigned long iova, @@ -443,7 +459,7 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, struct of_phandle_arg static int mtk_iommu_v1_def_domain_type(struct device *dev) { - return IOMMU_DOMAIN_UNMANAGED; + return IOMMU_DOMAIN_IDENTITY; } static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) @@ -578,6 +594,7 @@ static int mtk_iommu_v1_hw_init(const struct mtk_iommu_v1_data *data) } static const struct iommu_ops mtk_iommu_v1_ops = { + .identity_domain = &mtk_iommu_v1_identity_domain, .domain_alloc = mtk_iommu_v1_domain_alloc, .probe_device = mtk_iommu_v1_probe_device, .probe_finalize = mtk_iommu_v1_probe_finalize, From d1fbae66e5dce18703aac3b51628626114332989 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:10:47 +0800 Subject: [PATCH 21/39] iommu: Reorganize iommu_get_default_domain_type() to respect def_domain_type() mainline inclusion from mainline-v6.6-rc3 commit 59ddce4418da483c932bc7a08b88d6ba14020e83 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- Except for dart (which forces IOMMU_DOMAIN_DMA) every driver returns 0 or IDENTITY from ops->def_domain_type(). The drivers that return IDENTITY have some kind of good reason, typically that quirky hardware really can't support anything other than IDENTITY. Arrange things so that if the driver says it needs IDENTITY then iommu_get_default_domain_type() either fails or returns IDENTITY. It will not ignore the driver's override to IDENTITY. Split the function into two steps, reducing the group device list to the driver's def_domain_type() and the untrusted flag. Then compute the result based on those two reduced variables. Fully reject combining untrusted with IDENTITY. Remove the debugging print on the iommu_group_store_type() failure path, userspace should not be able to trigger kernel prints. This makes the next patch cleaner that wants to force IDENTITY always for ARM_IOMMU because there is no support for DMA. Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 115 ++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 37 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b0f7e48b8b8e7..c27db4838acca 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1717,19 +1717,6 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); -static int iommu_get_def_domain_type(struct device *dev) -{ - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted) - return IOMMU_DOMAIN_DMA; - - if (ops->def_domain_type) - return ops->def_domain_type(dev); - - return 0; -} - static struct iommu_domain * __iommu_group_alloc_default_domain(const struct bus_type *bus, struct iommu_group *group, int req_type) @@ -1739,6 +1726,23 @@ __iommu_group_alloc_default_domain(const struct bus_type *bus, return __iommu_domain_alloc(bus, req_type); } +/* + * Returns the iommu_ops for the devices in an iommu group. + * + * It is assumed that all devices in an iommu group are managed by a single + * IOMMU unit. Therefore, this returns the dev_iommu_ops of the first device + * in the group. + */ +static const struct iommu_ops *group_iommu_ops(struct iommu_group *group) +{ + struct group_device *device = + list_first_entry(&group->devices, struct group_device, list); + + lockdep_assert_held(&group->mutex); + + return dev_iommu_ops(device->dev); +} + /* * req_type of 0 means "auto" which means to select a domain based on * iommu_def_domain_type or what the driver actually supports. @@ -1821,40 +1825,77 @@ static int iommu_bus_notifier(struct notifier_block *nb, return 0; } -/* A target_type of 0 will select the best domain type and cannot fail */ +/* + * Combine the driver's chosen def_domain_type across all the devices in a + * group. Drivers must give a consistent result. + */ +static int iommu_get_def_domain_type(struct iommu_group *group, + struct device *dev, int cur_type) +{ + const struct iommu_ops *ops = group_iommu_ops(group); + int type; + + if (!ops->def_domain_type) + return cur_type; + + type = ops->def_domain_type(dev); + if (!type || cur_type == type) + return cur_type; + if (!cur_type) + return type; + + dev_err_ratelimited( + dev, + "IOMMU driver error, requesting conflicting def_domain_type, %s and %s, for devices in group %u.\n", + iommu_domain_type_str(cur_type), iommu_domain_type_str(type), + group->id); + + /* + * Try to recover, drivers are allowed to force IDENITY or DMA, IDENTITY + * takes precedence. + */ + if (type == IOMMU_DOMAIN_IDENTITY) + return type; + return cur_type; +} + +/* + * A target_type of 0 will select the best domain type. 0 can be returned in + * this case meaning the global default should be used. + */ static int iommu_get_default_domain_type(struct iommu_group *group, int target_type) { - int best_type = target_type; + struct device *untrusted = NULL; struct group_device *gdev; - struct device *last_dev; + int driver_type = 0; lockdep_assert_held(&group->mutex); - for_each_group_device(group, gdev) { - unsigned int type = iommu_get_def_domain_type(gdev->dev); - - if (best_type && type && best_type != type) { - if (target_type) { - dev_err_ratelimited( - gdev->dev, - "Device cannot be in %s domain\n", - iommu_domain_type_str(target_type)); - return -1; - } + driver_type = iommu_get_def_domain_type(group, gdev->dev, + driver_type); - dev_warn( - gdev->dev, - "Device needs domain type %s, but device %s in the same iommu group requires type %s - using default\n", - iommu_domain_type_str(type), dev_name(last_dev), - iommu_domain_type_str(best_type)); - return 0; + if (dev_is_pci(gdev->dev) && to_pci_dev(gdev->dev)->untrusted) + untrusted = gdev->dev; + } + + if (untrusted) { + if (driver_type && driver_type != IOMMU_DOMAIN_DMA) { + dev_err_ratelimited( + untrusted, + "Device is not trusted, but driver is overriding group %u to %s, refusing to probe.\n", + group->id, iommu_domain_type_str(driver_type)); + return -1; } - if (!best_type) - best_type = type; - last_dev = gdev->dev; + driver_type = IOMMU_DOMAIN_DMA; + } + + if (target_type) { + if (driver_type && target_type != driver_type) + return -1; + return target_type; } - return best_type; + return driver_type; } static void iommu_group_do_probe_finalize(struct device *dev) From 0e958814d2dda3afd2e85e53aa235232bd87d799 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:14:46 +0800 Subject: [PATCH 22/39] iommu: Allow an IDENTITY domain as the default_domain in ARM32 mainline inclusion from mainline-v6.6-rc3 commit e98befd010bd56b8b3f2afea2d600e30df023e6b category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- Even though dma-iommu.c and CONFIG_ARM_DMA_USE_IOMMU do approximately the same stuff, the way they relate to the IOMMU core is quiet different. dma-iommu.c expects the core code to setup an UNMANAGED domain (of type IOMMU_DOMAIN_DMA) and then configures itself to use that domain. This becomes the default_domain for the group. ARM_DMA_USE_IOMMU does not use the default_domain, instead it directly allocates an UNMANAGED domain and operates it just like an external driver. In this case group->default_domain is NULL. If the driver provides a global static identity_domain then automatically use it as the default_domain when in ARM_DMA_USE_IOMMU mode. This allows drivers that implemented default_domain == NULL as an IDENTITY translation to trivially get a properly labeled non-NULL default_domain on ARM32 configs. With this arrangment when ARM_DMA_USE_IOMMU wants to disconnect from the device the normal detach_domain flow will restore the IDENTITY domain as the default domain. Overall this makes attach_dev() of the IDENTITY domain called in the same places as detach_dev(). This effectively migrates these drivers to default_domain mode. For drivers that support ARM64 they will gain support for the IDENTITY translation mode for the dma_api and behave in a uniform way. Drivers use this by setting ops->identity_domain to a static singleton iommu_domain that implements the identity attach. If the core detects ARM_DMA_USE_IOMMU mode then it automatically attaches the IDENTITY domain during probe. Drivers can continue to prevent the use of DMA translation by returning IOMMU_DOMAIN_IDENTITY from def_domain_type, this will completely prevent IOMMU_DMA from running but will not impact ARM_DMA_USE_IOMMU. This allows removing the set_platform_dma_ops() from every remaining driver. Remove the set_platform_dma_ops from rockchip and mkt_v1 as all it does is set an existing global static identity domain. mkt_v1 does not support IOMMU_DOMAIN_DMA and it does not compile on ARM64 so this transformation is safe. Tested-by: Steven Price Tested-by: Marek Szyprowski Tested-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/9-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 21 ++++++++++++++++++++- drivers/iommu/mtk_iommu_v1.c | 12 ------------ drivers/iommu/rockchip-iommu.c | 10 ---------- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c27db4838acca..a19102ef518f3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1866,17 +1866,36 @@ static int iommu_get_def_domain_type(struct iommu_group *group, static int iommu_get_default_domain_type(struct iommu_group *group, int target_type) { + const struct iommu_ops *ops = group_iommu_ops(group); struct device *untrusted = NULL; struct group_device *gdev; int driver_type = 0; lockdep_assert_held(&group->mutex); + + /* + * ARM32 drivers supporting CONFIG_ARM_DMA_USE_IOMMU can declare an + * identity_domain and it will automatically become their default + * domain. Later on ARM_DMA_USE_IOMMU will install its UNMANAGED domain. + * Override the selection to IDENTITY if we are sure the driver supports + * it. + */ + if (IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) && ops->identity_domain) + driver_type = IOMMU_DOMAIN_IDENTITY; + for_each_group_device(group, gdev) { driver_type = iommu_get_def_domain_type(group, gdev->dev, driver_type); - if (dev_is_pci(gdev->dev) && to_pci_dev(gdev->dev)->untrusted) + if (dev_is_pci(gdev->dev) && to_pci_dev(gdev->dev)->untrusted) { + /* + * No ARM32 using systems will set untrusted, it cannot + * work. + */ + if (WARN_ON(IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU))) + return -1; untrusted = gdev->dev; + } } if (untrusted) { diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index d0c7e1727ed8a..0e6b2bb4955de 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -337,11 +337,6 @@ static struct iommu_domain mtk_iommu_v1_identity_domain = { .ops = &mtk_iommu_v1_identity_ops, }; -static void mtk_iommu_v1_set_platform_dma(struct device *dev) -{ - mtk_iommu_v1_identity_attach(&mtk_iommu_v1_identity_domain, dev); -} - static int mtk_iommu_v1_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -457,11 +452,6 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, struct of_phandle_arg return 0; } -static int mtk_iommu_v1_def_domain_type(struct device *dev) -{ - return IOMMU_DOMAIN_IDENTITY; -} - static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -599,10 +589,8 @@ static const struct iommu_ops mtk_iommu_v1_ops = { .probe_device = mtk_iommu_v1_probe_device, .probe_finalize = mtk_iommu_v1_probe_finalize, .release_device = mtk_iommu_v1_release_device, - .def_domain_type = mtk_iommu_v1_def_domain_type, .device_group = generic_device_group, .pgsize_bitmap = MT2701_IOMMU_PAGE_SIZE, - .set_platform_dma_ops = mtk_iommu_v1_set_platform_dma, .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = mtk_iommu_v1_attach_device, diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index e16761dadf698..b6f1ba01a341d 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -998,13 +998,6 @@ static struct iommu_domain rk_identity_domain = { .ops = &rk_identity_ops, }; -#ifdef CONFIG_ARM -static void rk_iommu_set_platform_dma(struct device *dev) -{ - WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev)); -} -#endif - static int rk_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { @@ -1182,9 +1175,6 @@ static const struct iommu_ops rk_iommu_ops = { .probe_device = rk_iommu_probe_device, .release_device = rk_iommu_release_device, .device_group = rk_iommu_device_group, -#ifdef CONFIG_ARM - .set_platform_dma_ops = rk_iommu_set_platform_dma, -#endif .pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP, .of_xlate = rk_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { From ebc96f0841e6fde53bc2f7288d56edad4784b3d1 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:17:03 +0800 Subject: [PATCH 23/39] iommu/exynos: Implement an IDENTITY domain mainline inclusion from mainline-v6.6-rc3 commit b3d14960e629f9ee8c82f8feb211ae43e1cb3246 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- What exynos calls exynos_iommu_detach_device is actually putting the iommu into identity mode. Move to the new core support for ARM_DMA_USE_IOMMU by defining ops->identity_domain. Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/10-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/exynos-iommu.c | 66 +++++++++++++++++------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index c275fe71c4db3..5e12b85dfe870 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -24,6 +24,7 @@ typedef u32 sysmmu_iova_t; typedef u32 sysmmu_pte_t; +static struct iommu_domain exynos_identity_domain; /* We do not consider super section mapping (16MB) */ #define SECT_ORDER 20 @@ -829,7 +830,7 @@ static int __maybe_unused exynos_sysmmu_suspend(struct device *dev) struct exynos_iommu_owner *owner = dev_iommu_priv_get(master); mutex_lock(&owner->rpm_lock); - if (data->domain) { + if (&data->domain->domain != &exynos_identity_domain) { dev_dbg(data->sysmmu, "saving state\n"); __sysmmu_disable(data); } @@ -847,7 +848,7 @@ static int __maybe_unused exynos_sysmmu_resume(struct device *dev) struct exynos_iommu_owner *owner = dev_iommu_priv_get(master); mutex_lock(&owner->rpm_lock); - if (data->domain) { + if (&data->domain->domain != &exynos_identity_domain) { dev_dbg(data->sysmmu, "restoring state\n"); __sysmmu_enable(data); } @@ -980,17 +981,20 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain) kfree(domain); } -static void exynos_iommu_detach_device(struct iommu_domain *iommu_domain, - struct device *dev) +static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { - struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); - phys_addr_t pagetable = virt_to_phys(domain->pgtable); + struct exynos_iommu_domain *domain; + phys_addr_t pagetable; struct sysmmu_drvdata *data, *next; unsigned long flags; - if (!has_sysmmu(dev) || owner->domain != iommu_domain) - return; + if (owner->domain == identity_domain) + return 0; + + domain = to_exynos_domain(owner->domain); + pagetable = virt_to_phys(domain->pgtable); mutex_lock(&owner->rpm_lock); @@ -1009,15 +1013,25 @@ static void exynos_iommu_detach_device(struct iommu_domain *iommu_domain, list_del_init(&data->domain_node); spin_unlock(&data->lock); } - owner->domain = NULL; + owner->domain = identity_domain; spin_unlock_irqrestore(&domain->lock, flags); mutex_unlock(&owner->rpm_lock); - dev_dbg(dev, "%s: Detached IOMMU with pgtable %pa\n", __func__, - &pagetable); + dev_dbg(dev, "%s: Restored IOMMU to IDENTITY from pgtable %pa\n", + __func__, &pagetable); + return 0; } +static struct iommu_domain_ops exynos_identity_ops = { + .attach_dev = exynos_iommu_identity_attach, +}; + +static struct iommu_domain exynos_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &exynos_identity_ops, +}; + static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, struct device *dev) { @@ -1026,12 +1040,11 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, struct sysmmu_drvdata *data; phys_addr_t pagetable = virt_to_phys(domain->pgtable); unsigned long flags; + int err; - if (!has_sysmmu(dev)) - return -ENODEV; - - if (owner->domain) - exynos_iommu_detach_device(owner->domain, dev); + err = exynos_iommu_identity_attach(&exynos_identity_domain, dev); + if (err) + return err; mutex_lock(&owner->rpm_lock); @@ -1407,26 +1420,12 @@ static struct iommu_device *exynos_iommu_probe_device(struct device *dev) return &data->iommu; } -static void exynos_iommu_set_platform_dma(struct device *dev) -{ - struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); - - if (owner->domain) { - struct iommu_group *group = iommu_group_get(dev); - - if (group) { - exynos_iommu_detach_device(owner->domain, dev); - iommu_group_put(group); - } - } -} - static void exynos_iommu_release_device(struct device *dev) { struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct sysmmu_drvdata *data; - exynos_iommu_set_platform_dma(dev); + WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev)); list_for_each_entry(data, &owner->controllers, owner_node) device_link_del(data->link); @@ -1457,6 +1456,7 @@ static int exynos_iommu_of_xlate(struct device *dev, INIT_LIST_HEAD(&owner->controllers); mutex_init(&owner->rpm_lock); + owner->domain = &exynos_identity_domain; dev_iommu_priv_set(dev, owner); } @@ -1471,11 +1471,9 @@ static int exynos_iommu_of_xlate(struct device *dev, } static const struct iommu_ops exynos_iommu_ops = { + .identity_domain = &exynos_identity_domain, .domain_alloc = exynos_iommu_domain_alloc, .device_group = generic_device_group, -#ifdef CONFIG_ARM - .set_platform_dma_ops = exynos_iommu_set_platform_dma, -#endif .probe_device = exynos_iommu_probe_device, .release_device = exynos_iommu_release_device, .pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE, From 6ba9ccf02f533990a7f9203018c0ab865a0fb7f0 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:18:41 +0800 Subject: [PATCH 24/39] iommu/tegra-smmu: Implement an IDENTITY domain mainline inclusion from mainline-v6.6-rc3 commit c8cc2655cc6c7ff832827ad5bc1a8f3df165706d category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- What tegra-smmu does during tegra_smmu_set_platform_dma() is actually putting the iommu into identity mode. Move to the new core support for ARM_DMA_USE_IOMMU by defining ops->identity_domain. Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/11-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/tegra-smmu.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index e445f80d02263..80481e1ba561b 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -511,23 +511,39 @@ static int tegra_smmu_attach_dev(struct iommu_domain *domain, return err; } -static void tegra_smmu_set_platform_dma(struct device *dev) +static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct tegra_smmu_as *as = to_smmu_as(domain); - struct tegra_smmu *smmu = as->smmu; + struct tegra_smmu_as *as; + struct tegra_smmu *smmu; unsigned int index; if (!fwspec) - return; + return -ENODEV; + + if (domain == identity_domain || !domain) + return 0; + as = to_smmu_as(domain); + smmu = as->smmu; for (index = 0; index < fwspec->num_ids; index++) { tegra_smmu_disable(smmu, fwspec->ids[index], as->id); tegra_smmu_as_unprepare(smmu, as); } + return 0; } +static struct iommu_domain_ops tegra_smmu_identity_ops = { + .attach_dev = tegra_smmu_identity_attach, +}; + +static struct iommu_domain tegra_smmu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &tegra_smmu_identity_ops, +}; + static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova, u32 value) { @@ -962,11 +978,22 @@ static int tegra_smmu_of_xlate(struct device *dev, return iommu_fwspec_add_ids(dev, &id, 1); } +static int tegra_smmu_def_domain_type(struct device *dev) +{ + /* + * FIXME: For now we want to run all translation in IDENTITY mode, due + * to some device quirks. Better would be to just quirk the troubled + * devices. + */ + return IOMMU_DOMAIN_IDENTITY; +} + static const struct iommu_ops tegra_smmu_ops = { + .identity_domain = &tegra_smmu_identity_domain, + .def_domain_type = &tegra_smmu_def_domain_type, .domain_alloc = tegra_smmu_domain_alloc, .probe_device = tegra_smmu_probe_device, .device_group = tegra_smmu_device_group, - .set_platform_dma_ops = tegra_smmu_set_platform_dma, .of_xlate = tegra_smmu_of_xlate, .pgsize_bitmap = SZ_4K, .default_domain_ops = &(const struct iommu_domain_ops) { From 5e98c2567302e6f727bb0ca77d8581c400e2d0c1 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:20:22 +0800 Subject: [PATCH 25/39] iommu/tegra-smmu: Support DMA domains in tegra mainline inclusion from mainline-v6.6-rc3 commit f128094f347f578279e551488796d0a0067cbdff category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- All ARM64 iommu drivers should support IOMMU_DOMAIN_DMA to enable dma-iommu.c. tegra is blocking dma-iommu usage, and also default_domain's, because it wants an identity translation. This is needed for some device quirk. The correct way to do this is to support IDENTITY domains and use ops->def_domain_type() to return IOMMU_DOMAIN_IDENTITY for only the quirky devices. Add support for IOMMU_DOMAIN_DMA and force IOMMU_DOMAIN_IDENTITY mode for everything so no behavior changes. Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/12-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/tegra-smmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 80481e1ba561b..b91ad1b5a20d3 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -276,7 +276,7 @@ static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type) { struct tegra_smmu_as *as; - if (type != IOMMU_DOMAIN_UNMANAGED) + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) return NULL; as = kzalloc(sizeof(*as), GFP_KERNEL); From 9fde0bcd92fe1c7e8c9698f7ad61c36c865a6711 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:25:32 +0800 Subject: [PATCH 26/39] iommu/omap: Implement an IDENTITY domain mainline inclusion from mainline-v6.6-rc3 commit bbe39b76330ecf64f8b3f45cdf8bacbbb7f390c9 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- What omap does during omap_iommu_set_platform_dma() is actually putting the iommu into identity mode. Move to the new core support for ARM_DMA_USE_IOMMU by defining ops->identity_domain. This driver does not support IOMMU_DOMAIN_DMA, however it cannot be compiled on ARM64 either. Most likely it is fine to support dma-iommu.c Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/13-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/omap-iommu.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 537e402f9bba9..34340ef15241b 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1555,16 +1555,31 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, omap_domain->dev = NULL; } -static void omap_iommu_set_platform_dma(struct device *dev) +static int omap_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct omap_iommu_domain *omap_domain = to_omap_domain(domain); + struct omap_iommu_domain *omap_domain; + if (domain == identity_domain || !domain) + return 0; + + omap_domain = to_omap_domain(domain); spin_lock(&omap_domain->lock); _omap_iommu_detach_dev(omap_domain, dev); spin_unlock(&omap_domain->lock); + return 0; } +static struct iommu_domain_ops omap_iommu_identity_ops = { + .attach_dev = omap_iommu_identity_attach, +}; + +static struct iommu_domain omap_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &omap_iommu_identity_ops, +}; + static struct iommu_domain *omap_iommu_domain_alloc(unsigned type) { struct omap_iommu_domain *omap_domain; @@ -1732,11 +1747,11 @@ static struct iommu_group *omap_iommu_device_group(struct device *dev) } static const struct iommu_ops omap_iommu_ops = { + .identity_domain = &omap_iommu_identity_domain, .domain_alloc = omap_iommu_domain_alloc, .probe_device = omap_iommu_probe_device, .release_device = omap_iommu_release_device, .device_group = omap_iommu_device_group, - .set_platform_dma_ops = omap_iommu_set_platform_dma, .pgsize_bitmap = OMAP_IOMMU_PGSIZES, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = omap_iommu_attach_dev, From 3b3a8310ee0cf7d7747cce5f1ae9837b21c88e6c Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:27:02 +0800 Subject: [PATCH 27/39] iommu/msm: Implement an IDENTITY domain mainline inclusion from mainline-v6.6-rc3 commit 78fc30b4bb3540c88edd19c36d0a72bb0dd02d0b category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- What msm does during msm_iommu_set_platform_dma() is actually putting the iommu into identity mode. Move to the new core support for ARM_DMA_USE_IOMMU by defining ops->identity_domain. This driver does not support IOMMU_DOMAIN_DMA, however it cannot be compiled on ARM64 either. Most likely it is fine to support dma-iommu.c Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/14-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/msm_iommu.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 79d89bad5132b..26ed81cfeee89 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -443,15 +443,20 @@ static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) return ret; } -static void msm_iommu_set_platform_dma(struct device *dev) +static int msm_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct msm_priv *priv = to_msm_priv(domain); + struct msm_priv *priv; unsigned long flags; struct msm_iommu_dev *iommu; struct msm_iommu_ctx_dev *master; - int ret; + int ret = 0; + + if (domain == identity_domain || !domain) + return 0; + priv = to_msm_priv(domain); free_io_pgtable_ops(priv->iop); spin_lock_irqsave(&msm_iommu_lock, flags); @@ -468,8 +473,18 @@ static void msm_iommu_set_platform_dma(struct device *dev) } fail: spin_unlock_irqrestore(&msm_iommu_lock, flags); + return ret; } +static struct iommu_domain_ops msm_iommu_identity_ops = { + .attach_dev = msm_iommu_identity_attach, +}; + +static struct iommu_domain msm_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &msm_iommu_identity_ops, +}; + static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t pa, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -675,10 +690,10 @@ irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id) } static struct iommu_ops msm_iommu_ops = { + .identity_domain = &msm_iommu_identity_domain, .domain_alloc = msm_iommu_domain_alloc, .probe_device = msm_iommu_probe_device, .device_group = generic_device_group, - .set_platform_dma_ops = msm_iommu_set_platform_dma, .pgsize_bitmap = MSM_IOMMU_PGSIZES, .of_xlate = qcom_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { From d058ec9ccd30bfd66dadd0f56e9b6c8f6af80e1d Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:29:23 +0800 Subject: [PATCH 28/39] iommu: Remove ops->set_platform_dma_ops() mainline inclusion from mainline-v6.6-rc3 commit 24b1d476167df3e30c7a53b67765bf3c787c5160 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- All drivers are now using IDENTITY or PLATFORM domains for what this did, we can remove it now. It is no longer possible to attach to a NULL domain. Tested-by: Heiko Stuebner Tested-by: Niklas Schnelle Tested-by: Steven Price Tested-by: Marek Szyprowski Tested-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 30 +++++------------------------- include/linux/iommu.h | 4 ---- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a19102ef518f3..afca07b5f1a1e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2352,21 +2352,8 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, if (group->domain == new_domain) return 0; - /* - * New drivers should support default domains, so set_platform_dma() - * op will never be called. Otherwise the NULL domain represents some - * platform specific behavior. - */ - if (!new_domain) { - for_each_group_device(group, gdev) { - const struct iommu_ops *ops = dev_iommu_ops(gdev->dev); - - if (!WARN_ON(!ops->set_platform_dma_ops)) - ops->set_platform_dma_ops(gdev->dev); - } - group->domain = NULL; - return 0; - } + if (WARN_ON(!new_domain)) + return -EINVAL; /* * Changing the domain is done by calling attach_dev() on the new @@ -2402,19 +2389,15 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, */ last_gdev = gdev; for_each_group_device(group, gdev) { - const struct iommu_ops *ops = dev_iommu_ops(gdev->dev); - /* - * If set_platform_dma_ops is not present a NULL domain can - * happen only for first probe, in which case we leave - * group->domain as NULL and let release clean everything up. + * A NULL domain can happen only for first probe, in which case + * we leave group->domain as NULL and let release clean + * everything up. */ if (group->domain) WARN_ON(__iommu_device_set_domain( group, gdev->dev, group->domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); - else if (ops->set_platform_dma_ops) - ops->set_platform_dma_ops(gdev->dev); if (gdev == last_gdev) break; } @@ -3039,9 +3022,6 @@ static int iommu_setup_default_domain(struct iommu_group *group, /* * There are still some drivers which don't support default domains, so * we ignore the failure and leave group->default_domain NULL. - * - * We assume that the iommu driver starts up the device in - * 'set_platform_dma_ops' mode if it does not support default domains. */ dom = iommu_group_alloc_default_domain(group, req_type); if (!dom) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b4b27624462d6..3192c5599864a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -243,9 +243,6 @@ struct iommu_iotlb_gather { * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU * group and attached to the groups domain - * @set_platform_dma_ops: Returning control back to the platform DMA ops. This op - * is to support old IOMMU drivers, new drivers should use - * default domains, and the common IOMMU DMA ops. * @device_group: find iommu group for a particular device * @get_resv_regions: Request list of reserved regions for a device * @of_xlate: add OF master IDs to iommu grouping @@ -280,7 +277,6 @@ struct iommu_ops { struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); void (*probe_finalize)(struct device *dev); - void (*set_platform_dma_ops)(struct device *dev); struct iommu_group *(*device_group)(struct device *dev); /* Request/Free a list of reserved regions for a device */ From 2413d9d710d1cd4325068bf28884e371a356fe45 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:34:39 +0800 Subject: [PATCH 29/39] iommu/qcom_iommu: Add an IOMMU_IDENTITIY_DOMAIN mainline inclusion from mainline-v6.6-rc3 commit 786478a90294ea5b149ed1156a43e82d63ea61ff category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- This brings back the ops->detach_dev() code that commit 1b932ceddd19 ("iommu: Remove detach_dev callbacks") deleted and turns it into an IDENTITY domain. Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/16-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 39 +++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 775a3cbaff4ed..bc45d18f350cb 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -400,6 +400,44 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev return 0; } +static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) +{ + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + struct qcom_iommu_domain *qcom_domain; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); + unsigned int i; + + if (domain == identity_domain || !domain) + return 0; + + qcom_domain = to_qcom_iommu_domain(domain); + if (WARN_ON(!qcom_domain->iommu)) + return -EINVAL; + + pm_runtime_get_sync(qcom_iommu->dev); + for (i = 0; i < fwspec->num_ids; i++) { + struct qcom_iommu_ctx *ctx = to_ctx(qcom_domain, fwspec->ids[i]); + + /* Disable the context bank: */ + iommu_writel(ctx, ARM_SMMU_CB_SCTLR, 0); + + ctx->domain = NULL; + } + pm_runtime_put_sync(qcom_iommu->dev); + return 0; +} + +static struct iommu_domain_ops qcom_iommu_identity_ops = { + .attach_dev = qcom_iommu_identity_attach, +}; + +static struct iommu_domain qcom_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &qcom_iommu_identity_ops, +}; + static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -565,6 +603,7 @@ static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) } static const struct iommu_ops qcom_iommu_ops = { + .identity_domain = &qcom_iommu_identity_domain, .capable = qcom_iommu_capable, .domain_alloc = qcom_iommu_domain_alloc, .probe_device = qcom_iommu_probe_device, From f7c775e6668101690a1f9f84a570a6c9638ed846 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:42:48 +0800 Subject: [PATCH 30/39] iommu/ipmmu: Add an IOMMU_IDENTITIY_DOMAIN mainline inclusion from mainline-v6.6-rc3 commit 666c9f1ef7fa9330ea041aba36658aeb48145846 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- This brings back the ops->detach_dev() code that commit 1b932ceddd19 ("iommu: Remove detach_dev callbacks") deleted and turns it into an IDENTITY domain. Also reverts commit 584d334b1393 ("iommu/ipmmu-vmsa: Remove ipmmu_utlb_disable()") Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/17-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/ipmmu-vmsa.c | 43 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 65ff69477c43e..04830d3931d23 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -295,6 +295,18 @@ static void ipmmu_utlb_enable(struct ipmmu_vmsa_domain *domain, mmu->utlb_ctx[utlb] = domain->context_id; } +/* + * Disable MMU translation for the microTLB. + */ +static void ipmmu_utlb_disable(struct ipmmu_vmsa_domain *domain, + unsigned int utlb) +{ + struct ipmmu_vmsa_device *mmu = domain->mmu; + + ipmmu_imuctr_write(mmu, utlb, 0); + mmu->utlb_ctx[utlb] = IPMMU_CTX_INVALID; +} + static void ipmmu_tlb_flush_all(void *cookie) { struct ipmmu_vmsa_domain *domain = cookie; @@ -627,6 +639,36 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, return 0; } +static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) +{ + struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct ipmmu_vmsa_domain *domain; + unsigned int i; + + if (io_domain == identity_domain || !io_domain) + return 0; + + domain = to_vmsa_domain(io_domain); + for (i = 0; i < fwspec->num_ids; ++i) + ipmmu_utlb_disable(domain, fwspec->ids[i]); + + /* + * TODO: Optimize by disabling the context when no device is attached. + */ + return 0; +} + +static struct iommu_domain_ops ipmmu_iommu_identity_ops = { + .attach_dev = ipmmu_iommu_identity_attach, +}; + +static struct iommu_domain ipmmu_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &ipmmu_iommu_identity_ops, +}; + static int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -849,6 +891,7 @@ static struct iommu_group *ipmmu_find_group(struct device *dev) } static const struct iommu_ops ipmmu_ops = { + .identity_domain = &ipmmu_iommu_identity_domain, .domain_alloc = ipmmu_domain_alloc, .probe_device = ipmmu_probe_device, .release_device = ipmmu_release_device, From 845098dd40fd0ad3e4b02d205b635066fb3402e7 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:49:42 +0800 Subject: [PATCH 31/39] iommu/mtk_iommu: Add an IOMMU_IDENTITIY_DOMAIN mainline inclusion from mainline-v6.6-rc3 commit b01b12573837f237cec13b9ddf8e5e623ee6f981 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- This brings back the ops->detach_dev() code that commit 1b932ceddd19 ("iommu: Remove detach_dev callbacks") deleted and turns it into an IDENTITY domain. Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/18-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/mtk_iommu.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 06c0770ff894e..06192aff5eb8e 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -776,6 +776,28 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, return ret; } +static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) +{ + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + struct mtk_iommu_data *data = dev_iommu_priv_get(dev); + + if (domain == identity_domain || !domain) + return 0; + + mtk_iommu_config(data, dev, false, 0); + return 0; +} + +static struct iommu_domain_ops mtk_iommu_identity_ops = { + .attach_dev = mtk_iommu_identity_attach, +}; + +static struct iommu_domain mtk_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &mtk_iommu_identity_ops, +}; + static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -995,6 +1017,7 @@ static void mtk_iommu_get_resv_regions(struct device *dev, } static const struct iommu_ops mtk_iommu_ops = { + .identity_domain = &mtk_iommu_identity_domain, .domain_alloc = mtk_iommu_domain_alloc, .probe_device = mtk_iommu_probe_device, .release_device = mtk_iommu_release_device, From 1064cd3e0a7649c33a300314e509aed6fde3e30d Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:51:28 +0800 Subject: [PATCH 32/39] iommu/sun50i: Add an IOMMU_IDENTITIY_DOMAIN mainline inclusion from mainline-v6.6-rc3 commit 5b167dea64a3fb72907e33f30211d4a5711076f1 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- Prior to commit 1b932ceddd19 ("iommu: Remove detach_dev callbacks") the sun50i_iommu_detach_device() function was being called by ops->detach_dev(). This is an IDENTITY domain so convert sun50i_iommu_detach_device() into sun50i_iommu_identity_attach() and a full IDENTITY domain and thus hook it back up the same was as the old ops->detach_dev(). Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/19-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/sun50i-iommu.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index 94bd7f25f6f26..2a61b50c97f83 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -758,21 +758,32 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu, iommu->domain = NULL; } -static void sun50i_iommu_detach_device(struct iommu_domain *domain, - struct device *dev) +static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { - struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu = dev_iommu_priv_get(dev); + struct sun50i_iommu_domain *sun50i_domain; dev_dbg(dev, "Detaching from IOMMU domain\n"); - if (iommu->domain != domain) - return; + if (iommu->domain == identity_domain) + return 0; + sun50i_domain = to_sun50i_domain(iommu->domain); if (refcount_dec_and_test(&sun50i_domain->refcnt)) sun50i_iommu_detach_domain(iommu, sun50i_domain); + return 0; } +static struct iommu_domain_ops sun50i_iommu_identity_ops = { + .attach_dev = sun50i_iommu_identity_attach, +}; + +static struct iommu_domain sun50i_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &sun50i_iommu_identity_ops, +}; + static int sun50i_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { @@ -790,8 +801,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain, if (iommu->domain == domain) return 0; - if (iommu->domain) - sun50i_iommu_detach_device(iommu->domain, dev); + sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev); sun50i_iommu_attach_domain(iommu, sun50i_domain); @@ -828,6 +838,7 @@ static int sun50i_iommu_of_xlate(struct device *dev, } static const struct iommu_ops sun50i_iommu_ops = { + .identity_domain = &sun50i_iommu_identity_domain, .pgsize_bitmap = SZ_4K, .device_group = sun50i_iommu_device_group, .domain_alloc = sun50i_iommu_domain_alloc, @@ -986,6 +997,7 @@ static int sun50i_iommu_probe(struct platform_device *pdev) if (!iommu) return -ENOMEM; spin_lock_init(&iommu->iommu_lock); + iommu->domain = &sun50i_iommu_identity_domain; platform_set_drvdata(pdev, iommu); iommu->dev = &pdev->dev; From c9071a4dec692824b0a2b55c4aafaeb5fed56101 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:54:05 +0800 Subject: [PATCH 33/39] iommu: Require a default_domain for all iommu drivers mainline inclusion from mainline-v6.6-rc3 commit 98ac73f99bc44fba8a14252ffb0bad02459f7008 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- At this point every iommu driver will cause a default_domain to be selected, so we can finally remove this gap from the core code. The following table explains what each driver supports and what the resulting default_domain will be: ops->defaut_domain IDENTITY DMA PLATFORM v ARM32 dma-iommu ARCH amd/iommu.c Y Y N/A either apple-dart.c Y Y N/A either arm-smmu.c Y Y IDENTITY either qcom_iommu.c G Y IDENTITY either arm-smmu-v3.c Y Y N/A either exynos-iommu.c G Y IDENTITY either fsl_pamu_domain.c Y Y N/A N/A PLATFORM intel/iommu.c Y Y N/A either ipmmu-vmsa.c G Y IDENTITY either msm_iommu.c G IDENTITY N/A mtk_iommu.c G Y IDENTITY either mtk_iommu_v1.c G IDENTITY N/A omap-iommu.c G IDENTITY N/A rockchip-iommu.c G Y IDENTITY either s390-iommu.c Y Y N/A N/A PLATFORM sprd-iommu.c Y N/A DMA sun50i-iommu.c G Y IDENTITY either tegra-smmu.c G Y IDENTITY IDENTITY virtio-iommu.c Y Y N/A either spapr Y Y N/A N/A PLATFORM * G means ops->identity_domain is used * N/A means the driver will not compile in this configuration ARM32 drivers select an IDENTITY default domain through either the ops->identity_domain or directly requesting an IDENTIY domain through alloc_domain(). In ARM64 mode tegra-smmu will still block the use of dma-iommu.c and forces an IDENTITY domain. S390 uses a PLATFORM domain to represent when the dma_ops are set to the s390 iommu code. fsl_pamu uses an PLATFORM domain. POWER SPAPR uses PLATFORM and blocking to enable its weird VFIO mode. The x86 drivers continue unchanged. After this patch group->default_domain is only NULL for a short period during bus iommu probing while all the groups are constituted. Otherwise it is always !NULL. This completes changing the iommu subsystem driver contract to a system where the current iommu_domain always represents some form of translation and the driver is continuously asserting a definable translation mode. It resolves the confusion that the original ops->detach_dev() caused around what translation, exactly, is the IOMMU performing after detach. There were at least three different answers to that question in the tree, they are all now clearly named with domain types. Tested-by: Heiko Stuebner Tested-by: Niklas Schnelle Tested-by: Steven Price Tested-by: Marek Szyprowski Tested-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index afca07b5f1a1e..cf5d1fd84e3c3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1866,7 +1866,6 @@ static int iommu_get_def_domain_type(struct iommu_group *group, static int iommu_get_default_domain_type(struct iommu_group *group, int target_type) { - const struct iommu_ops *ops = group_iommu_ops(group); struct device *untrusted = NULL; struct group_device *gdev; int driver_type = 0; @@ -1877,11 +1876,13 @@ static int iommu_get_default_domain_type(struct iommu_group *group, * ARM32 drivers supporting CONFIG_ARM_DMA_USE_IOMMU can declare an * identity_domain and it will automatically become their default * domain. Later on ARM_DMA_USE_IOMMU will install its UNMANAGED domain. - * Override the selection to IDENTITY if we are sure the driver supports - * it. + * Override the selection to IDENTITY. */ - if (IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) && ops->identity_domain) + if (IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU)) { + static_assert(!(IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) && + IS_ENABLED(CONFIG_IOMMU_DMA))); driver_type = IOMMU_DOMAIN_IDENTITY; + } for_each_group_device(group, gdev) { driver_type = iommu_get_def_domain_type(group, gdev->dev, @@ -3019,18 +3020,9 @@ static int iommu_setup_default_domain(struct iommu_group *group, if (req_type < 0) return -EINVAL; - /* - * There are still some drivers which don't support default domains, so - * we ignore the failure and leave group->default_domain NULL. - */ dom = iommu_group_alloc_default_domain(group, req_type); - if (!dom) { - /* Once in default_domain mode we never leave */ - if (group->default_domain) - return -ENODEV; - group->default_domain = NULL; - return 0; - } + if (!dom) + return -ENODEV; if (group->default_domain == dom) return 0; From 78db90d31d984d995370b6736097a8d7e67c5899 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 14:57:17 +0800 Subject: [PATCH 34/39] iommu: Add __iommu_group_domain_alloc() mainline inclusion from mainline-v6.6-rc3 commit 8359cf39acba72606736f453f54432ac9dc28523 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- Allocate a domain from a group. Automatically obtains the iommu_ops to use from the device list of the group. Convert the internal callers to use it. Tested-by: Steven Price Tested-by: Marek Szyprowski Tested-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/21-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 59 +++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index cf5d1fd84e3c3..e878777b65821 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -96,8 +96,8 @@ static const char * const iommu_group_resv_type_string[] = { static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); -static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus, - unsigned type); +static struct iommu_domain * +__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type); static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev); static int __iommu_attach_group(struct iommu_domain *domain, @@ -1718,12 +1718,11 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) EXPORT_SYMBOL_GPL(fsl_mc_device_group); static struct iommu_domain * -__iommu_group_alloc_default_domain(const struct bus_type *bus, - struct iommu_group *group, int req_type) +__iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { if (group->default_domain && group->default_domain->type == req_type) return group->default_domain; - return __iommu_domain_alloc(bus, req_type); + return __iommu_group_domain_alloc(group, req_type); } /* @@ -1750,9 +1749,7 @@ static const struct iommu_ops *group_iommu_ops(struct iommu_group *group) static struct iommu_domain * iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { - const struct bus_type *bus = - list_first_entry(&group->devices, struct group_device, list) - ->dev->bus; + const struct iommu_ops *ops = group_iommu_ops(group); struct iommu_domain *dom; lockdep_assert_held(&group->mutex); @@ -1762,24 +1759,24 @@ iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) * domain. This should always be either an IDENTITY/BLOCKED/PLATFORM * domain. Do not use in new drivers. */ - if (bus->iommu_ops->default_domain) { + if (ops->default_domain) { if (req_type) return ERR_PTR(-EINVAL); - return bus->iommu_ops->default_domain; + return ops->default_domain; } if (req_type) - return __iommu_group_alloc_default_domain(bus, group, req_type); + return __iommu_group_alloc_default_domain(group, req_type); /* The driver gave no guidance on what type to use, try the default */ - dom = __iommu_group_alloc_default_domain(bus, group, iommu_def_domain_type); + dom = __iommu_group_alloc_default_domain(group, iommu_def_domain_type); if (dom) return dom; /* Otherwise IDENTITY and DMA_FQ defaults will try DMA */ if (iommu_def_domain_type == IOMMU_DOMAIN_DMA) return NULL; - dom = __iommu_group_alloc_default_domain(bus, group, IOMMU_DOMAIN_DMA); + dom = __iommu_group_alloc_default_domain(group, IOMMU_DOMAIN_DMA); if (!dom) return NULL; @@ -2044,19 +2041,16 @@ void iommu_set_fault_handler(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_set_fault_handler); -static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus, - unsigned type) +static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, + unsigned int type) { struct iommu_domain *domain; unsigned int alloc_type = type & IOMMU_DOMAIN_ALLOC_FLAGS; - if (bus == NULL || bus->iommu_ops == NULL) - return NULL; + if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain) + return ops->identity_domain; - if (alloc_type == IOMMU_DOMAIN_IDENTITY && bus->iommu_ops->identity_domain) - return bus->iommu_ops->identity_domain; - - domain = bus->iommu_ops->domain_alloc(alloc_type); + domain = ops->domain_alloc(alloc_type); if (!domain) return NULL; @@ -2066,10 +2060,10 @@ static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus, * may override this later */ if (!domain->pgsize_bitmap) - domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap; + domain->pgsize_bitmap = ops->pgsize_bitmap; if (!domain->ops) - domain->ops = bus->iommu_ops->default_domain_ops; + domain->ops = ops->default_domain_ops; if (iommu_is_dma_domain(domain) && iommu_get_dma_cookie(domain)) { iommu_domain_free(domain); @@ -2078,9 +2072,17 @@ static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus, return domain; } +static struct iommu_domain * +__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) +{ + return __iommu_domain_alloc(group_iommu_ops(group), type); +} + struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) { - return __iommu_domain_alloc(bus, IOMMU_DOMAIN_UNMANAGED); + if (bus == NULL || bus->iommu_ops == NULL) + return NULL; + return __iommu_domain_alloc(bus->iommu_ops, IOMMU_DOMAIN_UNMANAGED); } EXPORT_SYMBOL_GPL(iommu_domain_alloc); @@ -3242,21 +3244,18 @@ void iommu_device_unuse_default_domain(struct device *dev) static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) { - struct group_device *dev = - list_first_entry(&group->devices, struct group_device, list); - if (group->blocking_domain) return 0; group->blocking_domain = - __iommu_domain_alloc(dev->dev->bus, IOMMU_DOMAIN_BLOCKED); + __iommu_group_domain_alloc(group, IOMMU_DOMAIN_BLOCKED); if (!group->blocking_domain) { /* * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED * create an empty domain instead. */ - group->blocking_domain = __iommu_domain_alloc( - dev->dev->bus, IOMMU_DOMAIN_UNMANAGED); + group->blocking_domain = __iommu_group_domain_alloc( + group, IOMMU_DOMAIN_UNMANAGED); if (!group->blocking_domain) return -EINVAL; } From 78c3a76ce45ea15bf6e7b3f7594016c21cdf4a6a Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 15:05:16 +0800 Subject: [PATCH 35/39] iommu: Add ops->domain_alloc_paging() mainline inclusion from mainline-v6.6-rc3 commit 4601cd2d7c4c82c4bafc822e1ff630a709eff206 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- This callback requests the driver to create only a __IOMMU_DOMAIN_PAGING domain, so it saves a few lines in a lot of drivers needlessly checking the type. More critically, this allows us to sweep out all the IOMMU_DOMAIN_UNMANAGED and IOMMU_DOMAIN_DMA checks from a lot of the drivers, simplifying what is going on in the code and ultimately removing the now-unused special cases in drivers where they did not support IOMMU_DOMAIN_DMA. domain_alloc_paging() should return a struct iommu_domain that is functionally compatible with ARM_DMA_USE_IOMMU, dma-iommu.c and iommufd. Be forwards looking and pass in a 'struct device *' argument. We can provide this when allocating the default_domain. No drivers will look at this. Tested-by: Steven Price Tested-by: Marek Szyprowski Tested-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/22-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 17 ++++++++++++++--- include/linux/iommu.h | 3 +++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e878777b65821..969e4f7e63d90 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2042,6 +2042,7 @@ void iommu_set_fault_handler(struct iommu_domain *domain, EXPORT_SYMBOL_GPL(iommu_set_fault_handler); static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, + struct device *dev, unsigned int type) { struct iommu_domain *domain; @@ -2049,8 +2050,13 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain) return ops->identity_domain; + else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging) + domain = ops->domain_alloc_paging(dev); + else if (ops->domain_alloc) + domain = ops->domain_alloc(alloc_type); + else + return NULL; - domain = ops->domain_alloc(alloc_type); if (!domain) return NULL; @@ -2075,14 +2081,19 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, static struct iommu_domain * __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) { - return __iommu_domain_alloc(group_iommu_ops(group), type); + struct device *dev = + list_first_entry(&group->devices, struct group_device, list) + ->dev; + + return __iommu_domain_alloc(group_iommu_ops(group), dev, type); } struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) { if (bus == NULL || bus->iommu_ops == NULL) return NULL; - return __iommu_domain_alloc(bus->iommu_ops, IOMMU_DOMAIN_UNMANAGED); + return __iommu_domain_alloc(bus->iommu_ops, NULL, + IOMMU_DOMAIN_UNMANAGED); } EXPORT_SYMBOL_GPL(iommu_domain_alloc); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3192c5599864a..d87464a63ff49 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -239,6 +239,8 @@ struct iommu_iotlb_gather { * use. The information type is one of enum iommu_hw_info_type defined * in include/uapi/linux/iommufd.h. * @domain_alloc: allocate iommu domain + * @domain_alloc_paging: Allocate an iommu_domain that can be used for + * UNMANAGED, DMA, and DMA_FQ domain types. * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -273,6 +275,7 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); + struct iommu_domain *(*domain_alloc_paging)(struct device *dev); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); From b92f9e7f84fe9eb9ddbc20ca7c7bbe0b6bb17c8c Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 15:08:33 +0800 Subject: [PATCH 36/39] iommu: Convert simple drivers with DOMAIN_DMA to domain_alloc_paging() mainline inclusion from mainline-v6.6-rc3 commit 3529375e7777b0f9b77c53df9a7da122bd165ae7 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- These drivers are all trivially converted since the function is only called if the domain type is going to be IOMMU_DOMAIN_UNMANAGED/DMA. Tested-by: Heiko Stuebner Tested-by: Steven Price Tested-by: Marek Szyprowski Tested-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Tested-by: Yong Wu #For mtk_iommu.c Link: https://lore.kernel.org/r/23-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 6 ++---- drivers/iommu/exynos-iommu.c | 7 ++----- drivers/iommu/ipmmu-vmsa.c | 7 ++----- drivers/iommu/mtk_iommu.c | 7 ++----- drivers/iommu/rockchip-iommu.c | 7 ++----- drivers/iommu/sprd-iommu.c | 7 ++----- drivers/iommu/sun50i-iommu.c | 9 +++------ drivers/iommu/tegra-smmu.c | 7 ++----- 8 files changed, 17 insertions(+), 40 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index bc45d18f350cb..97b2122032b23 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -332,12 +332,10 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain, return ret; } -static struct iommu_domain *qcom_iommu_domain_alloc(unsigned type) +static struct iommu_domain *qcom_iommu_domain_alloc_paging(struct device *dev) { struct qcom_iommu_domain *qcom_domain; - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) - return NULL; /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -605,7 +603,7 @@ static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) static const struct iommu_ops qcom_iommu_ops = { .identity_domain = &qcom_iommu_identity_domain, .capable = qcom_iommu_capable, - .domain_alloc = qcom_iommu_domain_alloc, + .domain_alloc_paging = qcom_iommu_domain_alloc_paging, .probe_device = qcom_iommu_probe_device, .device_group = generic_device_group, .of_xlate = qcom_iommu_of_xlate, diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 5e12b85dfe870..d6dead2ed10c1 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -887,7 +887,7 @@ static inline void exynos_iommu_set_pte(sysmmu_pte_t *ent, sysmmu_pte_t val) DMA_TO_DEVICE); } -static struct iommu_domain *exynos_iommu_domain_alloc(unsigned type) +static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev) { struct exynos_iommu_domain *domain; dma_addr_t handle; @@ -896,9 +896,6 @@ static struct iommu_domain *exynos_iommu_domain_alloc(unsigned type) /* Check if correct PTE offsets are initialized */ BUG_ON(PG_ENT_SHIFT < 0 || !dma_dev); - if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; @@ -1472,7 +1469,7 @@ static int exynos_iommu_of_xlate(struct device *dev, static const struct iommu_ops exynos_iommu_ops = { .identity_domain = &exynos_identity_domain, - .domain_alloc = exynos_iommu_domain_alloc, + .domain_alloc_paging = exynos_iommu_domain_alloc_paging, .device_group = generic_device_group, .probe_device = exynos_iommu_probe_device, .release_device = exynos_iommu_release_device, diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 04830d3931d23..eaabae7615776 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -563,13 +563,10 @@ static irqreturn_t ipmmu_irq(int irq, void *dev) * IOMMU Operations */ -static struct iommu_domain *ipmmu_domain_alloc(unsigned type) +static struct iommu_domain *ipmmu_domain_alloc_paging(struct device *dev) { struct ipmmu_vmsa_domain *domain; - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) - return NULL; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; @@ -892,7 +889,7 @@ static struct iommu_group *ipmmu_find_group(struct device *dev) static const struct iommu_ops ipmmu_ops = { .identity_domain = &ipmmu_iommu_identity_domain, - .domain_alloc = ipmmu_domain_alloc, + .domain_alloc_paging = ipmmu_domain_alloc_paging, .probe_device = ipmmu_probe_device, .release_device = ipmmu_release_device, .probe_finalize = ipmmu_probe_finalize, diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 06192aff5eb8e..f7e2b9cd39a2a 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -688,13 +688,10 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, return 0; } -static struct iommu_domain *mtk_iommu_domain_alloc(unsigned type) +static struct iommu_domain *mtk_iommu_domain_alloc_paging(struct device *dev) { struct mtk_iommu_domain *dom; - if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - dom = kzalloc(sizeof(*dom), GFP_KERNEL); if (!dom) return NULL; @@ -1018,7 +1015,7 @@ static void mtk_iommu_get_resv_regions(struct device *dev, static const struct iommu_ops mtk_iommu_ops = { .identity_domain = &mtk_iommu_identity_domain, - .domain_alloc = mtk_iommu_domain_alloc, + .domain_alloc_paging = mtk_iommu_domain_alloc_paging, .probe_device = mtk_iommu_probe_device, .release_device = mtk_iommu_release_device, .device_group = mtk_iommu_device_group, diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index b6f1ba01a341d..527a4395a1b40 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -1043,13 +1043,10 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, return ret; } -static struct iommu_domain *rk_iommu_domain_alloc(unsigned type) +static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev) { struct rk_iommu_domain *rk_domain; - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) - return NULL; - if (!dma_dev) return NULL; @@ -1171,7 +1168,7 @@ static int rk_iommu_of_xlate(struct device *dev, static const struct iommu_ops rk_iommu_ops = { .identity_domain = &rk_identity_domain, - .domain_alloc = rk_iommu_domain_alloc, + .domain_alloc_paging = rk_iommu_domain_alloc_paging, .probe_device = rk_iommu_probe_device, .release_device = rk_iommu_release_device, .device_group = rk_iommu_device_group, diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index c8e79a2d8b4c6..1a4bc7287de22 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -134,13 +134,10 @@ sprd_iommu_pgt_size(struct iommu_domain *domain) SPRD_IOMMU_PAGE_SHIFT) * sizeof(u32); } -static struct iommu_domain *sprd_iommu_domain_alloc(unsigned int domain_type) +static struct iommu_domain *sprd_iommu_domain_alloc_paging(struct device *dev) { struct sprd_iommu_domain *dom; - if (domain_type != IOMMU_DOMAIN_DMA && domain_type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - dom = kzalloc(sizeof(*dom), GFP_KERNEL); if (!dom) return NULL; @@ -421,7 +418,7 @@ static int sprd_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) static const struct iommu_ops sprd_iommu_ops = { - .domain_alloc = sprd_iommu_domain_alloc, + .domain_alloc_paging = sprd_iommu_domain_alloc_paging, .probe_device = sprd_iommu_probe_device, .device_group = sprd_iommu_device_group, .of_xlate = sprd_iommu_of_xlate, diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index 2a61b50c97f83..85ce3dc0bdd59 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -668,14 +668,11 @@ static phys_addr_t sun50i_iommu_iova_to_phys(struct iommu_domain *domain, sun50i_iova_get_page_offset(iova); } -static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type) +static struct iommu_domain * +sun50i_iommu_domain_alloc_paging(struct device *dev) { struct sun50i_iommu_domain *sun50i_domain; - if (type != IOMMU_DOMAIN_DMA && - type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - sun50i_domain = kzalloc(sizeof(*sun50i_domain), GFP_KERNEL); if (!sun50i_domain) return NULL; @@ -841,7 +838,7 @@ static const struct iommu_ops sun50i_iommu_ops = { .identity_domain = &sun50i_iommu_identity_domain, .pgsize_bitmap = SZ_4K, .device_group = sun50i_iommu_device_group, - .domain_alloc = sun50i_iommu_domain_alloc, + .domain_alloc_paging = sun50i_iommu_domain_alloc_paging, .of_xlate = sun50i_iommu_of_xlate, .probe_device = sun50i_iommu_probe_device, .default_domain_ops = &(const struct iommu_domain_ops) { diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index b91ad1b5a20d3..1764a63347b04 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -272,13 +272,10 @@ static void tegra_smmu_free_asid(struct tegra_smmu *smmu, unsigned int id) clear_bit(id, smmu->asids); } -static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type) +static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev) { struct tegra_smmu_as *as; - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) - return NULL; - as = kzalloc(sizeof(*as), GFP_KERNEL); if (!as) return NULL; @@ -991,7 +988,7 @@ static int tegra_smmu_def_domain_type(struct device *dev) static const struct iommu_ops tegra_smmu_ops = { .identity_domain = &tegra_smmu_identity_domain, .def_domain_type = &tegra_smmu_def_domain_type, - .domain_alloc = tegra_smmu_domain_alloc, + .domain_alloc_paging = tegra_smmu_domain_alloc_paging, .probe_device = tegra_smmu_probe_device, .device_group = tegra_smmu_device_group, .of_xlate = tegra_smmu_of_xlate, From 46127cae148279cc529b679a38281c70ab6af3b7 Mon Sep 17 00:00:00 2001 From: gaorui Date: Tue, 22 Apr 2025 15:09:38 +0800 Subject: [PATCH 37/39] iommu: Convert remaining simple drivers to domain_alloc_paging() mainline inclusion from mainline-v6.6-rc3 commit 4efd98d41ea71835f4d291176ff1fd0a1803dfd5 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- These drivers don't support IOMMU_DOMAIN_DMA, so this commit effectively allows them to support that mode. The prior work to require default_domains makes this safe because every one of these drivers is either compilation incompatible with dma-iommu.c, or already establishing a default_domain. In both cases alloc_domain() will never be called with IOMMU_DOMAIN_DMA for these drivers so it is safe to drop the test. Removing these tests clarifies that the domain allocation path is only about the functionality of a paging domain and has nothing to do with policy of how the paging domain is used for UNMANAGED/DMA/DMA_FQ. Tested-by: Niklas Schnelle Tested-by: Steven Price Tested-by: Marek Szyprowski Tested-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/24-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/msm_iommu.c | 7 ++----- drivers/iommu/mtk_iommu_v1.c | 7 ++----- drivers/iommu/omap-iommu.c | 7 ++----- drivers/iommu/s390-iommu.c | 7 ++----- 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 26ed81cfeee89..a163cee0b7242 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -302,13 +302,10 @@ static void __program_context(void __iomem *base, int ctx, SET_M(base, ctx, 1); } -static struct iommu_domain *msm_iommu_domain_alloc(unsigned type) +static struct iommu_domain *msm_iommu_domain_alloc_paging(struct device *dev) { struct msm_priv *priv; - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) goto fail_nomem; @@ -691,7 +688,7 @@ irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id) static struct iommu_ops msm_iommu_ops = { .identity_domain = &msm_iommu_identity_domain, - .domain_alloc = msm_iommu_domain_alloc, + .domain_alloc_paging = msm_iommu_domain_alloc_paging, .probe_device = msm_iommu_probe_device, .device_group = generic_device_group, .pgsize_bitmap = MSM_IOMMU_PGSIZES, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 0e6b2bb4955de..574c6e38cfb3d 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -270,13 +270,10 @@ static int mtk_iommu_v1_domain_finalise(struct mtk_iommu_v1_data *data) return 0; } -static struct iommu_domain *mtk_iommu_v1_domain_alloc(unsigned type) +static struct iommu_domain *mtk_iommu_v1_domain_alloc_paging(struct device *dev) { struct mtk_iommu_v1_domain *dom; - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - dom = kzalloc(sizeof(*dom), GFP_KERNEL); if (!dom) return NULL; @@ -585,7 +582,7 @@ static int mtk_iommu_v1_hw_init(const struct mtk_iommu_v1_data *data) static const struct iommu_ops mtk_iommu_v1_ops = { .identity_domain = &mtk_iommu_v1_identity_domain, - .domain_alloc = mtk_iommu_v1_domain_alloc, + .domain_alloc_paging = mtk_iommu_v1_domain_alloc_paging, .probe_device = mtk_iommu_v1_probe_device, .probe_finalize = mtk_iommu_v1_probe_finalize, .release_device = mtk_iommu_v1_release_device, diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 34340ef15241b..fcf99bd195b32 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1580,13 +1580,10 @@ static struct iommu_domain omap_iommu_identity_domain = { .ops = &omap_iommu_identity_ops, }; -static struct iommu_domain *omap_iommu_domain_alloc(unsigned type) +static struct iommu_domain *omap_iommu_domain_alloc_paging(struct device *dev) { struct omap_iommu_domain *omap_domain; - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - omap_domain = kzalloc(sizeof(*omap_domain), GFP_KERNEL); if (!omap_domain) return NULL; @@ -1748,7 +1745,7 @@ static struct iommu_group *omap_iommu_device_group(struct device *dev) static const struct iommu_ops omap_iommu_ops = { .identity_domain = &omap_iommu_identity_domain, - .domain_alloc = omap_iommu_domain_alloc, + .domain_alloc_paging = omap_iommu_domain_alloc_paging, .probe_device = omap_iommu_probe_device, .release_device = omap_iommu_release_device, .device_group = omap_iommu_device_group, diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index f0c867c57a5b9..5695ad71d60e2 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -39,13 +39,10 @@ static bool s390_iommu_capable(struct device *dev, enum iommu_cap cap) } } -static struct iommu_domain *s390_domain_alloc(unsigned domain_type) +static struct iommu_domain *s390_domain_alloc_paging(struct device *dev) { struct s390_domain *s390_domain; - if (domain_type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - s390_domain = kzalloc(sizeof(*s390_domain), GFP_KERNEL); if (!s390_domain) return NULL; @@ -447,7 +444,7 @@ void zpci_destroy_iommu(struct zpci_dev *zdev) static const struct iommu_ops s390_iommu_ops = { .default_domain = &s390_iommu_platform_domain, .capable = s390_iommu_capable, - .domain_alloc = s390_domain_alloc, + .domain_alloc_paging = s390_domain_alloc_paging, .probe_device = s390_iommu_probe_device, .release_device = s390_iommu_release_device, .device_group = generic_device_group, From 3fa05bfc77589feb53bc2f5d7ac542f602e560de Mon Sep 17 00:00:00 2001 From: gaorui Date: Thu, 31 Jul 2025 09:49:43 +0800 Subject: [PATCH 38/39] iommu: Do not use IOMMU_DOMAIN_DMA if CONFIG_IOMMU_DMA is not enabled mainline inclusion from mainline-v6.6-rc5 commit 0f6a90436a5771fc9f6ca0d1e64f7549219e6c3c category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- msm_iommu platforms do not select either CONFIG_IOMMU_DMA or CONFIG_ARM_DMA_USE_IOMMU so they create a IOMMU_DOMAIN_DMA domain by default and never populate it. This acts like a BLOCKED domain and breaks the GPU driver on the platform. Detect this and force use of IDENTITY instead. Fixes: 98ac73f99bc4 ("iommu: Require a default_domain for all iommu drivers") Reported-by: Dmitry Baryshkov Link: https://lore.kernel.org/linux-iommu/CAA8EJprz7VVmBG68U9zLuqPd0UdSRHYoLDJSP6tCj6H6qanuTQ@mail.gmail.com/ Signed-off-by: Jason Gunthorpe Reviewed-by: Jerry Snitselaar Tested-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/0-v1-20700abdf239+19c-iommu_no_dma_iommu_jgg@nvidia.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 969e4f7e63d90..1392e4e289b71 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1896,6 +1896,18 @@ static int iommu_get_default_domain_type(struct iommu_group *group, } } + /* + * If the common dma ops are not selected in kconfig then we cannot use + * IOMMU_DOMAIN_DMA at all. Force IDENTITY if nothing else has been + * selected. + */ + if (!IS_ENABLED(CONFIG_IOMMU_DMA)) { + if (WARN_ON(driver_type == IOMMU_DOMAIN_DMA)) + return -1; + if (!driver_type) + driver_type = IOMMU_DOMAIN_IDENTITY; + } + if (untrusted) { if (driver_type && driver_type != IOMMU_DOMAIN_DMA) { dev_err_ratelimited( From ceb2003a1c6c130c73d3c4f1f807a9ed19eed1e0 Mon Sep 17 00:00:00 2001 From: gaorui Date: Thu, 31 Jul 2025 09:59:26 +0800 Subject: [PATCH 39/39] iommu: Handle race with default domain setup mainline inclusion from mainline-v6.14-rc7 commit b46064a18810bad3aea089a79993ca5ea7a3d2b2 category: feature bugzilla: https://github.com/RVCK-Project/rvck/issues/75 -------------------------------- It turns out that deferred default domain creation leaves a subtle race window during iommu_device_register() wherein a client driver may asynchronously probe in parallel and get as far as performing DMA API operations with dma-direct, only to be switched to iommu-dma underfoot once the default domain attachment finally happens, with obviously disastrous consequences. Even the wonky of_iommu_configure() path is at risk, since iommu_fwspec_init() will no longer defer client probe as the instance ops are (necessarily) already registered, and the "replay" iommu_probe_device() call can see dev->iommu_group already set and so think there's nothing to do either. Fortunately we already have the right tool in the right place in the form of iommu_device_use_default_domain(), which just needs to ensure that said default domain is actually ready to *be* used. Deferring the client probe shouldn't have too much impact, given that this only happens while the IOMMU driver is probing, and thus due to kick the deferred probe list again once it finishes. Reported-by: Charan Teja Kalla Fixes: 98ac73f99bc4 ("iommu: Require a default_domain for all iommu drivers") Reviewed-by: Jason Gunthorpe Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e88b94c9b575034a2c98a48b3d383654cbda7902.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel Signed-off-by: Gao Rui --- drivers/iommu/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1392e4e289b71..a7f2bbf665ce5 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3225,6 +3225,11 @@ int iommu_device_use_default_domain(struct device *dev) return 0; mutex_lock(&group->mutex); + /* We may race against bus_iommu_probe() finalising groups here */ + if (!group->default_domain) { + ret = -EPROBE_DEFER; + goto unlock_out; + } if (group->owner_cnt) { if (group->owner || !iommu_is_default_domain(group) || !xa_empty(&group->pasid_array)) {